In [2]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-3.5.1-cp36-cp36m-manylinux1_x86_64.whl (365kB)
[K    100% |████████████████████████████████| 368kB 1.6MB/s ta 0:00:01
[?25hInstalling collected packages: pymongo
Successfully installed pymongo-3.5.1


In [3]:
import json
import re
from collections import deque

import numpy as np
import pandas as pd
import pymongo
import requests
from tqdm import tqdm_notebook

In [4]:
# Open a connection to the MongoDB server and select the database and
# collection that the scraping functions below write their articles to.
client = pymongo.MongoClient(host='54.201.199.246', port=27016)

wiki_db = client['wikipedia']

wiki_col = wiki_db['my_collection']

In [5]:
# Number of documents currently stored in the collection.
# NOTE(review): Collection.count() is deprecated in later pymongo 3.x
# releases and removed in pymongo 4 - confirm the installed version
# before re-running this cell.
wiki_col.count()

1636

## Create functions to get data from Wiki API

In [6]:
def category_request(category):
    """
    Fetch every member of a Wikipedia category through the MediaWiki API.

    A single response holds a limited number of members; when more are
    available the API adds a 'continue' object to the response. The request
    is repeated with those continuation parameters until no 'continue'
    object is returned, so large categories are not truncated.

    Params:
    --------
    category: str
        The name of the category to be scraped, without the 'Category:'
        prefix.

    Returns:
    --------
    DataFrame
        Pandas DataFrame with one row per category member, built from the
        'categorymembers' entries of the API response. An empty category
        yields an empty DataFrame with the columns 'pageid', 'ns' and
        'title'.

    Raises:
    --------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If the response does not contain 'query' / 'categorymembers'.

    """
    base_params = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': 'Category:{}'.format(category),
        'cmlimit': 'max'
        }
    members = []
    continuation = {}
    while True:
        # Send the base query plus whatever continuation tokens the
        # previous response handed back (none on the first request).
        page = requests.get('https://en.wikipedia.org/w/api.php',
                            params=dict(base_params, **continuation),
                            timeout=30)
        page.raise_for_status()
        payload = page.json()
        members.extend(payload['query']['categorymembers'])
        continuation = payload.get('continue')
        if not continuation:
            break

    if not members:
        # Keep the expected columns so callers can still select 'title'
        # and 'pageid' on an empty result.
        return pd.DataFrame(columns=['pageid', 'ns', 'title'])
    return pd.DataFrame(members)


In [7]:
def get_content(title):
    """
    Fetch the content of a page from the Wikipedia API.

    Params:
    --------
    title: str
        The name of the page to be scraped.

    Returns:
    --------
    dict
        The first entry of the page's 'revisions' list as returned by the
        API (requested with 'rvprop': 'content'). An empty dict is returned
        when the API reports no revisions for the title, e.g. for a page
        that does not exist.

    Raises:
    --------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If the response does not contain 'query' / 'pages'.

    """
    my_params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'revisions',
        'rvprop': 'content'
    }
    content = requests.get('https://en.wikipedia.org/w/api.php',
                           params=my_params, timeout=30)
    content.raise_for_status()
    pages = content.json()['query']['pages']
    # 'pages' is keyed by page id; only one title was requested, so the
    # first (and only) value is the page of interest.
    page = next(iter(pages.values()), {})
    # A page entry without a 'revisions' list would otherwise raise
    # KeyError and abort a whole crawl; hand back an empty dict instead.
    revisions = page.get('revisions') or [{}]
    return revisions[0]


In [8]:
def get_cats_and_pages(category):
    """
    Returns the page ids, pages and subcategories of a category

    Params
    ------
    category : str
        Name of a category

    Returns
    -------
    page_id: list
        list of the page ids of the pages in the category
    pages: list
        list of the titles of the pages in the category
    children: list
        list of sub categories, with the 'Category:' prefix removed

    For a category without members all three lists are empty.

    """
    prefix = 'Category:'
    cats = category_request(category)
    # An empty category comes back as a frame without rows (and possibly
    # without a 'title' column), so there is nothing to split.
    if len(cats) == 0 or 'title' not in cats.columns:
        return [], [], []

    titles = cats['title'].astype(str)
    # Sub-categories are the members whose title starts with the prefix.
    # Only the start of the title is tested, so a page that merely mentions
    # 'Category:' somewhere in its title is still treated as a page.
    subs_mask = titles.str.startswith(prefix)

    children = titles[subs_mask].str[len(prefix):].tolist()
    pages = titles[~subs_mask].tolist()
    page_id = cats['pageid'][~subs_mask].tolist()
    return page_id, pages, children

# returns: page_id, pages, children (sub-categories)

In [9]:
def wiki_traverse(category, max_depth=None):
    """
    Walks a category and its sub-categories breadth-first and returns a list
    of dictionaries of categories, page titles and page contents.

    Every category is expanded at most once, so sub-categories that link
    back to a category already seen do not cause repeated or endless
    crawling. Each article dictionary is also inserted into `wiki_col`.

    Params
    ------
    category : str
        Name of the category to start from
    max_depth : int, optional
        Number of sub-category levels to descend below `category`
        (0 scrapes only the starting category). The default, None, places
        no limit on the depth.

    Returns
    -------
    page_content: list
        list of dictionaries with the keys 'category', 'article' and
        'content', one per article found

    """
    # Queue of (category name, depth below the starting category).
    queue = deque([(category, 0)])
    seen = {category}
    page_content = []

    while queue:
        current_node, depth = queue.popleft()

        _page_ids, pages, children = get_cats_and_pages(current_node)

        if max_depth is None or depth < max_depth:
            for child in children:
                # Mark on enqueue so a category reachable through several
                # parents is queued only once.
                if child not in seen:
                    seen.add(child)
                    queue.append((child, depth + 1))

        for article in pages:
            article_dict = {
                'category': current_node,
                'article': article,
                'content': get_content(article),
            }

            # NOTE(review): pymongo's insert_one adds an '_id' key to the
            # dict it is given, so the returned records carry it as well.
            wiki_col.insert_one(article_dict)

            page_content.append(article_dict)

    return page_content

In [93]:
# Crawl the 'Machine learning' category tree; wiki_traverse also inserts
# every scraped article into wiki_col as it goes.
Machine_learning = wiki_traverse(category='Machine learning')

In [95]:
# Tabulate the scraped records: one row per article dictionary.
Machine_learning_df = pd.DataFrame(data=Machine_learning)

In [16]:
# Keep a local copy of the scraped table next to the notebook.
Machine_learning_df.to_csv(path_or_buf='machine_learning_df.csv')

## Inspect and drop the machine learning data stored in the Mongo database

In [116]:
# NOTE(review): `find_` is not a query method. As the output below shows,
# this attribute access just resolves to a sub-collection named
# 'my_collection.find_'. Presumably a leftover from tab completion -
# TODO remove this cell.
wiki_col.find_

Collection(Database(MongoClient(host=['54.201.199.246:27016'], document_class=dict, tz_aware=False, connect=True), 'wikipedia'), 'my_collection.find_')

In [117]:
# Pull every document of the collection into memory as a list of dicts.
mong_db = [document for document in wiki_col.find()]

In [123]:
# WARNING: drops the whole collection on the server, including every
# article inserted by wiki_traverse. The listing in the next cell shows no
# collections left in wiki_db afterwards.
wiki_col.drop()

In [124]:
# Show which databases exist on the server and which collections are left
# in wiki_db.
(client.database_names(), wiki_db.collection_names())

(['admin', 'local', 'my_database', 'test'], [])