In [1]:
import json
import re
from collections import deque

import numpy as np
import pandas as pd
import pymongo
import requests
from tqdm import tqdm_notebook

In [2]:
# Connect to the MongoDB server that receives the scraped Wikipedia articles.
# NOTE(review): host and port are hardcoded; consider moving them to configuration.
client = pymongo.MongoClient(host='54.201.199.246', port=27016)

# Handles for the database and collection used by the rest of the notebook.
wiki_db = client['wikipedia']
wiki_col = wiki_db['my_collection']

## Create functions to get data from Wiki API

In [3]:
def category_request(category):
    """
    Fetch every member of a Wikipedia category through the MediaWiki API.

    The API answers in batches; this function follows the ``continue``
    tokens of each response so that categories larger than a single
    batch are returned in full instead of being truncated.

    Params:
    --------
    category: str
        The name of the category to be scraped, without the
        'Category:' prefix (the prefix is added here).

    Returns:
    --------
    DataFrame
        Pandas DataFrame with one row per 'categorymembers' record
        returned by the API. The frame has no rows and no columns when
        the category has no members.

    Raises:
    -------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If a response does not contain 'query' / 'categorymembers'.

    """
    my_params = {
        'action':'query',
        'format':'json',
        'list':'categorymembers',
        'cmtitle': 'Category:{}'.format(category),
        'cmlimit': 'max'
        }
    members = []
    while True:
        # HTTPS keeps the request encrypted end to end.
        page = requests.get('https://en.wikipedia.org/w/api.php', params=my_params)
        # Fail loudly on 4xx/5xx instead of tripping over a non-JSON body later.
        page.raise_for_status()
        payload = page.json()
        members.extend(payload['query']['categorymembers'])

        # No 'continue' block means the last batch has been received.
        if 'continue' not in payload:
            break
        # Feed the continuation tokens back to request the next batch.
        my_params.update(payload['continue'])

    return pd.DataFrame(members)


In [4]:
def get_content(title):
    """
    Fetch the content of a single Wikipedia page through the MediaWiki API.

    Params:
    --------
    title: str
        The name of the page to be scraped.

    Returns:
    --------
    dict
        The first entry of the 'revisions' list that the API returns for
        the page (requested with 'rvprop': 'content').

    Raises:
    -------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If the response has no 'query' / 'pages' payload, or the page
        comes back without any revisions.

    """
    my_params = {
        'action':'query',
        'format':'json',
        'titles': title,
        'prop': 'revisions',
        'rvprop': 'content'
    }
    # HTTPS keeps the request encrypted end to end.
    content = requests.get('https://en.wikipedia.org/w/api.php', params=my_params)
    # Fail loudly on 4xx/5xx instead of tripping over a non-JSON body later.
    content.raise_for_status()

    # 'pages' is keyed by page id; only one title was requested, so take
    # the first (only) entry.
    page = list(content.json()['query']['pages'].values())[0]
    if 'revisions' not in page:
        # Same exception type as before, but naming the offending page.
        raise KeyError('No revisions returned for page {!r}'.format(title))
    return page['revisions'][0]


In [5]:
def get_cats_and_pages(category):
    """
    Returns the page ids, pages and subcategories of a category

    Members whose title starts with 'Category:' are treated as
    subcategories; every other member is treated as a page.

    Params
    ------
    category : str
        Name of a category

    Returns
    -------
    page_id: list
        'pageid' values of the pages, in the same order as ``pages``
    pages: list
        titles of the pages in the category
    children: list
        names of the subcategories with the 'Category:' prefix removed

    All three lists are empty when the category has no members.

    """
    cats = category_request(category)

    # A category without members yields a frame with no 'title' column,
    # so there is nothing to split.
    if cats.empty or 'title' not in cats.columns:
        return [], [], []

    titles = cats['title'].astype(str)
    prefix = 'Category:'

    # Only a leading 'Category:' marks a subcategory; a title that merely
    # contains that text elsewhere is an ordinary page.
    subs_mask = titles.str.startswith(prefix)

    # Strip just the leading prefix to obtain the subcategory names.
    children = titles[subs_mask].str[len(prefix):].tolist()
    pages = titles[~subs_mask].tolist()
    page_id = cats['pageid'][~subs_mask].tolist()
    return page_id, pages, children

# get_cats_and_pages returns: page_id, pages, children

In [6]:
def wiki_traverse(category):
    """
    Crawl a category and its subcategories breadth-first, storing each article.

    Every article found is fetched with ``get_content``, inserted into
    the ``wiki_col`` collection as soon as it is retrieved, and added to
    the returned list. Each category is expanded at most once, so cycles
    or shared subcategories in the category graph cannot make the crawl
    loop or repeat itself.

    Params
    ------
    category : str
        Name of a category

    Returns
    -------
    page_content: list
        list of dictionaries with the keys 'category', 'article',
        'content' and 'page_id' (the page id as a string)

    """
    q = deque([category])
    # Categories already queued; guards against revisiting a category.
    seen = {category}

    page_content = []

    # while the q is not empty
    while q:
        current_node = q.popleft()

        page_id, pages, children = get_cats_and_pages(current_node)

        for child in children:
            if child not in seen:
                seen.add(child)
                q.append(child)

        for article_id, article in zip(page_id, pages):
            article_dict = {
                'category': current_node,
                'article': article,
                'content': get_content(article),
                'page_id': str(article_id),
            }

            # Persist each article immediately so that partial progress
            # survives if a later request fails.
            # NOTE(review): insert_one presumably adds an '_id' key to the
            # dict in place — confirm against the PyMongo docs.
            wiki_col.insert_one(article_dict)

            page_content.append(article_dict)

    return page_content

In [7]:
# Run the crawler on the 'Machine learning portal' category (presumably a trial run).
# wiki_traverse inserts every article it fetches into wiki_col as it goes.
test = wiki_traverse('Machine learning portal')

In [22]:
# Crawl the 'Machine learning' category together with its subcategories.
# wiki_traverse also inserts each article into wiki_col, so running this
# cell again inserts the same articles a second time.
Machine_learning = wiki_traverse('Machine learning')

##  Look up article in Mongo Database

In [27]:
# Show the databases on the server and the collections in the wikipedia database.
# database_names() / collection_names() were removed in PyMongo 4; the
# list_* methods are their replacements.
client.list_database_names(), wiki_db.list_collection_names()

(['admin', 'local', 'my_database', 'test', 'wikipedia'], ['my_collection'])

In [25]:
# Number of documents stored in the collection. Collection.count() was
# removed in PyMongo 4; count_documents with an empty filter counts everything.
wiki_col.count_documents({})

1636

In [23]:
# Open a cursor over the collection; no filter is passed to find().
cursor = wiki_col.find()

In [24]:
# Pull the next document from the cursor and display its 'content' field.
next(cursor)['content']

{'*': '{{Multiple issues|{{refimprove|date=July 2017}}{{more footnotes|date=July 2017}}}}\n\n\'\'\'Data exploration\'\'\' is an approach similar to initial data analysis, whereby a data analyst uses visual exploration to understand what is in a dataset and the characteristics of the data, rather than through traditional data management systems<ref name="Foster">[https://www.fosteropenscience.eu/sites/default/files/pdf/2933.pdf FOSTER Open Science], Overview of Data Exploration Techniques: Stratos Idreos, Olga Papaemmonouil, Surajit Chaudhuri.</ref>. These characteristics can include size or amount of data, completeness of the data, correctness of the data, possible relationships amongst data elements or files/tables in the data.\n\nData exploration is typically conducted using a combination of automated and manual activities.<ref name="Foster" /><ref name="Stanford2011">[http://vis.stanford.edu/files/2011-Wrangler-CHI.pdf Stanford.edu], 2011 Wrangler: Interactive Visual Specification o