In [8]:
import re
from collections import deque

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm_notebook

#### Below is the Wikipedia API call for a category search:

`http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+machine+learning&cmlimit=max`

`action=query`: query the Wikipedia API

`format=json`: return the response in JSON format

`list=categorymembers`: List of pages that belong to a given category, ordered by page sort title

`cmtitle=Category%3A+machine+learning`: title of category

`cmlimit=max`: return up to the maximum number of results per request (500)

You may use this to get page titles from the wikipedia API. Things to watch out for:
* The responses contain subcategories (titles prefixed with `Category:`) as well as articles
* You will want to fetch the articles in those subcategories too

The API's detailed documentation can be found [here](https://www.mediawiki.org/wiki/API:Main_page)

#### Make a function that formats a request for pages of a category

In [2]:
# # use regex to replace name of category in the search string
# #'http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+machine+learning&cmlimit=max'
# category = re.sub('\s', '+', category) # replace spaces in category with +s so can insert into search string
# &cmtitle=Category%3A+machine+learning&

In [9]:
def category_request(category):
    """
    Fetch every member of a category from the Wikipedia API.

    A single response holds at most ``cmlimit`` members, so the request is
    repeated with the continuation parameters of the previous response until
    the API stops returning a ``continue`` entry.

    Params:
    --------
    category: str
        The name of the category to be scraped, without the ``Category:``
        prefix (it is added here).

    Returns:
    --------
    DataFrame
        Pandas DataFrame with one row per category member, exactly as returned
        by the API (pages and subcategories alike). Empty when the API returns
        no members.

    Raises:
    --------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If a response does not contain ``query`` / ``categorymembers``.

    """
    my_params = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': 'Category:{}'.format(category),
        'cmlimit': 'max'
        }
    members = []
    while True:
        # The timeout keeps a stalled connection from hanging the notebook.
        page = requests.get('https://en.wikipedia.org/w/api.php',
                            params=my_params, timeout=30)
        page.raise_for_status()
        payload = page.json()
        members.extend(payload['query']['categorymembers'])
        if 'continue' not in payload:
            break
        # Feed the continuation tokens back verbatim to get the next batch.
        my_params.update(payload['continue'])
    return pd.DataFrame(members)


In [10]:
def get_page(title):
    """
    Fetch the content of a page from the Wikipedia API.

    Params:
    --------
    title: str
        The name of the page to be scraped.

    Returns:
    --------
    dict
        The first revision entry the API returns for the page (requested with
        ``rvprop=content``, so it carries the page text).

    Raises:
    --------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If the response has no ``query`` / ``pages`` payload, or the page comes
        back without any revisions (e.g. the title does not exist).

    """
    my_params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'revisions',
        'rvprop': 'content'
    }
    # The timeout keeps a stalled connection from hanging the notebook.
    content = requests.get('https://en.wikipedia.org/w/api.php',
                           params=my_params, timeout=30)
    content.raise_for_status()
    # Only one title is requested, so only the first page entry is relevant.
    page = list(content.json()['query']['pages'].values())[0]
    if 'revisions' not in page:
        raise KeyError('No revisions returned for page {!r}'.format(title))
    return page['revisions'][0]
# get_page('Machine learning')

In [11]:
def get_cats_and_pages(category):
    """
    Returns the pages and subcategories of a category

    Params
    ------
    category : str
        Name of a category

    Returns
    -------
    pages: list
        list of titles of the pages in the category
    children: list
        list of subcategory names, with the 'Category:' prefix removed

    Both lists are empty when the category request returns no members.

    """
    cats = category_request(category)
    # An empty frame has no 'title' column to split on.
    if cats.empty or 'title' not in cats.columns:
        return [], []

    titles = cats['title'].astype(str)
    # Subcategories are the titles that *start* with the namespace prefix;
    # a page that merely mentions 'Category:' in its title stays a page.
    prefix = 'Category:'
    subs_mask = titles.str.startswith(prefix)

    children = [title[len(prefix):] for title in titles[subs_mask]]
    pages = list(titles[~subs_mask])
    return pages, children

#sub_categories, pages

In [12]:
def wiki_traverse(category, max_depth=None):
    """
    Returns a list of dictionary of categories, page titles and page contents

    Walks the category tree breadth-first starting at `category`. Each
    category is expanded only once, so cycles in the category graph cannot
    cause an endless loop, and each article is downloaded only once even when
    it is listed under several categories.

    Params
    ------
    category : str
        Name of a category
    max_depth : int or None
        How many levels of subcategories to descend into (0 = only the
        starting category). None, the default, follows every subcategory.

    Returns
    -------
    page_content: list
        list of dictionaries with keys 'category', 'article' and 'content',
        one per (category, article) pair in visiting order

    """
    queue = deque([(category, 0)])
    seen_categories = {category}
    content_cache = {}
    page_content = []

    while queue:
        current_node, depth = queue.popleft()
        pages, children = get_cats_and_pages(current_node)

        if max_depth is None or depth < max_depth:
            for child in children:
                # Skip categories already queued or visited.
                if child not in seen_categories:
                    seen_categories.add(child)
                    queue.append((child, depth + 1))

        for article in pages:
            # Reuse the downloaded content for articles seen in an earlier category.
            if article not in content_cache:
                content_cache[article] = get_page(article)
            page_content.append({
                'category': current_node,
                'article': article,
                'content': content_cache[article],
            })

    return page_content



In [None]:
# Crawl the 'Machine learning' category; every article triggers its own API
# request inside wiki_traverse, so this cell can take a long time.
Machine_learning = wiki_traverse(category='Machine learning')

In [None]:
# Turn the list of article records into a table (one row per record).
Machine_learning = pd.DataFrame(data=Machine_learning)

In [None]:
# Persist the crawl result to a CSV file in the working directory.
Machine_learning.to_csv(path_or_buf='machine_learning_df.csv')

In [None]:
# Same crawl for the 'Business software' category (also network-bound).
Business_software = wiki_traverse(category='Business software')