In [1]:
import re
import requests
import pandas as pd

In [2]:
def generate_category(category):
    '''
    format a category for insertion in to a wikipedia api call
    '''
    category = re.sub('\s','+',category)
    return category

In [3]:
generate_category('machine learning')

'machine+learning'

In [4]:
def generate_query(category):
    '''
    Format an api call for requests
    '''
    query = """
            http://en.wikipedia.org/w/api.php?
            action=query&
            format=json&
            list=categorymembers&
            cmtitle=Category:{}& 
            cmlimit=max
            """.format(generate_category(category))
    query = re.sub('\s','',query)
    return query

In [5]:
generate_category('machine_learning')

'machine_learning'

In [6]:
def execute_category_query(category):
    '''
    Executes a category qeury and returns a 
    DataFrame of the category members
    '''
    
    r = requests.get(generate_query(category))
    response = r.json()
    return response

In [7]:
def execute_category_query(category):
    '''
    Executes a category qeury and returns a 
    DataFrame of the category members
    '''
    
    r = requests.get(generate_query(category))
    response = r.json()
    return pd.DataFrame(response['query']['categorymembers'])

In [8]:
test = execute_category_query("machine learning")

In [9]:
test

Unnamed: 0,ns,pageid,title
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...
1,0,43385931,Data exploration
2,0,49082762,List of datasets for machine learning research
3,0,233488,Machine learning
4,0,53587467,Outline of machine learning
5,0,3771060,Accuracy paradox
6,0,43808044,Action model learning
7,0,28801798,Active learning (machine learning)
8,0,45049676,Adversarial machine learning
9,0,52642349,AIVA


In [10]:
category_mask = test['title'].str.contains('Category:')

In [11]:
subcat_df_list = []

In [12]:
def remove_category(category):
    category = re.sub('Category:', '', category)
    return category

In [13]:
test[category_mask].head()['title'].apply(remove_category)

200      Applied machine learning
201    Artificial neural networks
202             Bayesian networks
203     Classification algorithms
204              Cluster analysis
Name: title, dtype: object

In [14]:
categories_to_query = test[category_mask]['title'].apply(remove_category).tolist()

In [15]:
for category in categories_to_query:
    subcat_df_list.append(execute_category_query(category))

In [16]:
test[category_mask].head()['title'].apply(remove_category)

200      Applied machine learning
201    Artificial neural networks
202             Bayesian networks
203     Classification algorithms
204              Cluster analysis
Name: title, dtype: object

In [17]:
subcat_df_list[1]

Unnamed: 0,ns,pageid,title
0,0,21523,Artificial neural network
1,0,28016652,Types of artificial neural networks
2,0,14179835,Activation function
3,0,8220913,ADALINE
4,0,31663887,Adaptive neuro fuzzy inference system
5,0,3056879,Adaptive resonance theory
6,0,4231161,ALOPEX
7,0,16167377,Artificial Intelligence System
8,0,349771,Artificial neuron
9,0,51404222,Artisto


In [18]:
test[~category_mask].head()

Unnamed: 0,ns,pageid,title
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...
1,0,43385931,Data exploration
2,0,49082762,List of datasets for machine learning research
3,0,233488,Machine learning
4,0,53587467,Outline of machine learning


In [18]:
def get_all_pages_rec(category):
    category_df = execute_category_query(category)
    pages_list = []
    category_mask = category_df['title'].str.contains('Category:')
    pages_df = category_df[~category_mask]
    pages_list.append(pages_df)
    categories = category_df[category_mask]['title']\
                            .str.replace('Category:','').tolist()
    if len(categories) > 0:
        for cat in categories:
            pages_list.append(get_all_pages_rec(cat))
    
    pages_df = pd.concat(pages_list)
    pages_df.reset_index()
    return pages_df

In [19]:
rec_test = get_all_pages_rec('machine learning')

In [20]:
rec_test

Unnamed: 0,ns,pageid,title
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...
1,0,43385931,Data exploration
2,0,49082762,List of datasets for machine learning research
3,0,233488,Machine learning
4,0,53587467,Outline of machine learning
5,0,3771060,Accuracy paradox
6,0,43808044,Action model learning
7,0,28801798,Active learning (machine learning)
8,0,45049676,Adversarial machine learning
9,0,52642349,AIVA


In [21]:
rec_test.reset_index(drop=True).shape

(1614, 3)

In [22]:
rec_test = rec_test.reset_index(drop=True).drop_duplicates()

In [23]:
def get_whole_category(category):
    df = get_all_pages_rec(category)
    df = df.drop_duplicates().reset_index(drop=True)
    df['category'] = category
    return df

In [24]:
gwc_test = get_whole_category("machine learning")

In [25]:
gwc_test.tail()

Unnamed: 0,ns,pageid,title,category
1102,0,46096,Simpson's paradox,machine learning
1103,0,39177819,Cognitive computer,machine learning
1104,0,50073184,Generative adversarial networks,machine learning
1105,0,404084,Hebbian theory,machine learning
1106,0,47805,Vector quantization,machine learning


In [29]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Running setup.py bdist_wheel for wikipedia ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/bf/87/25/df698dd7b66a42c1c5f3bd36f8155d4518d210f5e2c128b440
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [26]:
import wikipedia

In [27]:
my_list = [wikipedia.WikipediaPage(r['title']).content for i, r in rec_test.iterrows()]

In [28]:
rec_test['page'] = my_list

In [29]:
rec_test.head()

Unnamed: 0,ns,pageid,title,page
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...,\n= Customer Intelligence Management =\n\n\n==...
1,0,43385931,Data exploration,Data exploration is an approach similar to ini...
2,0,49082762,List of datasets for machine learning research,These datasets are used for machine learning r...
3,0,233488,Machine learning,Machine learning is the subfield of computer s...
4,0,53587467,Outline of machine learning,The following outline is provided as an overvi...


In [30]:
MLdf = rec_test

In [31]:
MLdf.to_pickle('../data/MLdf.pickle')

In [1]:
!pip install psycopg2



In [2]:
import psycopg2
from psycopg2.extras import RealDictCursor