In [7]:
import re
import requests
import pandas as pd

In [8]:
def generate_category(category):
    '''
    format a category for insertion in to a wikipedia api call
    '''
    category = re.sub('\s','+',category)
    return category

In [9]:
generate_category('business software')

'business+software'

In [10]:
def generate_query(category):
    '''
    Format an api call for requests
    '''
    query = """
            http://en.wikipedia.org/w/api.php?
            action=query&
            format=json&
            list=categorymembers&
            cmtitle=Category:{}& 
            cmlimit=max
            """.format(generate_category(category))
    query = re.sub('\s','',query)
    return query

In [11]:
generate_category('business_software')

'business_software'

In [12]:
def execute_category_query(category):
    '''
    Executes a category qeury and returns a 
    DataFrame of the category members
    '''
    
    r = requests.get(generate_query(category))
    response = r.json()
    return response

In [13]:
def execute_category_query(category):
    '''
    Executes a category qeury and returns a 
    DataFrame of the category members
    '''
    
    r = requests.get(generate_query(category))
    response = r.json()
    return pd.DataFrame(response['query']['categorymembers'])

In [14]:
test = execute_category_query("business software")

In [15]:
test

Unnamed: 0,ns,pageid,title
0,0,1037763,Business software
1,0,41270069,AccuSystems
2,0,5211212,Active policy management
3,0,28502793,Alexandria (library software)
4,0,44133735,Alteryx
5,0,12715119,Amadeus CRS
6,0,24061342,AMS Device Manager
7,0,54594603,Angelfish software
8,0,1762176,Applicant tracking system
9,0,22847264,Application retirement


In [16]:
category_mask = test['title'].str.contains('Category:')

In [17]:
def remove_category(category):
    category = re.sub('Category:', '', category)
    return category

In [18]:
test[category_mask].head()['title'].apply(remove_category)

297        Administrative software
298      Business simulation games
299    Business software companies
300    Business software for Linux
301    Business software for MacOS
Name: title, dtype: object

In [19]:
subcat_df_list=[]

In [20]:
categories_to_query = test[category_mask]['title'].apply(remove_category).tolist()

In [21]:
for category in categories_to_query:
    subcat_df_list.append(execute_category_query(category))

In [22]:
test[category_mask].head()['title'].apply(remove_category)

297        Administrative software
298      Business simulation games
299    Business software companies
300    Business software for Linux
301    Business software for MacOS
Name: title, dtype: object

In [23]:
subcat_df_list[1]

Unnamed: 0,ns,pageid,title
0,0,4171371,Business simulation game
1,0,21362278,List of business simulation video games
2,0,20274682,Project management simulation
3,0,43302474,3rd World Farmer
4,0,13767273,A-Train
5,0,47155408,AdvertCity
6,0,444481,Aerobiz
7,0,949299,Aerobiz Supersonic
8,0,5764136,Air Bucks
9,0,4286484,Airline Tycoon


In [24]:
test[~category_mask].head()

Unnamed: 0,ns,pageid,title
0,0,1037763,Business software
1,0,41270069,AccuSystems
2,0,5211212,Active policy management
3,0,28502793,Alexandria (library software)
4,0,44133735,Alteryx


In [25]:
def get_all_pages_rec(category):
    category_df = execute_category_query(category)
    pages_list = []
    category_mask = category_df['title'].str.contains('Category:')
    pages_df = category_df[~category_mask]
    pages_list.append(pages_df)
    categories = category_df[category_mask]['title']\
                            .str.replace('Category:','').tolist()
    #if len(categories) > 0:
        #for cat in categories:
            #pages_list.append(get_all_pages_rec(cat))
    
    #pages_df = pd.concat(pages_list)
    #pages_df.reset_index()
    return category_df

In [26]:
rec_test = get_all_pages_rec('business software')

In [27]:
rec_test

Unnamed: 0,ns,pageid,title
0,0,1037763,Business software
1,0,41270069,AccuSystems
2,0,5211212,Active policy management
3,0,28502793,Alexandria (library software)
4,0,44133735,Alteryx
5,0,12715119,Amadeus CRS
6,0,24061342,AMS Device Manager
7,0,54594603,Angelfish software
8,0,1762176,Applicant tracking system
9,0,22847264,Application retirement


In [28]:
!pip install wikipedia



In [29]:
import wikipedia

In [30]:
my_list = [wikipedia.WikipediaPage(r['title']).content for i, r in rec_test.iterrows()]

In [31]:
rec_test['page'] = my_list

In [32]:
rec_test.head()

Unnamed: 0,ns,pageid,title,page
0,0,1037763,Business software,Business software or a business application is...
1,0,41270069,AccuSystems,AccuSystems LLC is an American company headqua...
2,0,5211212,Active policy management,Active policy management is business-oriented ...
3,0,28502793,Alexandria (library software),Alexandria is browser based cross-platform lib...
4,0,44133735,Alteryx,Alteryx is an American computer software compa...


In [33]:
rec_test.reset_index(drop=True).shape

(329, 4)

In [34]:
BSdf = rec_test

In [35]:
BSdf

Unnamed: 0,ns,pageid,title,page
0,0,1037763,Business software,Business software or a business application is...
1,0,41270069,AccuSystems,AccuSystems LLC is an American company headqua...
2,0,5211212,Active policy management,Active policy management is business-oriented ...
3,0,28502793,Alexandria (library software),Alexandria is browser based cross-platform lib...
4,0,44133735,Alteryx,Alteryx is an American computer software compa...
5,0,12715119,Amadeus CRS,Amadeus is a computer reservations system (or ...
6,0,24061342,AMS Device Manager,AMS Device Manager is plant asset management s...
7,0,54594603,Angelfish software,"Angelfish Software is an on-premises, self-hos..."
8,0,1762176,Applicant tracking system,An applicant tracking system (ATS) is a softwa...
9,0,22847264,Application retirement,"Application retirement, also called applicatio..."


In [36]:
BSdf.to_pickle('../data/BSdf.pickle')

In [37]:
MLdf = pd.read_pickle('../data/MLdf.pickle')

In [38]:
MLdf 

Unnamed: 0,ns,pageid,title,page
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...,\n= Customer Intelligence Management =\n\n\n==...
1,0,43385931,Data exploration,Data exploration is an approach similar to ini...
2,0,49082762,List of datasets for machine learning research,These datasets are used for machine learning r...
3,0,233488,Machine learning,Machine learning is the subfield of computer s...
4,0,53587467,Outline of machine learning,The following outline is provided as an overvi...
5,0,3771060,Accuracy paradox,The accuracy paradox for predictive analytics ...
6,0,43808044,Action model learning,Action model learning (sometimes abbreviated a...
7,0,28801798,Active learning (machine learning),Active learning is a special case of semi-supe...
8,0,45049676,Adversarial machine learning,Adversarial machine learning is a research fie...
9,0,52642349,AIVA,AIVA (Artificial Intelligence Virtual Artist) ...


In [39]:
MLdf['category'] = 'machine learning'

In [40]:
MLdf.head(4)

Unnamed: 0,ns,pageid,title,page,category
0,2,54972729,User:CustIntelMngt/sandbox/Customer Intelligen...,\n= Customer Intelligence Management =\n\n\n==...,machine learning
1,0,43385931,Data exploration,Data exploration is an approach similar to ini...,machine learning
2,0,49082762,List of datasets for machine learning research,These datasets are used for machine learning r...,machine learning
3,0,233488,Machine learning,Machine learning is the subfield of computer s...,machine learning


In [42]:
BSdf

Unnamed: 0,ns,pageid,title,page
0,0,1037763,Business software,Business software or a business application is...
1,0,41270069,AccuSystems,AccuSystems LLC is an American company headqua...
2,0,5211212,Active policy management,Active policy management is business-oriented ...
3,0,28502793,Alexandria (library software),Alexandria is browser based cross-platform lib...
4,0,44133735,Alteryx,Alteryx is an American computer software compa...
5,0,12715119,Amadeus CRS,Amadeus is a computer reservations system (or ...
6,0,24061342,AMS Device Manager,AMS Device Manager is plant asset management s...
7,0,54594603,Angelfish software,"Angelfish Software is an on-premises, self-hos..."
8,0,1762176,Applicant tracking system,An applicant tracking system (ATS) is a softwa...
9,0,22847264,Application retirement,"Application retirement, also called applicatio..."


In [41]:
BSdf['category'] = 'business software'

In [42]:
BSdf.head(10)

Unnamed: 0,ns,pageid,title,page,category
0,0,1037763,Business software,Business software or a business application is...,business software
1,0,41270069,AccuSystems,AccuSystems LLC is an American company headqua...,business software
2,0,5211212,Active policy management,Active policy management is business-oriented ...,business software
3,0,28502793,Alexandria (library software),Alexandria is browser based cross-platform lib...,business software
4,0,44133735,Alteryx,Alteryx is an American computer software compa...,business software
5,0,12715119,Amadeus CRS,Amadeus is a computer reservations system (or ...,business software
6,0,24061342,AMS Device Manager,AMS Device Manager is plant asset management s...,business software
7,0,54594603,Angelfish software,"Angelfish Software is an on-premises, self-hos...",business software
8,0,1762176,Applicant tracking system,An applicant tracking system (ATS) is a softwa...,business software
9,0,22847264,Application retirement,"Application retirement, also called applicatio...",business software


In [43]:
merge_data = BSdf.merge(MLdf, how='outer')


In [44]:
merge_data.shape

(1436, 5)

In [45]:
page_df = merge_data.drop('category', axis= 1)
page_df

Unnamed: 0,ns,pageid,title,page
0,0,1037763,Business software,Business software or a business application is...
1,0,41270069,AccuSystems,AccuSystems LLC is an American company headqua...
2,0,5211212,Active policy management,Active policy management is business-oriented ...
3,0,28502793,Alexandria (library software),Alexandria is browser based cross-platform lib...
4,0,44133735,Alteryx,Alteryx is an American computer software compa...
5,0,12715119,Amadeus CRS,Amadeus is a computer reservations system (or ...
6,0,24061342,AMS Device Manager,AMS Device Manager is plant asset management s...
7,0,54594603,Angelfish software,"Angelfish Software is an on-premises, self-hos..."
8,0,1762176,Applicant tracking system,An applicant tracking system (ATS) is a softwa...
9,0,22847264,Application retirement,"Application retirement, also called applicatio..."


In [46]:
category_df = merge_data[['category']]
category_df

Unnamed: 0,category
0,business software
1,business software
2,business software
3,business software
4,business software
5,business software
6,business software
7,business software
8,business software
9,business software


In [152]:
!pip install psycopg2



In [197]:
import psycopg2
from psycopg2.extras import RealDictCursor
HOST = 'postgres'
DBNAME = 'postgres'
USER = 'postgres'
# Connect to an existing database
con = psycopg2.connect(host=HOST, dbname = DBNAME, user = USER)
#Open a cursor to perform database operations
cur = con.cursor(cursor_factory=RealDictCursor)
#Execute a command: this creates a new table 
#cur.execute("CREATE TABLE page(page_id INTEGER PRIMARY KEY, page_title TEXT, page_page TEXT);") 
#cur.execute("CREATE TABLE category(category TEXT);")
cur.execute("CREATE TABLE linked(category_id INTEGER, page_id INTEGER);")
conn.commit()


In [206]:
# set up the 'category' table
# (cheat, since we already know all the values for it)

cur.execute("INSERT INTO category VALUES(0, 'machine learning');")
cur.execute("INSERT INTO category VALUES(1, 'business software');")
conn.commit()


InternalError: current transaction is aborted, commands ignored until end of transaction block


In [None]:
for row in BSdf.values():

    # add to the 'page' table
    page_id = BSdf['page_id']
    title = BSdf['title']
    content = BSdf['content']
    
    INSERT INTO page VALUES(page_id, title, content);