In [63]:
#!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.19.4-py2.py3-none-any.whl (50kB)
[K    100% |████████████████████████████████| 51kB 603kB/s ta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.19.4


In [None]:
# cd to directory above to acces sub-directories
cd ..

In [2]:
import re
import pickle
import pymongo
import requests

from tqdm import tqdm

In [14]:
def get_wiki_data(url_api, params):
    
    '''This function gets data requested from a specified api url.
       url_api: a url to api such as "https://en.wikipedia.org/w/api.php?"
       params : a dictionary to setup the fetch request'''
    
    response = requests.get(url_api, params=params)
    return(response.json())

def get_wiki_pages(url_api, params, tree_depth):

    '''This function recursively gets all pages, from the top level and subcategories,
       for the specified category, in the params dictionary.       
       url_api: a url to api such as "https://en.wikipedia.org/w/api.php?"
       params : a dictionary to setup the fetch request'''

    global cat_cnt; global pg_cnt
    
    # fetch data.
    data = get_wiki_data(url_api, params)
    
    # Pages in category.
    pages = [{'pageid':entry['pageid'], 'title':entry['title']} 
             for entry in data['query']['categorymembers'] if entry['ns']==0]

    # subcategories in this category
    sub_categories = [{'pageid':entry['pageid'], 'title':entry['title']}
                      for entry in data['query']['categorymembers'] if entry['ns']==14]
    
    cat_cnt+=len(sub_categories)
    pg_cnt+=len(pages)
    
    # If there are subcategories in this category's page, also get their pages.
    if ((len(sub_categories) != 0) & (tree_depth != 0)):
        tree_depth -=1
        for category in sub_categories:
            
            # Update the category we want to fetch as this current subcategory
            params["cmtitle"] = category['title']
            
            # Recursively get all pages and subcategory pages for this 
            # current subcategory
            pgs = get_wiki_pages(url_api, params=params, tree_depth=tree_depth)
            
            # append all pages from this current subcategory
            pages += pgs            
        
    return(pages)

def get_wiki_full_category_page_list(category, tree_depth):
    
    '''This function gets the entire requested data, for a category.
       category : a wiki page category, such as "Category:machine_learning"'''

    # url address of site's api, which we want to scrape.
    url_api = "https://en.wikipedia.org/w/api.php?"

    params = {
    "action": "query",
    "format": "json",
    "list": "categorymembers",
    "cmtitle": category,
    "cmlimit": "max"}  
    
    full_page_list = get_wiki_pages(url_api, params, tree_depth)
    return(full_page_list)

def get_wiki_page_content(page_title, page_ID):
    
    '''This function extracts the content for the requested page.
       page_title : title of wiki page
       page_ID    : the id of the page"'''

    # url address of site's api, which we want to scrape.
    url_api = "https://en.wikipedia.org/w/api.php?"

    params = {
    "action": "query",
    "format": "json",
    "prop": "extracts",
    "titles": page_title,
    "exlimit": "max"
    }
    
    # extracts contents of the specified page.
    data = get_wiki_data(url_api, params) 
    page_content = data['query']['pages'][page_ID]['extract']
    
    return(page_content)

def get_wiki_full_category_content(category):
    
    '''This function get the content for all the pages/subpages of the requested category.
       Top-level function. Call this function with a specified category.
       category : a wiki page category, such as "Category:machine_learning"'''
    
    # retrieve a list of dictionaries with all the pages and subpages related to this wiki category.
    entire_category_page_lst = get_wiki_full_category_page_list(category)
    
    # get the text of wiki pages and add them to that pages' dictionary, with key value 'text'
    for page_dict in tqdm(entire_category_page_lst):
        page_dict['text'] = get_wiki_page_content(page_dict['title'], str(page_dict['pageid']))

    return(entire_category_page_lst)


In [4]:
def mongoDB_create_collection(db_name, collection_name, collection_lst):
    
    # Instantiate client to our Mongo Server
    client = pymongo.MongoClient('35.163.170.219', 27016)

    # Make a new database
    db_ref = client[db_name]

    # Create a reference to my_collection
    coll_ref = db_ref[collection_name]

    # Use the collection reference to insert the ML_page_contents
    for doc in tqdm(collection_lst):
        coll_ref.insert_one(doc)

def mongoDB_read_collection(db_name, collection_name):
    
    # Instantiate client to our Mongo Server
    client = pymongo.MongoClient('35.163.170.219', 27016)

    # Make a new database
    db_ref = client[db_name]

    # Create a reference to my_collection
    coll_ref = db_ref[collection_name] 
    
    # Read collection into a list.
    coll_lst = list(coll_ref.find())
    
    return(coll_lst)
    
def mongoDB_get_DBnames():  
    
    # Instantiate client to our Mongo Server
    client = pymongo.MongoClient('35.163.170.219', 27016)    
    
    # Databases on our MongoDB Server.
    names = client.database_names()
    
    return(names)

def mongoDB_get_collection_names(db_name):
    
    # Instantiate client to our Mongo Server
    client = pymongo.MongoClient('35.163.170.219', 27016)    
    
    # Make a new database
    db_ref = client[db_name]
    
    names = db_ref.collection_names()
    
    return(names)
    
    

In [28]:
# clean text.
def remove_tags(text):
    TAG_RE = re.compile(r'<[^>]+>')
    text = TAG_RE.sub('', text)
    text = re.sub('\n', '', text)
    return text

def clean_text(page_lst):
    for page in tqdm(page_lst):
        page['text'] = remove_tags(page['text'])
    
    return(page_lst)

In [6]:
# pickle list of dictionaries for each page in the category: machine learning.
def pickle_obj(filename, objname):
    filehandler = open(filename+'.obj',"wb")
    pickle.dump(objname,filehandler)

# read pickled list of dictionaries for each page in the category: machine learning.
def read_pickle_obj(filename):
    file = open(filename+'.obj','rb')
    object_content = pickle.load(file)
    return(object_content)

In [32]:
print("num pages: ", pg_cnt, "num categories: ", cat_cnt)

num pages:  1620 num categories:  62


In [70]:
# fetch wikipedia's 'category: machine learning' page content, 
# for pages in its categories and subcategories.
category = "Category:Machine_learning"
cat_cnt = 1; pg_cnt = 0

entire_category_data_list = get_wiki_full_category_content(category)

100%|██████████| 1620/1620 [19:55<00:00,  1.36it/s]


In [32]:
# get database names on mongo server.
mongoDB_get_DBnames()

['admin', 'local', 'my_database', 'test', 'wiki_database']

In [40]:
# get collection names on specified mongo db.
mongoDB_get_collection_names('wiki_database')

['wiki_ML_collection', 'wiki_ML_clean_collection']

In [41]:
# get ML collection from mongoDB.
ML_raw_collection = mongoDB_read_collection('wiki_database', 'wiki_ML_collection')

In [42]:
# get clean text ML collection from mongoDB.
ML_clean_collection = mongoDB_read_collection('wiki_database', 'wiki_ML_clean_collection')

In [48]:
# pickle raw ML pages list.
pickle_obj('wiki_ML_rawtext_pages_lst', ML_raw_collection)

In [49]:
# pickle clean ML pages list.
pickle_obj('wiki_ML_cleantext_pages_lst', ML_clean_collection)

In [43]:
ML_raw_collection[0]['text']

'<p><b>Data exploration</b> is an approach similar to initial data analysis, whereby a data analyst uses visual exploration to understand what is in a dataset and the characteristics of the data, rather than through traditional data management systems. These characteristics can include size or amount of data, completeness of the data, correctness of the data, possible relationships amongst data elements or files/tables in the data.</p>\n<p>Data exploration is typically conducted using a combination of automated and manual activities. Automated activities can include data profiling or data visualization or tabular reports to give the analyst an initial view into the data and an understanding of key characteristics.</p>\n<p>This is often followed by manual drill-down or filtering of the data to identify anomalies or patterns identified through the automated actions. Data exploration can also require manual scripting and queries into the data (e.g. using languages such as SQL or R) or usi

In [44]:
ML_clean_collection[0]['text']

'Data exploration is an approach similar to initial data analysis, whereby a data analyst uses visual exploration to understand what is in a dataset and the characteristics of the data, rather than through traditional data management systems. These characteristics can include size or amount of data, completeness of the data, correctness of the data, possible relationships amongst data elements or files/tables in the data.Data exploration is typically conducted using a combination of automated and manual activities. Automated activities can include data profiling or data visualization or tabular reports to give the analyst an initial view into the data and an understanding of key characteristics.This is often followed by manual drill-down or filtering of the data to identify anomalies or patterns identified through the automated actions. Data exploration can also require manual scripting and queries into the data (e.g. using languages such as SQL or R) or using Excel or similar tools to

In [8]:
# fetch wikipedia's 'category: Business software' page content, 
# for pages in its categories and subcategories.
# category = "Category:Business_software"
category = "Category:Business_software"

cat_cnt = 1; pg_cnt = 0

entire_category_data_list = get_wiki_full_category_content(category)

print("num pages: ", pg_cnt, "num categories: ", cat_cnt)

subcat:  Category:Administrative software depth:  1
subcat:  Category:School-administration software depth:  1
subcat:  Category:Time-tracking software depth:  2
subcat:  Category:Business simulation games depth:  2
subcat:  Category:Amusement park simulation games depth:  1
subcat:  Category:M.U.L.E. depth:  2


  0%|          | 0/564 [00:00<?, ?it/s]

EXIT subcat:  Category:Roller coaster games and simulations depth:  2
EXIT subcat:  Category:Business software companies depth:  2


100%|██████████| 564/564 [02:15<00:00,  4.17it/s]

num pages:  564 num categories:  42





In [9]:
len(entire_category_data_list)

564

In [15]:
BS_page_lst = get_wiki_full_category_page_list(category = "Category:Business_software", 
                                               tree_depth = 2)

In [16]:
len(BS_page_lst)

4077