In [2]:
import re
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import pymongo
import json
import pymongo

from nltk.corpus import stopwords
from spacy.en import English
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from spacy.en import STOP_WORDS
from bs4 import BeautifulSoup
nlp = English()
stop = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [36]:
client = pymongo.MongoClient('54.201.199.246', 27016)

wiki_db = client.wikipedia

wiki_col = wiki_db.my_collection

## Create functions to get data from Wiki API

In [37]:
def category_request(category):
    """
    Scrape a category page from Wikipedia API.
    
    Params:
    --------
    category: str
        The name of the category to be scraped.
        
    Returns:
    --------
    DataFrame
        Pandas DataFrame containing categories 
        
    """
    my_params = {
        'action':'query',
        'format':'json',
        'list':'categorymembers',
        'cmtitle': 'Category:{}'.format(category),
        'cmlimit': 'max'
        }
    page = requests.get('http://en.wikipedia.org/w/api.php', params=my_params)
    return pd.DataFrame(page.json()['query']['categorymembers'])


In [38]:
def get_content(title):
    """
    Scrape a page from Wikipedia API to get the conecnt.
    
    Params:
    --------
    title: str
        The name of the page to be scraped.
        
    Returns:
    --------
    List of the contents of the page
        
    """
    my_params = {
        'action':'query',
        'format':'json',
        'titles': title,
        'prop': 'revisions',
        'rvprop': 'content'
    }
    content = requests.get('http://en.wikipedia.org/w/api.php', params=my_params)
    return list(content.json()['query']['pages'].values())[0]['revisions'][0]['*']


In [39]:
# get_content('Machine_learning')

In [40]:
def get_cats_and_pages(category):
    """
    Returns the pages and subcategories of a category
    
    Params
    ------
    category : str
        Name of a category
    
    Returns
    -------
    children: list 
        list of sub categories 
    pages: list
        list of pages on the category
    
    page_id = list of page_ids for each page
        
    """
    cats = pd.DataFrame(category_request(category))
    cats['title'] = cats.title.astype(str) 
    #returns a boolean mask of all titles with 'category' in the str
    subs_mask = cats['title'].str.contains('Category:')
    
    #creates list of new sub catagories
    children = list(cats['title'][subs_mask].str.replace('Category:', ""))
    pages = list(cats['title'][~cats.title.str.contains('Category:')])
    page_id = list(cats['pageid'][~cats.title.str.contains('Category:')])
    return page_id, pages, children

#sub_categories, pages

In [41]:
def cleaner(text):
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    text = re.sub('<.*>.*</.*>','', text)
    text = re.sub('[\d]','',text)
    text = re.sub('[^a-z ]',' ',text)
    text = re.sub(u'<.*>','',text)
    text = re.sub(u'[^a-z\s]',' ',text)
    text = re.sub("\\s+", " ", text)
    text = nlp(text)
    text = [str(i.lemma_) for i in text if str(i.orth_) not in stop]
    text = ' '.join(text)

    return text

In [42]:

def wiki_traverse(main_cat, category, max_depth=-1):
    """ 
    
    
    Params
    ------
    category : str
        Name of a category
    
    Returns
    -------
    Does not return anything, function automatically feeds dictionaries of category, articles/
    and content into Mongo database. 
        
    """
    
    if max_depth != 0:

        page_id, pages, children = get_cats_and_pages(category)
        
        for index, article in enumerate(pages):    
            article_dict = {}
            article_dict['main_cat'] = main_cat
            article_dict['sub_cat'] = category
            article_dict['article'] = article
            article_dict['page_id'] = str(page_id[index])  
            article_dict['content'] = cleaner(get_content(article))
            
            #this line adds each article onto mongo database as each article is being called 
            wiki_col.insert_one(article_dict)
#             print(category)
            
        for child in children:
            wiki_traverse(main_cat, child, max_depth-1)
            

In [44]:
wiki_traverse('Business software', 'Business software', max_depth=3)

Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business software
Business s

Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business simulation games
Business sim

Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software companies
Business software co

Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborative software
Collaborati

Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise application integration
Enterprise applicati

Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Accounting software
Computer-aided audit tools
Computer-aided audit tools
Computer-aided audit tools
Computer-aided audit tools
Computer-aided audit tools
Cost analysis software
Cost analysis software
Financial markets software
Financial markets software
Financial markets software
Financial markets software
Financial markets software
Financial software companies
Financial software companies
Financial software companies
Financial software companies
Financial software companies
Financial software 

Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free content management systems
Free con

Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Electronic health records
Free healthcare software
Free healthcare software
Free healthcare software
Free healthcare software
Free healthcare software
Free healthcare software
Free healthcare software
Free healthcare software
Free healthcare software
Free healthcare softw

Java enterprise platform
Java enterprise platform
Java enterprise platform
Java enterprise platform
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content management systems
Content m

Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management software
Project management s

Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Desktop publishing software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Reporting software
Free reporting software
Free reporting software
Free reporting software
Free reporting software
Free reporting software
Free reportin

Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business software stubs
Business softwar

In [51]:
wiki_traverse('Machine learning', 'Machine learning', max_depth=3)           


Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learning
Machine learni

Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Neural network software
Bayesian networks
Bayesian networks
Bayesian networks
Bayesian networks
Bayesian networks
Bayesian networks
Bayesian networks
Bayesian networks
Bayesian networks
Bayesian networks
Bayesian networks
Bayesian networks
Classification algorithms
Classification algorithms
Classification algorithms
Classification algorithms
Classification algorithms
Classification algorithms
Classification algorithms
Classification algorithms
Classification algorithm

Cluster analysis algorithms
Cluster analysis algorithms
Cluster analysis algorithms
Cluster analysis algorithms
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Clustering criteria
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theory
Computational learning theor

Gene expression programming
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic algorithms
Genetic programming
Genetic programming
Genetic programming
Genetic programming
Genetic programming
Genetic programming
Genetic programming
Genetic programming
Genetic progra

Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learning researchers
Machine learni

In [52]:
wiki_col.count()

5654