In [1]:
import re
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import pymongo
import json

from nltk.corpus import stopwords
from spacy.en import English
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from spacy.en import STOP_WORDS
# Instantiate the spaCy English language class imported above.
# NOTE(review): `nlp` is not referenced by any cell visible in this part of the notebook — confirm it is still needed.
nlp = English()
# NLTK's English stop-word list, de-duplicated into a set.
# NOTE(review): the TF-IDF cell below passes stop_words='english' to scikit-learn and does not use `stop`.
stop = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [2]:
# Open a client against the remote MongoDB instance (host and port are fixed here).
client = pymongo.MongoClient(host='54.201.199.246', port=27016)

# Handle on the `wikipedia` database; subscript access is equivalent to attribute access.
wiki_db = client['wikipedia']

# Handle on the collection that holds the article documents.
wiki_col = wiki_db['my_collection']

In [34]:
# Sanity check: list the databases on the server and the collections inside `wikipedia`.
# NOTE(review): `database_names()` / `collection_names()` look like legacy PyMongo spellings; newer
# releases reportedly expose `list_database_names()` / `list_collection_names()` — confirm against
# the installed PyMongo version before upgrading.
client.database_names(), wiki_db.collection_names()

(['admin', 'local', 'my_database', 'test', 'wikipedia'], ['my_collection'])

In [3]:
# Number of documents stored in the collection (before any de-duplication done later on).
# NOTE(review): `Collection.count()` looks like a legacy PyMongo call; newer releases appear to
# use `count_documents({})` instead — confirm against the installed PyMongo version.
wiki_col.count()

5654

In [4]:
# Query with no filter: the cursor iterates over every document in `my_collection`.
cursor = wiki_col.find()

In [5]:
# Run the query inside this cell so the cell can be re-executed on its own: a PyMongo cursor
# can only be consumed once, so rebuilding the frame from the `cursor` variable created in the
# previous cell would silently produce an empty DataFrame on a second execution.
# Each MongoDB document becomes one row; document keys become columns.
wiki_df = pd.DataFrame(list(wiki_col.find()))

In [6]:
# Class balance: number of loaded documents per main category.
wiki_df['main_cat'].value_counts()

Business software    4117
Machine learning     1537
Name: main_cat, dtype: int64

In [7]:
# Preview the first rows to check which fields each document carries.
wiki_df.head()

Unnamed: 0,_id,article,content,main_cat,page_id,sub_cat
0,5a15de5730b30c01325f0260,Business software,merge enterprise software date october softw...,Business software,1037763,Business software
1,5a15de5830b30c01325f0261,AccuSystems,multiple issue orphan date february notabili...,Business software,41270069,Business software
2,5a15de5830b30c01325f0262,Active policy management,active policy management business orient ent...,Business software,5211212,Business software
3,5a15de5830b30c01325f0263,Alexandria (library software),use alexandria alexandria browser base softw...,Business software,28502793,Business software
4,5a15de5930b30c01325f0264,Alteryx,infobox company name alteryx logo file alter...,Business software,44133735,Business software


In [8]:
# Keep a single row per Wikipedia page id (the first occurrence is retained by default).
# The surviving rows keep their original index labels; the index is not reset.
wiki_df = wiki_df.drop_duplicates(subset=['page_id'])

## Use TF-IDF to vectorize words

In [9]:
# Ignore scikit-learn's built-in English stop words and any term that appears
# in fewer than 5 articles.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=5)

# Sparse article-by-term TF-IDF matrix, learned from the article text.
article_term_matrix_sps = tfidf_vectorizer.fit_transform(wiki_df['content'])

# Dense, labelled copy of the same matrix: rows follow wiki_df's index,
# columns are the vocabulary terms.
article_term_matrix_df = pd.DataFrame(
    data=article_term_matrix_sps.toarray(),
    columns=tfidf_vectorizer.get_feature_names(),
    index=wiki_df.index,
)

In [10]:
# Preview the dense TF-IDF matrix (rows: articles, columns: vocabulary terms).
article_term_matrix_df.head()


Unnamed: 0,aa,aaa,aaai,aachen,aalst,aalto,aaron,ab,abacus,abandon,...,zoom,zoomable,zootycoon,zope,zoubin,zserie,zu,zur,zurich,zx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# pd.concat([wiki_df.article, wiki_df.content, article_term_matrix_df], axis=1).sample(4)

## Use SVD to reduce number of features

In [12]:
from sklearn.decomposition import TruncatedSVD

In [13]:
# Number of latent dimensions kept by the decomposition.
n_components = 500

# Fix the seed: TruncatedSVD's default solver is randomized, so without a
# random_state the fitted components (and every cosine similarity derived from
# them) change each time the notebook is re-run.
SVD = TruncatedSVD(n_components=n_components, random_state=42)

# Column labels "component_1" ... "component_<n_components>" (1-based).
component_names = ['component_{}'.format(i) for i in range(1, n_components + 1)]

In [14]:
# Fit on the sparse TF-IDF matrix instead of its dense DataFrame copy.
# TruncatedSVD accepts sparse input directly, which avoids pushing the full dense
# article-by-term array through the solver, and it keeps the fit input type the
# same as the sparse vectors later passed to SVD.transform() for search queries.
# Row order is identical to article_term_matrix_df, which was built from this matrix.
svd_matrix = SVD.fit_transform(article_term_matrix_sps)

In [15]:
# Total share of variance retained by the kept components
# (sum of the per-component explained-variance ratios).
sum(SVD.explained_variance_ratio_)

0.52821702070259036

In [16]:
# Article coordinates in the reduced space: one row per article, one column per component.
svd_df = pd.DataFrame(data=svd_matrix,
                      columns=component_names,
                      index=article_term_matrix_df.index)
# Attach the article title; assignment aligns on the shared index.
svd_df['article'] = wiki_df['article']

# Term loadings: SVD.components_ is component-by-term, so transpose to get one
# row per vocabulary term and one column per component.
vocabulary_expression = pd.DataFrame(data=SVD.components_,
                                     columns=tfidf_vectorizer.get_feature_names(),
                                     index=component_names).transpose()

In [17]:
# Preview the reduced representation: SVD component columns plus the article title.
svd_df.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_492,component_493,component_494,component_495,component_496,component_497,component_498,component_499,component_500,article
0,0.404168,-0.056097,-0.039684,0.106238,-0.025485,0.182379,0.199905,0.079294,-0.107343,0.030513,...,0.008019,-0.00122,-0.000431,-0.010245,0.002946,0.018834,0.005699,0.001815,-0.005592,Business software
1,0.390635,-0.060633,-0.163631,0.281441,-0.074152,-0.036461,-0.064391,0.024975,-0.010817,-0.091205,...,0.010298,0.01278,-0.017377,-0.00012,0.024399,-0.006418,-0.018228,0.022363,0.029026,AccuSystems
2,0.183262,-0.027491,-0.010343,0.080166,0.060557,0.076863,0.041822,-0.033945,-0.070309,0.053343,...,0.012001,-0.000839,-0.02181,0.017554,0.011512,0.006574,-0.023655,-0.036717,0.008784,Active policy management
3,0.166694,-0.027517,-0.030793,-0.014078,0.000305,-0.000567,-0.013724,0.005905,0.001836,-0.002672,...,0.006311,0.016423,0.030078,-0.024864,0.022472,-0.05002,-0.019248,-0.000754,0.002544,Alexandria (library software)
4,0.291406,-0.043149,-0.110445,0.221959,-0.089297,-0.045727,-0.089188,0.003913,-0.038563,-0.061572,...,0.031528,-0.002506,-0.002626,-0.013431,0.005758,-0.010359,0.013569,0.009147,0.007183,Alteryx


In [18]:
# Add the magnitude of each term's loading on the first ten components, so terms
# can be ranked by strength regardless of sign.
for component_number in range(1, 11):
    source_column = 'component_{}'.format(component_number)
    target_column = 'abs_component_{}'.format(component_number)
    vocabulary_expression[target_column] = vocabulary_expression[source_column].abs()

In [19]:
# Seven terms with the largest absolute loading on the first component.
vocabulary_expression['abs_component_1'].sort_values(ascending=False).head(7)

software      0.292050
company       0.180620
management    0.175485
category      0.172117
com           0.142446
http          0.140439
game          0.135387
Name: abs_component_1, dtype: float64

## Create function to search for top 5 related articles

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
def search_for_pages(search_terms, top_n=5):
    '''
    Return the articles in the Wikipedia corpus most related to a free-text query.

    The query is encoded with the fitted TF-IDF vectorizer, projected into the
    fitted SVD space, and compared against every article's SVD vector using
    cosine similarity.

    Params
    ------
    search_terms: str
        A string of words describing what to search for.
    top_n: int, default 5
        Number of best-matching articles to return.

    Returns
    -------
    A DataFrame with columns ['article', 'cosine_sim'] holding the `top_n`
    articles with the highest cosine similarity, sorted from most to least
    similar. The index labels are those of the matching rows in `svd_df`.

    Notes
    -----
    Relies on the notebook-level objects `tfidf_vectorizer`, `SVD` and `svd_df`;
    `svd_df` itself is never modified.
    '''

    # The vectorizer expects an iterable of documents, so wrap the single query.
    query_tfidf = tfidf_vectorizer.transform([search_terms])

    # Project the query into the same reduced space as the articles.
    query_svd = SVD.transform(query_tfidf)

    # Only the component columns are features; 'article' is just a label.
    article_vectors = svd_df.drop('article', axis=1)

    # cosine_similarity returns an (n_articles, 1) array; flatten it to one
    # score per article so it maps cleanly onto a single column.
    scores = cosine_similarity(article_vectors, query_svd).ravel()

    # Build a small two-column result instead of copying the whole SVD frame.
    results = svd_df[['article']].copy()
    results['cosine_sim'] = scores

    return results.sort_values('cosine_sim', ascending=False).head(top_n)

In [50]:
# Example query: look up the articles closest to a free-text description.
search_for_pages('recommender systems for movies')

Unnamed: 0,article,cosine_sim
992,Collaborative filtering,0.65593
2819,Content discovery platform,0.44693
4262,Preference learning,0.319424
767,Qloo,0.277688
4930,Jubatus,0.272441


## Create labels for the main categories

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
# Map each main-category string to an integer class id and store it alongside the articles.
le = LabelEncoder()
le.fit(wiki_df['main_cat'])
wiki_df['cat_numerical'] = le.transform(wiki_df['main_cat'])

In [30]:
# (rows, columns) after de-duplication and after adding the `cat_numerical` column.
wiki_df.shape

(4139, 7)