In [2]:
import re
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import pymongo
import json

from nltk.corpus import stopwords
from spacy.en import English
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from spacy.en import STOP_WORDS
nlp = English()
stop = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [3]:
client = pymongo.MongoClient('54.201.199.246', 27016)

wiki_db = client.wikipedia

wiki_col = wiki_db.my_collection

In [4]:
client.database_names(), wiki_db.collection_names()

(['admin', 'local', 'my_database', 'test', 'wikipedia'], ['my_collection'])

In [5]:
wiki_col.count()

5654

In [6]:
cursor = wiki_col.find()

In [7]:
wiki_df = pd.DataFrame(list(cursor))

In [8]:
wiki_df['main_cat'].value_counts()

Business software    4117
Machine learning     1537
Name: main_cat, dtype: int64

In [9]:
wiki_df.head()

Unnamed: 0,_id,article,content,main_cat,page_id,sub_cat
0,5a15de5730b30c01325f0260,Business software,merge enterprise software date october softw...,Business software,1037763,Business software
1,5a15de5830b30c01325f0261,AccuSystems,multiple issue orphan date february notabili...,Business software,41270069,Business software
2,5a15de5830b30c01325f0262,Active policy management,active policy management business orient ent...,Business software,5211212,Business software
3,5a15de5830b30c01325f0263,Alexandria (library software),use alexandria alexandria browser base softw...,Business software,28502793,Business software
4,5a15de5930b30c01325f0264,Alteryx,infobox company name alteryx logo file alter...,Business software,44133735,Business software


In [10]:
wiki_df.drop_duplicates(subset=['page_id'], inplace=True)

## Use TIFIDF to vectorize words

In [12]:
tfidf_vectorizer = TfidfVectorizer(min_df = 20, stop_words = 'english')

article_term_matrix_sps = tfidf_vectorizer.fit_transform(wiki_df.content)

article_term_matrix_df = pd.DataFrame(article_term_matrix_sps.toarray(),
                                       index=wiki_df.index,
                                       columns=tfidf_vectorizer.get_feature_names())

In [None]:
article_term_matrix_df.head()


In [None]:
# pd.concat([wiki_df.article, wiki_df.content, article_term_matrix_df], axis=1).sample(4)

## Use SVD to reduce number of features

In [13]:
from sklearn.decomposition import TruncatedSVD

In [14]:
n_components = 500
SVD = TruncatedSVD(n_components)
component_names = ["component_"+str(i+1) for i in range(n_components)]

In [16]:
svd_matrix = SVD.fit_transform(article_term_matrix_df)

In [None]:
sum(SVD.explained_variance_ratio_)

In [17]:
svd_df = pd.DataFrame(svd_matrix,
                      index=article_term_matrix_df.index,
                      columns=component_names)
svd_df['article'] = wiki_df.article

vocabulary_expression = pd.DataFrame(SVD.components_,
                                     index=component_names,
                                     columns=tfidf_vectorizer.get_feature_names()).T

In [18]:
svd_df.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_492,component_493,component_494,component_495,component_496,component_497,component_498,component_499,component_500,article
0,0.404951,-0.051309,-0.040997,0.105726,0.005325,0.123327,0.257478,-0.055066,-0.03025,0.069374,...,-0.010739,-0.010833,-0.023893,-0.023662,0.0032,-0.016697,0.034757,0.009308,0.01991,Business software
1,0.392458,-0.058191,-0.1752,0.280272,-0.069831,-0.002597,-0.03498,-0.052762,-0.030091,-0.091638,...,-0.014407,-0.00893,0.022893,-0.022598,0.018009,0.003248,0.01368,0.011187,-0.004865,AccuSystems
2,0.182103,-0.024958,-0.010863,0.079786,0.076378,0.052743,0.067091,0.026575,-0.068131,0.055673,...,-0.016108,-0.009719,0.012636,-0.013827,0.012805,0.001512,0.005479,-0.000555,0.008461,Active policy management
3,0.222011,-0.035081,-0.032633,-0.01674,0.000543,0.002462,-0.014987,-0.016434,-0.001359,0.003528,...,-0.006281,0.015953,-0.012347,0.00061,0.000705,-0.024892,0.001886,-0.020234,0.024531,Alexandria (library software)
4,0.328022,-0.045848,-0.13605,0.251924,-0.101908,-0.008932,-0.08142,-0.045797,-0.059878,-0.074248,...,-0.011901,0.003449,-0.022862,-0.007964,0.026287,0.029398,-0.009497,-0.012136,-0.020458,Alteryx


In [None]:
for i in range(1,11):
    vocabulary_expression['abs_component_{}'.format(i)] = \
    np.abs(vocabulary_expression['component_{}'.format(i)])

In [None]:
vocabulary_expression['abs_component_1'].sort_values(ascending=False).head(7)

## Create function to search for top 5 related articles

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
def search_for_pages(search_terms):
    '''
    Takes search terms and returns the top 5 articles within the wikipedia corpus 
    that relate to that search terms based on cosine similarity.
    
    Params
    ------
    search_terms: str
    A string of words  
    
    Returns
    -------
    A Dataframe of the top 5 articles with the highest cosine similarities.
     
    '''
    
    temp_svd_df = svd_df.copy()
    
    search_terms = [search_terms]

    search_terms_encoded = tfidf_vectorizer.transform(search_terms)
    
    search_term_svd_vector = SVD.transform(search_terms_encoded)
    
    temp_svd_df['cosine_sim'] = cosine_similarity(temp_svd_df.drop('article', axis=1), search_term_svd_vector)
    
    return temp_svd_df[['article', 'cosine_sim']].sort_values('cosine_sim', ascending=False).head(5)

In [21]:
search_for_pages('There are two types of investor apps: Native investor apps and HTML5 investor apps. Most investor apps offer access to public company content such as stock quotes, corporate materials')

Unnamed: 0,article,cosine_sim
2879,Investor application,0.833884
1625,Alpha capture system,0.393927
109,Fundamental Analysis Software,0.382128
181,PandaDoc,0.371676
1643,FatKat (investment software),0.345342


In [24]:
search_for_pages('Artificial intelligence')

Unnamed: 0,article,cosine_sim
4882,AAAI Conference on Artificial Intelligence,0.821924
4884,Conference on Artificial General Intelligence,0.697031
4887,Dartmouth workshop,0.665428
4891,International Joint Conference on Artificial I...,0.624944
4334,Language Acquisition Device (computer),0.613826


## Create models to predict categories

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
le = LabelEncoder()
wiki_df['cat_numerical'] = le.fit_transform(wiki_df['main_cat'])

In [30]:
wiki_df['cat_numerical'].value_counts()

0    3052
1    1087
Name: cat_numerical, dtype: int64

In [31]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [32]:
X = wiki_df['content']
y = wiki_df['cat_numerical']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

nlp_pipe = Pipeline([
    ('vec', TfidfVectorizer()),
#     ('scaler', StandardScaler(with_mean=False)), try with and without scaler and tell Sylvia
    #add SVD to decrease dimensionality 
    #experiment with Bayes 
    ('svd', TruncatedSVD()),
    ('clf', LogisticRegression())
])

params = {
    'vec__ngram_range':[(1,2)],#don't need up to 4, too much 
    'vec__min_df':[10,30],
    'svd__n_components':[100,300,500],
    'clf__C': np.logspace(-2,4,7)
}

nlp_gs = GridSearchCV(nlp_pipe, 
                      params, 
                      cv=StratifiedShuffleSplit(5, random_state=42))

nlp_gs.fit(X_train, y_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.1,
            train_size=None),
       error_score='raise',
       estimator=Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vec__ngram_range': [(1, 2)], 'vec__min_df': [10, 30], 'svd__n_components': [100, 300, 500], 'clf__C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02,   1.00000e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [33]:
nlp_gs.best_params_

{'clf__C': 10000.0,
 'svd__n_components': 300,
 'vec__min_df': 10,
 'vec__ngram_range': (1, 2)}

In [35]:
nlp_gs.best_score_

0.99163987138263665

In [36]:
nlp_gs.score(X_test, y_test)

0.97971014492753628

In [37]:
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix, classification_report

In [38]:
def score_eval(model, X_test, y_test):
    preds = model.predict(X_test)
    return {
        'roc_auc': roc_auc_score(y_test, preds),
        'accuracy': accuracy_score(y_test, preds),
        'confmat': confusion_matrix(y_test, preds),
        'clf_rep': classification_report(y_test, preds)
    }

In [48]:
results = score_eval(nlp_gs, X_test, y_test)

print('-'*60)
for key, value in results.items():
    print(key)
    print(value)
    print('-'*60)

------------------------------------------------------------
roc_auc
0.971668714888
------------------------------------------------------------
accuracy
0.979710144928
------------------------------------------------------------
confmat
[[759   9]
 [ 12 255]]
------------------------------------------------------------
clf_rep
             precision    recall  f1-score   support

          0       0.98      0.99      0.99       768
          1       0.97      0.96      0.96       267

avg / total       0.98      0.98      0.98      1035

------------------------------------------------------------


In [57]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

nbayes = BernoulliNB()

In [60]:
X = wiki_df['content']
y = wiki_df['cat_numerical']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

nlp_pipe = Pipeline([
    ('vec', TfidfVectorizer()),
#     ('scaler', StandardScaler(with_mean=False)), try with and without scaler and tell Sylvia
    #add SVD to decrease dimensionality 
    #experiment with Bayes 
    ('svd', TruncatedSVD()),
    ('clf', BernoulliNB())
])

params = {
    'vec__min_df':[10,30],
    'svd__n_components':[100,300,500],
}

nlp_bayes_gs = GridSearchCV(nlp_pipe, 
                      params, 
                      cv=StratifiedShuffleSplit(5, random_state=42))

nlp_bayes_gs.fit(X_train, y_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.1,
            train_size=None),
       error_score='raise',
       estimator=Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...te=None, tol=0.0)), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vec__min_df': [10, 30], 'svd__n_components': [100, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [61]:
nlp_bayes_gs.score(X_test, y_test)

0.9198067632850242

In [62]:
bayes_results = score_eval(nlp_bayes_gs, X_test, y_test)

print('-'*60)
for key, value in bayes_results.items():
    print(key)
    print(value)
    print('-'*60)

------------------------------------------------------------
roc_auc
0.88243943118
------------------------------------------------------------
accuracy
0.919806763285
------------------------------------------------------------
confmat
[[737  31]
 [ 52 215]]
------------------------------------------------------------
clf_rep
             precision    recall  f1-score   support

          0       0.93      0.96      0.95       768
          1       0.87      0.81      0.84       267

avg / total       0.92      0.92      0.92      1035

------------------------------------------------------------
