In [2]:
import re
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import pymongo
import json


In [3]:
client = pymongo.MongoClient('54.201.199.246', 27016)

wiki_db = client.wikipedia

wiki_col = wiki_db.my_collection

In [4]:
client.database_names(), wiki_db.collection_names()

(['admin', 'local', 'my_database', 'test', 'wikipedia'], ['my_collection'])

In [5]:
wiki_col.count()

5654

In [6]:
cursor = wiki_col.find()

In [7]:
wiki_df = pd.DataFrame(list(cursor))

In [8]:
wiki_df.drop_duplicates(subset=['page_id'], inplace=True)

## Create models to predict categories

In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
le = LabelEncoder()
wiki_df['cat_numerical'] = le.fit_transform(wiki_df['main_cat'])

In [30]:
wiki_df['cat_numerical'].value_counts()

0    3052
1    1087
Name: cat_numerical, dtype: int64

In [31]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

In [32]:
X = wiki_df['content']
y = wiki_df['cat_numerical']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

nlp_pipe = Pipeline([
    ('vec', TfidfVectorizer()),
#     ('scaler', StandardScaler(with_mean=False)), try with and without scaler and tell Sylvia
    #add SVD to decrease dimensionality 
    #experiment with Bayes 
    ('svd', TruncatedSVD()),
    ('clf', LogisticRegression())
])

params = {
    'vec__ngram_range':[(1,2)],#don't need up to 4, too much 
    'vec__min_df':[10,30],
    'svd__n_components':[100,300,500],
    'clf__C': np.logspace(-2,4,7)
}

nlp_gs = GridSearchCV(nlp_pipe, 
                      params, 
                      cv=StratifiedShuffleSplit(5, random_state=42))

nlp_gs.fit(X_train, y_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.1,
            train_size=None),
       error_score='raise',
       estimator=Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vec__ngram_range': [(1, 2)], 'vec__min_df': [10, 30], 'svd__n_components': [100, 300, 500], 'clf__C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02,   1.00000e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [33]:
nlp_gs.best_params_

{'clf__C': 10000.0,
 'svd__n_components': 300,
 'vec__min_df': 10,
 'vec__ngram_range': (1, 2)}

In [35]:
nlp_gs.best_score_

0.99163987138263665

In [36]:
nlp_gs.score(X_test, y_test)

0.97971014492753628

In [37]:
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix, classification_report

In [38]:
def score_eval(model, X_test, y_test):
    preds = model.predict(X_test)
    return {
        'roc_auc': roc_auc_score(y_test, preds),
        'accuracy': accuracy_score(y_test, preds),
        'confmat': confusion_matrix(y_test, preds),
        'clf_rep': classification_report(y_test, preds)
    }

In [48]:
results = score_eval(nlp_gs, X_test, y_test)

print('-'*60)
for key, value in results.items():
    print(key)
    print(value)
    print('-'*60)

------------------------------------------------------------
roc_auc
0.971668714888
------------------------------------------------------------
accuracy
0.979710144928
------------------------------------------------------------
confmat
[[759   9]
 [ 12 255]]
------------------------------------------------------------
clf_rep
             precision    recall  f1-score   support

          0       0.98      0.99      0.99       768
          1       0.97      0.96      0.96       267

avg / total       0.98      0.98      0.98      1035

------------------------------------------------------------


In [57]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

nbayes = BernoulliNB()

In [60]:
X = wiki_df['content']
y = wiki_df['cat_numerical']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

nlp_pipe = Pipeline([
    ('vec', TfidfVectorizer()),
#     ('scaler', StandardScaler(with_mean=False)), try with and without scaler and tell Sylvia
    #add SVD to decrease dimensionality 
    #experiment with Bayes 
    ('svd', TruncatedSVD()),
    ('clf', BernoulliNB())
])

params = {
    'vec__min_df':[10,30],
    'svd__n_components':[100,300,500],
}

nlp_bayes_gs = GridSearchCV(nlp_pipe, 
                      params, 
                      cv=StratifiedShuffleSplit(5, random_state=42))

nlp_bayes_gs.fit(X_train, y_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.1,
            train_size=None),
       error_score='raise',
       estimator=Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...te=None, tol=0.0)), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vec__min_df': [10, 30], 'svd__n_components': [100, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [61]:
nlp_bayes_gs.score(X_test, y_test)

0.9198067632850242

In [62]:
bayes_results = score_eval(nlp_bayes_gs, X_test, y_test)

print('-'*60)
for key, value in bayes_results.items():
    print(key)
    print(value)
    print('-'*60)

------------------------------------------------------------
roc_auc
0.88243943118
------------------------------------------------------------
accuracy
0.919806763285
------------------------------------------------------------
confmat
[[737  31]
 [ 52 215]]
------------------------------------------------------------
clf_rep
             precision    recall  f1-score   support

          0       0.93      0.96      0.95       768
          1       0.87      0.81      0.84       267

avg / total       0.92      0.92      0.92      1035

------------------------------------------------------------
