In [26]:
import requests
import re
import pickle
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [14]:
df = pd.read_pickle('./data/cleaned.pkl')

In [15]:
df['subreddit'] = df['subreddit'].map({'marvelstudios':0, 'DC_Cinematic':1})

In [16]:
X = df[['title_combined']]
y = df['subreddit']

In [17]:
y.value_counts(normalize=True)

0    0.500251
1    0.499749
Name: subreddit, dtype: float64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    stratify=y)

In [19]:
# the default NLTK stopword list
stop_words = set(stopwords.words('english'))  

# add additional stopwords
additional_stopwords = {'like','think','just','new'}
stop_words = stop_words.union(additional_stopwords)

In [20]:
def lemmatise(text):
    
   # Split and lemmatize words
    words = text.split(" ")
    lemmatizer = WordNetLemmatizer()
    words_lem = [lemmatizer.lemmatize(i) for i in words]

    # Join the words back into one string separated by space, 
    # and return the result.
    words=[]
    for i in (words_lem):   
        if i != "":
            words.append(i)
    
    no_stop_words = [token for token in words if token not in stop_words]

    return (' '.join(no_stop_words))
 

In [21]:
X_train['title_combined']= X_train['title_combined'].apply(lambda x: lemmatise(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['title_combined']= X_train['title_combined'].apply(lambda x: lemmatise(x))


## Narrow down search range using RandomSearchCV

### Investigate Countvector or Tfidf is better fit as well

In [22]:
# Parameter Grid for RandomisedCV 
params_ran_rf = {
    'n_estimators': np.arange(10,200,10),
    'max_depth' : np.arange(1,6,1),
    'min_samples_split': np.arange(2,20,2),
    'min_samples_leaf': np.arange(2,10,2),
    'max_features': ['None',2,4,6,8,10]
}

params_ran_nb = {
    'alpha':[1,0.9,0.8,0.7,0.6]
    
}

params_ran_dt = {
    'max_depth' : np.arange(1,6,1),
    'min_samples_split': np.arange(2,20,2),
    'min_samples_leaf': np.arange(2,10,2),
    'max_features': ['None',2,4,6,8,10]
}

params_ran_sv = {
    'C': np.logspace(-2, 10, 20),
    'gamma':['scale',1,0.1,0.01,0.001,0.0001],
    'kernel':['rbf','linear','poly']}

params_ran_log = {
    'penalty': ['l1','l2','none'],
    'C': np.logspace(-2, 10, 20),
    'class_weight': ['None','balanced']
}


In [23]:
ran_result = {}

def random_gridsearch(model,parameter,name): 

  t0 = time.time()
  preprocess_step = None
  preprocessor_cvec = ColumnTransformer([("cvec", CountVectorizer(),'title_combined')])
  preprocessor_tfi = ColumnTransformer([("tfidf", TfidfVectorizer(),'title_combined')])
  pipe_ran = Pipeline([('preprocess',preprocessor_cvec),("classifier", model)])

  param1 = {}
  param1['preprocess'] = [preprocessor_cvec]
  param1['preprocess__cvec__max_features'] = np.arange(1500,5000,500)
  param1['preprocess__cvec__min_df'] = [1,2,3]
  param1['preprocess__cvec__max_df'] = [0.85,0.95,1]
  param1['preprocess__cvec__ngram_range'] = [(1,1), (1,2), (1,3)]
  for key,value in parameter.items():
    param1['classifier'+"__"+key] = value


  param2 = {}
  param2['preprocess'] = [preprocessor_tfi]
  param2['preprocess__tfidf__max_features'] = np.arange(1500,5000,500)
  param2['preprocess__tfidf__min_df'] = [1,2,3]
  param2['preprocess__tfidf__max_df'] = [0.85,0.95,1]
  param2['preprocess__tfidf__ngram_range'] = [(1,1), (1,2), (1,3)]
  for key,value in parameter.items():
    param2['classifier'+"__"+key] = value
  
  params = []
  params = [param1, param2] 
  # run RandomsearchCV
  gs_rand = RandomizedSearchCV(pipe_ran, params, cv=3, scoring='accuracy',n_iter=100, n_jobs=-1,verbose=1,random_state=42)
  gs_rand.fit(X_train, y_train)
  

  if 'tfidf' in str(gs_rand.best_params_['preprocess']):
    preprocess_step = 'tfvec'
  elif 'cvec' in str(gs_rand.best_params_['preprocess']):
    preprocess_step = 'cvec'

  model = gs_rand.best_estimator_
  pred = model.predict(X_test)

  tn, fp, fn, tp = confusion_matrix(y_test,
                                  pred).ravel()
  
  accuracy = (tp + tn) / (tp + tn + fp + fn)
  misclassification = 1 - accuracy
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  precision = tp / (tp + fp)

  runtime = time.time() - t0

  ran_result[name] = [preprocess_step, model.score(X_train, y_train), model.score(X_test, y_test),accuracy,
                           misclassification, sensitivity, specificity, precision, runtime]

  print(model)
   

In [30]:
random_gridsearch(RandomForestClassifier(),params_ran_rf,'random_forest')

Fitting 3 folds for each of 100 candidates, totalling 300 fits


 0.71634343 0.75418458 0.73645301        nan 0.68887745        nan
 0.80274566        nan 0.55190982 0.51273032 0.81413392 0.6922346
 0.5        0.74414374        nan 0.64802022 0.56463744 0.64903231
 0.50268007 0.56495799 0.64836398 0.52109073        nan        nan
        nan 0.755531          nan 0.75719965 0.57031476 0.72973737
 0.55559726        nan 0.63564106 0.60449604 0.7967303  0.76054604
 0.77762171 0.67045973        nan        nan 0.68386713 0.80675768
        nan        nan        nan        nan 0.67881644        nan
        nan 0.58941596        nan 0.63094186        nan 0.54722744
 0.55727466        nan        nan 0.64568559 0.73309923 0.50033501
 0.74683357 0.80978184        nan 0.74279867 0.7642244         nan
        nan 0.77261239 0.81949305 0.69054577        nan        nan
 0.59811911 0.5        0.69055048        nan 0.5               nan
        nan 0.53081203        nan        nan 0.57870813        nan
 0.6175311  0.50033501 0.5               nan        nan        

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cvec',
                                                  CountVectorizer(max_df=0.95,
                                                                  max_features=2000,
                                                                  min_df=3,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  'title_combined')])),
                ('classifier',
                 RandomForestClassifier(max_depth=4, max_features=6,
                                        min_samples_leaf=2,
                                        min_samples_split=18,
                                        n_estimators=120))])


In [27]:
random_gridsearch(MultinomialNB(),params_ran_nb,'multinomial_Naive_Bayes')

Fitting 3 folds for each of 100 candidates, totalling 300 fits


 0.86737637 0.87039078 0.86637236 0.86469832 0.86268726 0.54621669
 0.55458921 0.8730695  0.8569965         nan 0.87039044        nan
        nan 0.85900486 0.85833586 0.86737671 0.54085723        nan
        nan 0.85833586 0.87005644 0.86704103 0.86704237 0.86402831
 0.86737604 0.85900621 0.87206515        nan 0.85933954        nan
 0.86637135        nan        nan 0.85733083 0.85967623 0.86034523
 0.85867019 0.86134757 0.558612   0.86302362 0.85934054        nan
 0.8697211  0.55727365 0.87106012 0.86603701 0.87206515 0.87072545
 0.56363343 0.86570234 0.85900486        nan 0.85833518 0.85699515
        nan 0.85833586 0.86704271 0.85632547        nan 0.8687154
        nan 0.86402763 0.8667077         nan 0.86101491 0.86469832
 0.86871607        nan 0.86938609 0.87005644 0.85565579 0.558612
 0.52679293 0.85833586 0.85766517 0.558612          nan        nan
 0.86704237 0.87373951 0.87340518        nan 0.52779762 0.87039044
 0.87206515        nan 0.8667077         nan 0.8667077         na

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('tfidf',
                                                  TfidfVectorizer(max_df=0.95,
                                                                  max_features=4500,
                                                                  ngram_range=(1,
                                                                               2)),
                                                  'title_combined')])),
                ('classifier', MultinomialNB(alpha=0.8))])


In [28]:
random_gridsearch(DecisionTreeClassifier(),params_ran_dt,'decision_tree')

Fitting 3 folds for each of 100 candidates, totalling 300 fits


 0.50468844        nan 0.51439123 0.54655372        nan 0.50636314
 0.51037853 0.50033501 0.52445225        nan        nan        nan
 0.50033501 0.50066968 0.51272796 0.50937923        nan 0.50368408
 0.527802   0.51574304        nan 0.51271586 0.50100503 0.50033501
 0.5103819  0.50368375 0.50066968 0.52411758        nan 0.50569043
 0.51440166 0.50133936        nan        nan        nan        nan
        nan 0.51875879        nan        nan        nan        nan
 0.50033501 0.50435275        nan        nan        nan        nan
        nan        nan 0.50903816 0.50401909        nan        nan
 0.51105528        nan 0.51205391 0.50134003        nan 0.5043541
        nan        nan        nan 0.50033501 0.50971524        nan
        nan 0.50937553        nan 0.50033501 0.51507235 0.50033501
        nan        nan        nan 0.50970986 0.5110435  0.50267771
 0.53180898 0.50669882 0.51741808 0.50033501 0.5        0.50100503
        nan        nan 0.50067002 0.50535946 0.50669882        

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cvec',
                                                  CountVectorizer(max_df=0.95,
                                                                  max_features=4500,
                                                                  min_df=3),
                                                  'title_combined')])),
                ('classifier',
                 DecisionTreeClassifier(max_depth=4, max_features=4,
                                        min_samples_leaf=2,
                                        min_samples_split=14))])


In [32]:
random_gridsearch(LogisticRegression(),params_ran_log,'logistic_regre')

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('tfidf',
                                                  TfidfVectorizer(max_df=0.95,
                                                                  max_features=2000,
                                                                  min_df=2),
                                                  'title_combined')])),
                ('classifier',
                 LogisticRegression(C=3.359818286283781, class_weight='None'))])


 0.8439352         nan 0.84159116 0.85130203        nan 0.84661494
 0.57300593 0.83957505        nan 0.86469967 0.84895865        nan
 0.55459257 0.84326452 0.58003841        nan 0.8744102         nan
        nan 0.83020659 0.83690608 0.84159116 0.8352223  0.8439352
        nan 0.84962597        nan        nan        nan        nan
 0.84293119 0.83824578        nan        nan 0.83020491 0.83456271
 0.83556571 0.83020558 0.8523084  0.84326552 0.83824544 0.84226151
        nan 0.55559222 0.8620196         nan        nan        nan
        nan 0.84895966 0.83724277 0.84159048 0.58405717        nan
        nan 0.84493855 0.83523138 0.84962833        nan 0.84962799
        nan        nan        nan        nan 0.83690137 0.833222
 0.84694692 0.83154662 0.8365704  0.86637303        nan        nan
 0.83724076 0.833222   0.83321628        nan 0.84694994        nan
 0.82752249 0.82853592 0.84393352 0.83590207 0.83421996        nan
        nan 0.83523272        nan        nan 0.86335728 0.5358361

In [None]:
random_gridsearch(SVC(),params_ran_sv,'SVM')

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [33]:
ran_result

{'multinomial_Naive_Bayes': ['tfvec',
  0.955793703951775,
  0.8534136546184738,
  0.8534136546184738,
  0.14658634538152615,
  0.8172690763052208,
  0.8895582329317269,
  0.8809523809523809,
  7.637492895126343],
 'decision_tree': ['cvec',
  0.5060281312793035,
  0.5090361445783133,
  0.5090361445783133,
  0.49096385542168675,
  1.0,
  0.018072289156626505,
  0.5045592705167173,
  6.98026442527771],
 'random_forest': ['cvec',
  0.825853985264568,
  0.7901606425702812,
  0.7901606425702812,
  0.20983935742971882,
  0.642570281124498,
  0.9377510040160643,
  0.9116809116809117,
  16.61593270301819],
 'logistic_regre': ['tfvec',
  0.9671801741460148,
  0.8524096385542169,
  0.8524096385542169,
  0.14759036144578308,
  0.8514056224899599,
  0.8534136546184738,
  0.8531187122736419,
  11.620300531387329]}

In [132]:
gs_rand.best_params_

{'preprocess__tfidf__ngram_range': (1, 1),
 'preprocess__tfidf__min_df': 1,
 'preprocess__tfidf__max_features': 2500,
 'preprocess__tfidf__max_df': 0.85,
 'preprocess': ColumnTransformer(transformers=[('tfidf',
                                  TfidfVectorizer(max_df=0.85,
                                                  max_features=2500),
                                  'title_combined')]),
 'classifier__alpha': 0.8}

In [97]:
model = gs_rand.best_estimator_

In [98]:
model.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('tfidf',
                                                  TfidfVectorizer(max_df=0.85,
                                                                  max_features=2500),
                                                  'title_combined')])),
                ('classifier', MultinomialNB(alpha=0.8))])>

In [125]:
pd.DataFrame(gs_rand.cv_results_).sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocess__tfidf__ngram_range,param_preprocess__tfidf__min_df,param_preprocess__tfidf__max_features,param_preprocess__tfidf__max_df,param_preprocess,param_classifier__alpha,...,param_preprocess__cvec__min_df,param_preprocess__cvec__max_features,param_preprocess__cvec__max_df,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
9,0.048676,0.005313,0.021675,0.00411,"(1, 1)",1.0,2500.0,0.85,"ColumnTransformer(transformers=[('tfidf',\n ...",0.8,...,,,,"{'preprocess__tfidf__ngram_range': (1, 1), 'pr...",0.886546,0.865327,0.866332,0.872735,0.009775,1
37,0.137221,0.019368,0.026673,0.00094,"(1, 2)",1.0,4000.0,0.95,"ColumnTransformer(transformers=[('tfidf',\n ...",0.8,...,,,,"{'preprocess__tfidf__ngram_range': (1, 2), 'pr...",0.883534,0.869347,0.864322,0.872401,0.008135,2
13,0.068348,0.016135,0.038804,0.008193,"(1, 1)",2.0,2000.0,0.85,"ColumnTransformer(transformers=[('tfidf',\n ...",0.9,...,,,,"{'preprocess__tfidf__ngram_range': (1, 1), 'pr...",0.884538,0.866332,0.863317,0.871395,0.009374,3
0,0.122971,0.012255,0.02834,0.0105,"(1, 2)",1.0,3000.0,0.85,"ColumnTransformer(transformers=[('tfidf',\n ...",0.9,...,,,,"{'preprocess__tfidf__ngram_range': (1, 2), 'pr...",0.885542,0.864322,0.863317,0.87106,0.010249,4
5,0.129364,0.008223,0.022336,0.00047,,,,,"ColumnTransformer(transformers=[('cvec', Count...",1.0,...,1.0,4000.0,0.95,"{'preprocess__cvec__ngram_range': (1, 2), 'pre...",0.881526,0.873367,0.855276,0.870056,0.010969,5


In [25]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(model,
                      X_test,
                      y_test,
                      values_format='d',
                      display_labels=['Marvel','DC'] )

NameError: name 'model' is not defined