<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/tim-updates/all-models-cv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys

# Install the pinned xgboost release using the pip module of the interpreter
# that runs this kernel (sys.executable), so the package lands in the same
# environment the notebook imports from. -U upgrades an existing install and
# -q asks pip for quieter output.
!{sys.executable} -m pip install -U xgboost==1.6.0 -q

[K     |████████████████████████████████| 193.7 MB 39 kB/s 
[?25h

In [2]:
from google.colab import drive
import joblib
import pickle
import os

import pandas as pd
import numpy as np

from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Mount Google Drive into the Colab filesystem; the data and model paths used
# by the following cells all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# These locations point into the mounted Google Drive, so they will not resolve
# when the notebook is run from a plain GitHub checkout.
repo_path = '/content/drive/MyDrive/milestone-ii/'
train_path = f'{repo_path}Training_set.csv'
test_path = f'{repo_path}Testing_set.csv'

# Both files are tab-separated and indexed by the 'ix' column. Missing
# lemmatized text is replaced by an empty string in each frame.
df_train, df_test = (
  pd.read_csv(csv_path, sep='\t', index_col='ix')
    .assign(lemmatized_text=lambda frame: frame['lemmatized_text'].fillna(''))
  for csv_path in (train_path, test_path)
)

# Stack the two sets (training rows first) into the single frame that the
# cross-validation cell iterates over.
df = pd.concat([df_train, df_test])
df.head(2)

Unnamed: 0_level_0,original_text,lemmatized_text,d_chall_score,aoa_mean,aoa_min,aoa_max,conc_rating_mean,conc_rating_min,conc_rating_max,num_lemmas,label
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,There is manuscript evidence that Austen conti...,there be manuscript evidence that austen conti...,8.236551,5.80931,3.57,12.12,2.495517,1.33,4.57,37.0,1
1,"In a remarkable comparative analysis , Mandaea...",in a remarkable comparative analysis mandaean ...,12.320171,7.499286,2.89,11.94,2.251429,1.46,3.77,21.0,1


In [6]:
# Folder on the mounted Drive that holds the joblib-serialized models.
model_dir = '/content/drive/MyDrive/milestone-ii/Models'

# (artifact file name, display name) pairs; list order is the evaluation order.
# NOTE(review): the 'decisontrees' spelling presumably matches the file name on
# disk -- confirm before renaming it.
model_map = list({
  'dummyclassifier-clf.joblib': 'Dummy Classifier',
  'decisontrees-clf.joblib': 'Decision Tree',
  'xgboost-clf.joblib': 'XGBoost RF',
  'random-forest-clf.joblib': 'Random Forest',
  'multinomialnaivebayes-clf.joblib': 'Naive Bayes',
  'logistic-regression-clf.joblib': 'Logistic Regression',
  'svc-model_final.joblib': 'Support Vector Machines',
}.items())

In [9]:
from sklearn.model_selection import StratifiedKFold

# Maps each model's display name to a dict of per-fold score arrays.
cv_results = {}
n_splits=10

# NOTE(review): shuffle is left at its default (off), so folds are contiguous
# row ranges of `df`, which is the training set followed by the testing set.
# Consider StratifiedKFold(n_splits, shuffle=True, random_state=...) if folds
# should mix both sources; that would change the recorded scores.
skfold = StratifiedKFold(n_splits=n_splits)

# Select the target by name rather than by position so that a column appended
# after 'label' can never leak into the features.
features = df.drop(columns='label')
labels = df['label']


def _positive_class_scores(clf, X):
  """Return one continuous score per row of X for the positive class.

  Uses the second column of ``predict_proba`` when the estimator provides it.
  Estimators without ``predict_proba`` (for example scikit-learn's SVC fitted
  with ``probability=False``) fall back to ``decision_function``, whose
  unthresholded values are also valid input for ``roc_auc_score``.
  """
  if hasattr(clf, 'predict_proba'):
    return clf.predict_proba(X)[:, 1]
  return clf.decision_function(X)


def _score_split(clf, X, y):
  """Score a fitted classifier on one split.

  Returns a dict with the keys 'accuracy', 'roc_auc' and 'f1' (in that order).
  """
  # Scores are computed before the hard predictions, as in the original cell.
  y_score = _positive_class_scores(clf, X)
  y_hat = clf.predict(X)
  return {
      'accuracy': metrics.accuracy_score(y, y_hat),
      'roc_auc': metrics.roc_auc_score(y, y_score),
      'f1': metrics.f1_score(y, y_hat),
  }


for model_path, model_name in model_map:
  clf = joblib.load(os.path.join(model_dir, model_path))

  # One zero-filled array per split/metric pair, with train entries first.
  # Key names and their order match what downstream consumers of
  # `cv_results` already expect.
  scores = {
      f'{split_name}_{metric_name}_scores': np.zeros(n_splits)
      for split_name in ('train', 'test')
      for metric_name in ('accuracy', 'roc_auc', 'f1')
  }

  for i, (train_ix, test_ix) in enumerate(skfold.split(features, labels)):
    X_train, y_train = features.iloc[train_ix], labels.iloc[train_ix]
    X_test, y_test = features.iloc[test_ix], labels.iloc[test_ix]

    # The loaded estimator is re-fitted on every fold's training rows.
    clf.fit(X_train, y_train)

    fold_scores = (
        ('train', _score_split(clf, X_train, y_train)),
        ('test', _score_split(clf, X_test, y_test)),
    )
    for split_name, split_scores in fold_scores:
      for metric_name, value in split_scores.items():
        scores[f'{split_name}_{metric_name}_scores'][i] = value
    print(model_name, f'fit {i+1} complete')

  cv_results[model_name] = scores
  print(cv_results[model_name])

Dummy Classifier fit 1 complete
Dummy Classifier fit 2 complete
Dummy Classifier fit 3 complete
Dummy Classifier fit 4 complete
Dummy Classifier fit 5 complete
Dummy Classifier fit 6 complete
Dummy Classifier fit 7 complete
Dummy Classifier fit 8 complete
Dummy Classifier fit 9 complete
Dummy Classifier fit 10 complete
{'train_accuracy_scores': array([0.49929217, 0.5012277 , 0.50118505, 0.50048122, 0.499996  ,
       0.50044389, 0.5005452 , 0.49962009, 0.50049055, 0.49960276]), 'train_roc_auc_scores': array([0.49936949, 0.49930284, 0.49915354, 0.49886827, 0.5004279 ,
       0.5006705 , 0.5001613 , 0.50008398, 0.50130901, 0.49940548]), 'train_f1_scores': array([0.50005723, 0.500927  , 0.5005779 , 0.50052649, 0.50044083,
       0.50129215, 0.50056384, 0.49959741, 0.50043727, 0.49929975]), 'test_accuracy_scores': array([0.500012  , 0.50224344, 0.50008398, 0.49818845, 0.50205149,
       0.49948413, 0.50034791, 0.49773256, 0.50064785, 0.50019196]), 'test_roc_auc_scores': array([0.50142761, 

  "decreasing the number of bins." % jj


Decision Tree fit 1 complete


  "decreasing the number of bins." % jj


Decision Tree fit 2 complete


  "decreasing the number of bins." % jj


Decision Tree fit 3 complete


  "decreasing the number of bins." % jj


Decision Tree fit 4 complete


  "decreasing the number of bins." % jj


Decision Tree fit 5 complete


  "decreasing the number of bins." % jj


Decision Tree fit 6 complete


  "decreasing the number of bins." % jj


Decision Tree fit 7 complete


  "decreasing the number of bins." % jj


Decision Tree fit 8 complete


  "decreasing the number of bins." % jj


Decision Tree fit 9 complete


  "decreasing the number of bins." % jj


Decision Tree fit 10 complete
{'train_accuracy_scores': array([0.66024245, 0.66081031, 0.66026644, 0.66090895, 0.66083697,
       0.65991986, 0.65996518, 0.66043973, 0.66062726, 0.65968349]), 'train_roc_auc_scores': array([0.72435964, 0.72504025, 0.7243457 , 0.72484712, 0.72534635,
       0.72388624, 0.72413132, 0.72506295, 0.72483228, 0.72425151]), 'train_f1_scores': array([0.68689192, 0.68779953, 0.68732656, 0.6852356 , 0.68467169,
       0.68378454, 0.68639755, 0.68692142, 0.68693804, 0.68362587]), 'test_accuracy_scores': array([0.65868465, 0.65717302, 0.65861266, 0.65770089, 0.65798882,
       0.65786885, 0.66050819, 0.65827675, 0.65776466, 0.66100394]), 'test_roc_auc_scores': array([0.72161069, 0.72125324, 0.72370532, 0.72109037, 0.71999394,
       0.72162845, 0.7225952 , 0.71929865, 0.72039795, 0.72407956]), 'test_f1_scores': array([0.68702559, 0.68394974, 0.6863454 , 0.6817614 , 0.6823692 ,
       0.6822507 , 0.68593372, 0.68473015, 0.68350161, 0.6846147 ])}
XGBoost RF fit 1 com

  "decreasing the number of bins." % jj


Naive Bayes fit 1 complete


  "decreasing the number of bins." % jj


Naive Bayes fit 2 complete


  "decreasing the number of bins." % jj


Naive Bayes fit 3 complete


  "decreasing the number of bins." % jj


Naive Bayes fit 4 complete


  "decreasing the number of bins." % jj


Naive Bayes fit 5 complete


  "decreasing the number of bins." % jj


Naive Bayes fit 6 complete


  "decreasing the number of bins." % jj


Naive Bayes fit 7 complete


  "decreasing the number of bins." % jj


Naive Bayes fit 8 complete


  "decreasing the number of bins." % jj


Naive Bayes fit 9 complete


  "decreasing the number of bins." % jj


Naive Bayes fit 10 complete
{'train_accuracy_scores': array([0.75207883, 0.75249739, 0.7523321 , 0.75228411, 0.75249739,
       0.752764  , 0.75206283, 0.75272934, 0.75313523, 0.75409766]), 'train_roc_auc_scores': array([0.84055061, 0.84100023, 0.84073518, 0.84075027, 0.84121254,
       0.84099356, 0.84084697, 0.84113035, 0.84137529, 0.84242266]), 'train_f1_scores': array([0.7555781 , 0.75620798, 0.75583486, 0.75592868, 0.75604141,
       0.75617605, 0.75556105, 0.75639615, 0.75668418, 0.75767925]), 'test_accuracy_scores': array([0.7108957 , 0.71125561, 0.71190345, 0.71010389, 0.70612088,
       0.71111164, 0.7088562 , 0.70621686, 0.70608984, 0.69298877]), 'test_roc_auc_scores': array([0.78924478, 0.78763174, 0.79048437, 0.78916525, 0.78168241,
       0.78699403, 0.78691991, 0.78388422, 0.78273941, 0.76708069]), 'test_f1_scores': array([0.72064177, 0.7188055 , 0.72011003, 0.71848642, 0.71457867,
       0.71921642, 0.71654831, 0.71465859, 0.71313146, 0.70171349])}
Logistic Regression fi

In [10]:
# Persist the per-model cross-validation scores to Drive as a pickle so they
# can be read back without re-running the fits.
with open('/content/drive/MyDrive/milestone-ii/all-model-cv-results', 'wb') as results_file:
  results_file.write(pickle.dumps(cv_results))

In [11]:
# Read the saved scores back and print them as a round-trip check.
# pickle can execute arbitrary code while loading, so only open files that
# this notebook (or another trusted source) wrote.
with open('/content/drive/MyDrive/milestone-ii/all-model-cv-results', 'rb') as results_file:
  print(pickle.loads(results_file.read()))

{'Dummy Classifier': {'train_accuracy_scores': array([0.49929217, 0.5012277 , 0.50118505, 0.50048122, 0.499996  ,
       0.50044389, 0.5005452 , 0.49962009, 0.50049055, 0.49960276]), 'train_roc_auc_scores': array([0.49936949, 0.49930284, 0.49915354, 0.49886827, 0.5004279 ,
       0.5006705 , 0.5001613 , 0.50008398, 0.50130901, 0.49940548]), 'train_f1_scores': array([0.50005723, 0.500927  , 0.5005779 , 0.50052649, 0.50044083,
       0.50129215, 0.50056384, 0.49959741, 0.50043727, 0.49929975]), 'test_accuracy_scores': array([0.500012  , 0.50224344, 0.50008398, 0.49818845, 0.50205149,
       0.49948413, 0.50034791, 0.49773256, 0.50064785, 0.50019196]), 'test_roc_auc_scores': array([0.50142761, 0.500276  , 0.49886032, 0.50406701, 0.50401907,
       0.49636494, 0.49950811, 0.50274718, 0.49628083, 0.50424705]), 'test_f1_scores': array([0.5013401 , 0.50229121, 0.50139996, 0.49776668, 0.50049342,
       0.49809923, 0.49963958, 0.49628221, 0.50094722, 0.50143609])}, 'Decision Tree': {'train_acc