<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/tim-updates/all-models-cv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys

# Install xgboost into the interpreter this kernel is running on (sys.executable),
# quietly (-q) and pinned to 1.6.0.
# NOTE(review): the pin presumably matches the version the saved XGBoost model
# was built with — confirm before bumping it.
!{sys.executable} -m pip install -U xgboost==1.6.0 -q

[K     |████████████████████████████████| 193.7 MB 39 kB/s 
[?25h

In [2]:
from google.colab import drive
import joblib
import pickle
import os

import pandas as pd
import numpy as np

from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Mount Google Drive at /content/drive; the data and model paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# These paths point into a mounted Google Drive folder. If the notebook was
# downloaded from GitHub they will not work; change repo_path to the directory
# that holds the two tab-separated files.
repo_path = '/content/drive/MyDrive/milestone-ii/'
train_path = os.path.join(repo_path, 'Training_set.csv')
test_path = os.path.join(repo_path, 'Testing_set.csv')


def _read_split(path):
  """Read one tab-separated split indexed by its 'ix' column.

  Missing values in 'lemmatized_text' (read_csv parses empty fields as NaN)
  are replaced with the empty string so the column holds only strings.
  """
  split = pd.read_csv(path, sep='\t', index_col='ix')
  split['lemmatized_text'] = split['lemmatized_text'].fillna('')
  return split


df_train = _read_split(train_path)
df_test = _read_split(test_path)

# Stack both splits into one frame (train rows first, then test rows).
df = pd.concat([df_train, df_test])
df.head(2)

Unnamed: 0_level_0,original_text,lemmatized_text,d_chall_score,aoa_mean,aoa_min,aoa_max,conc_rating_mean,conc_rating_min,conc_rating_max,num_lemmas,label
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,There is manuscript evidence that Austen conti...,there be manuscript evidence that austen conti...,8.236551,5.80931,3.57,12.12,2.495517,1.33,4.57,37.0,1
1,"In a remarkable comparative analysis , Mandaea...",in a remarkable comparative analysis mandaean ...,12.320171,7.499286,2.89,11.94,2.251429,1.46,3.77,21.0,1


In [6]:
# Directory on the mounted Drive that holds the serialized models.
model_dir = '/content/drive/MyDrive/milestone-ii/Models'

# One (file name inside model_dir, human-readable model name) pair per model.
# File names are reproduced exactly as written here, including the
# 'decisontrees' spelling — presumably that is how the file is stored; verify
# against the Drive folder before "fixing" it.
model_map = [
    ('dummyclassifier-clf.joblib', 'Dummy Classifier'),
    ('decisontrees-clf.joblib', 'Decision Tree'),
    ('xgboost-clf.joblib', 'XGBoost RF'),
    ('random-forest-clf.joblib', 'Random Forest'),
    ('multinomialnaivebayes-clf.joblib', 'Naive Bayes'),
    ('logistic-regression-clf.joblib', 'Logistic Regression'),
    ('svc-model_final.joblib', 'Support Vector Machines'),
]

In [None]:
from sklearn.model_selection import StratifiedKFold

n_splits = 10
# shuffle is left at its default (off), so the fold assignment is deterministic
# and identical for every model.
skfold = StratifiedKFold(n_splits=n_splits)

# Metric names in the order their keys appear in each model's result dict.
_METRIC_NAMES = ('accuracy', 'roc_auc', 'f1')


def _score_split(clf, X, y):
  """Score an already-fitted classifier on one split.

  Returns a dict keyed by _METRIC_NAMES. ROC AUC is computed from column 1 of
  predict_proba; accuracy and F1 are computed from predict().
  """
  y_proba = clf.predict_proba(X)[:, 1]
  y_hat = clf.predict(X)
  return {
      'accuracy': metrics.accuracy_score(y, y_hat),
      'roc_auc': metrics.roc_auc_score(y, y_proba),
      'f1': metrics.f1_score(y, y_hat),
  }


# Select the target by name rather than by position so a change in column order
# cannot leak the label into the features. Done once, outside the loops.
features = df.drop(columns='label')
labels = df['label']

cv_results = {}
for model_path, model_name in model_map:
  # NOTE: joblib.load unpickles the file and can execute arbitrary code;
  # only load model files from a trusted location.
  clf = joblib.load(os.path.join(model_dir, model_path))

  # One length-n_splits array per (split, metric). 'test' keys come first to
  # keep the key order: test_accuracy_scores, test_roc_auc_scores,
  # test_f1_scores, then the three train_* keys.
  scores = {
      f'{split}_{metric}_scores': np.zeros(n_splits)
      for split in ('test', 'train')
      for metric in _METRIC_NAMES
  }

  for i, (train_ix, test_ix) in enumerate(skfold.split(features, labels)):
    X_train, y_train = features.iloc[train_ix], labels.iloc[train_ix]
    X_test, y_test = features.iloc[test_ix], labels.iloc[test_ix]

    # The loaded estimator is refit from scratch on this fold's training rows.
    clf.fit(X_train, y_train)

    # Train scores are recorded alongside test scores so the two can be compared.
    for split, X_split, y_split in (('train', X_train, y_train),
                                    ('test', X_test, y_test)):
      for metric, value in _score_split(clf, X_split, y_split).items():
        scores[f'{split}_{metric}_scores'][i] = value

    print(model_name, f'fit {i+1} complete')

  cv_results[model_name] = scores
  print(cv_results[model_name])

In [None]:
# Serialize the cv_results dict to a file on the mounted Drive.
results_path = '/content/drive/MyDrive/milestone-ii/all-model-cv-results'
with open(results_path, mode='wb') as out_file:
  pickle.dump(cv_results, out_file)

In [None]:
# Read the pickled results back from Drive and print them.
# NOTE: pickle.load can execute arbitrary code from the file; only use it on
# files written by a trusted source.
with open('/content/drive/MyDrive/milestone-ii/all-model-cv-results', mode='rb') as in_file:
  saved_results = pickle.load(in_file)
print(saved_results)