<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/josh-updates/josh_model_linear_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.ensemble import BaggingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Mount Google Drive inside the Colab runtime; the dataset CSVs loaded below
# are read from paths under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# --- run configuration -------------------------------------------------------

# Total number of rows drawn for the grid-search sample.
# The sentinel -1 makes the sampling cell use len(train) instead.
sample_size = 5000

# On/off switch per candidate numeric column: only the entries set to True
# end up in the numeric feature list handed to the model.
features = dict(
    d_chall_score=True,
    aoa_mean=False,
    aoa_min=False,
    aoa_max=True,
    conc_rating_mean=False,
    conc_rating_min=False,
    conc_rating_max=True,
    num_lemmas=True,
)

In [4]:
# load datasets
def _read_split(csv_path):
    """Read one tab-separated split indexed by 'ix'.

    Missing values in the 'lemmatized_text' column are replaced with ''.
    """
    frame = pd.read_csv(csv_path, sep='\t', index_col='ix')
    frame['lemmatized_text'] = frame['lemmatized_text'].fillna('')
    return frame


train = _read_split('/content/drive/Shareddrives/Milestone 2/Training_set.csv')
test = _read_split('/content/drive/Shareddrives/Milestone 2/Testing_set.csv')

# column selectors used for df slicing and for wiring the ColumnTransformer
txt_features = 'lemmatized_text'
num_features = [name for name, enabled in features.items() if enabled]

In [95]:
# build pipeline object

# numeric branch: mean-impute -> standardize -> discretize into 4 bins
numeric_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
    ('binner', KBinsDiscretizer(n_bins=4)),
]
num_pipe = Pipeline(steps=numeric_steps)

# text branch: TF-IDF with default settings
text_pipe = Pipeline(steps=[('tfidf', TfidfVectorizer())])

# route the numeric columns and the text column to their own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_features),
    ('text', text_pipe, txt_features),
])

# preprocessing followed by the linear SVM that the grid search tunes
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svc', LinearSVC(dual=False)),
])

In [96]:
# Draw a class-balanced subsample of `train` for the grid search.
n = len(train) if sample_size == -1 else sample_size # to sample or not to sample

# Rows to draw from each class: half of n, capped at the size of the smallest
# class. Sampling without replacement raises ValueError when asked for more
# rows than a group holds, which the uncapped version hits as soon as
# sample_size == -1 and the classes are not perfectly balanced.
per_class = int(min(n // 2, train['label'].value_counts().min()))

# Sample each label group with the same seed and stack the pieces.
# Iterating the groups (instead of groupby().apply) keeps 'label' as an
# ordinary column and keeps the original flat 'ix' index, so the later
# sample_df['label'] lookup does not depend on how a given pandas version
# treats grouping columns inside apply().
sample_df = pd.concat(
    [grp.sample(n=per_class, random_state=99) for _, grp in train.groupby('label')]
)

In [97]:
# prep grid search
# Nested parameter names are addressed as <step>__<sub-step>__...__<param>.
tfidf_prefix = 'preprocessor__text__tfidf__'
svc_prefix = 'svc__'

# specify grid parameters
# (alternatives left disabled: C=[0.5]; penalty=['l1','l2'])
param_grid = {
    f'{tfidf_prefix}ngram_range': [(1, 2), (1, 3), (2, 3)],
    f'{svc_prefix}C': [0.1, 0.5, 1],
    f'{svc_prefix}penalty': ['l1'],
    f'{svc_prefix}random_state': [99],
}

# create the search object: 10-fold stratified CV scored on accuracy,
# no refit of a final estimator, train scores recorded, all cores used
search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
    refit=False,
    verbose=3,
    return_train_score=True,
    n_jobs=-1,
)

# fit on the balanced sample; the ColumnTransformer picks the columns it needs
grid = search.fit(sample_df, sample_df['label'])

Fitting 10 folds for each of 9 candidates, totalling 90 fits


In [13]:
# best hyper-parameter combination found by the search
print(grid.best_params_)

# full cross-validation table; show the candidates ranked 5th or better,
# best first, transposed so that each candidate is one column
cvr = pd.DataFrame(grid.cv_results_)
top_mask = cvr['rank_test_score'] <= 5
cvr.loc[top_mask].sort_values(by='rank_test_score').T

{'preprocessor__text__tfidf__max_df': 0.5, 'preprocessor__text__tfidf__min_df': 0, 'preprocessor__text__tfidf__ngram_range': (1, 3), 'svc__C': 0.5, 'svc__penalty': 'l1', 'svc__random_state': 99}


Unnamed: 0,6,2,10,3,11
mean_fit_time,1.185879,1.228118,1.186136,1.275732,1.268074
std_fit_time,0.059873,0.095539,0.05,0.067766,0.0904
mean_score_time,0.050166,0.050062,0.052659,0.053406,0.049095
std_score_time,0.003818,0.002842,0.007368,0.008322,0.00589
param_preprocessor__text__tfidf__max_df,0.5,0.1,1.0,0.1,1.0
param_preprocessor__text__tfidf__min_df,0,0,0,0,0
param_preprocessor__text__tfidf__ngram_range,"(1, 3)","(1, 3)","(1, 3)","(1, 3)","(1, 3)"
param_svc__C,0.5,0.5,0.5,1,1
param_svc__penalty,l1,l1,l1,l1,l1
param_svc__random_state,99,99,99,99,99


In [98]:
# setup final model

# numeric branch: mean-impute -> standardize -> discretize into 4 bins
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
    ('binner', KBinsDiscretizer(n_bins=4)),
])

# text branch: TF-IDF over 1- to 3-grams, ignoring terms found in more than
# half of the documents
tfidf = TfidfVectorizer(max_df=0.5, ngram_range=(1, 3))
text_pipe = Pipeline(steps=[('tfidf', tfidf)])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_features),
    ('text', text_pipe, txt_features),
])

# The linear SVM is wrapped in CalibratedClassifierCV so that the fitted
# pipeline exposes predict_proba.
base_svc = LinearSVC(penalty='l1', C=0.5, dual=False, max_iter=10000, random_state=99)
svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svc', CalibratedClassifierCV(base_svc)),
])

In [99]:
# split with balanced classes (stratified on the label, fixed seed)
feature_cols = num_features + [txt_features]
X_train, X_val, y_train, y_val = train_test_split(
    train[feature_cols],
    train['label'],
    stratify=train['label'],
    random_state=99,
)

svc = svc.fit(X_train, y_train)


def _score_split(model, X, y_true):
    """Predict with `model` on X and score the predictions against y_true.

    Returns (proba, y_hat, roc_auc, accuracy, f1, precision, recall), where
    proba is column 1 of predict_proba and y_hat is the output of predict.
    """
    proba = model.predict_proba(X)[:, 1]
    y_hat = model.predict(X)
    return (
        proba,
        y_hat,
        metrics.roc_auc_score(y_true, proba),
        metrics.accuracy_score(y_true, y_hat),
        metrics.f1_score(y_true, y_hat),
        metrics.precision_score(y_true, y_hat),
        metrics.recall_score(y_true, y_hat),
    )


# predictions and metrics for both splits
(train_y_proba, train_y_hat, train_roc_auc, train_accuracy,
 train_f1, train_precision, train_recall) = _score_split(svc, X_train, y_train)

(val_y_proba, val_y_hat, val_roc_auc, val_accuracy,
 val_f1, val_precision, val_recall) = _score_split(svc, X_val, y_val)

# display metrics: one blank line, then one line per metric, for each split
train_report = [
    ('Train ROC AUC:', train_roc_auc),
    ('Train Accuracy Score:', train_accuracy),
    ('Train F1 Score:', train_f1),
    ('Train Precision Score:', train_precision),
    ('Train Recall Score:', train_recall),
]
val_report = [
    ('Validation ROC AUC:', val_roc_auc),
    ('Validation Accuracy Score:', val_accuracy),
    ('Validation F1 Score:', val_f1),
    ('Validation Precision Score:', val_precision),
    ('Validation Recall Score:', val_recall),
]
for report in (train_report, val_report):
    print()
    for caption, value in report:
        print(caption, value)


Train ROC AUC: 0.8346854708598128
Train Accuracy Score: 0.7515631720573441
Train F1 Score: 0.7553402272615889
Train Precision Score: 0.744025985669262
Train Recall Score: 0.7670038888375432

Validation ROC AUC: 0.7935189204397641
Validation Accuracy Score: 0.7155151269555202
Validation F1 Score: 0.7228737936694265
Validation Precision Score: 0.7046540829552819
Validation Recall Score: 0.74206069912769


In [100]:
from joblib import dump

# Persist the fitted pipeline (preprocessing + calibrated SVM). The path is
# relative, so the file is written to the current working directory.
model_path = 'svc-model_final.joblib'
dump(svc, model_path)

['svc-model_final.joblib']

In [40]:
# Disabled: reload a previously saved model (v4) from the shared drive as `svc_old`.
# from joblib import load
# svc_old = load('/content/drive/Shareddrives/Milestone 2/svc-model_v4.joblib')

In [141]:
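# Disabled cell: scores `svc_old` on the same train/validation split as above.
# The output shown beneath it presumably comes from an earlier run, made
# before this code was commented out.
# # predictions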
# train_y_proba = svc_old.predict_proba(X_train)[:, 1]
# train_y_hat = svc_old.predict(X_train)

# val_y_proba = svc_old.predict_proba(X_val)[:, 1]
# val_y_hat = svc_old.predict(X_val)

# # metrics
# train_roc_auc = metrics.roc_auc_score(y_train, train_y_proba)
# train_accuracy = metrics.accuracy_score(y_train, train_y_hat)
# train_f1 = metrics.f1_score(y_train, train_y_hat)
# train_precision = metrics.precision_score(y_train, train_y_hat)
# train_recall = metrics.recall_score(y_train, train_y_hat)

# val_roc_auc = metrics.roc_auc_score(y_val, val_y_proba)
# val_accuracy = metrics.accuracy_score(y_val, val_y_hat)
# val_f1 = metrics.f1_score(y_val, val_y_hat)
# val_precision = metrics.precision_score(y_val, val_y_hat)
# val_recall = metrics.recall_score(y_val, val_y_hat)

# print()
# print('Train ROC AUC:', train_roc_auc)
# print('Train Accuracy Score:', train_accuracy)
# print('Train F1 Score:', train_f1)
# print('Train Precision Score:', train_precision)
# print('Train Recall Score:', train_recall)
# print()
# print('Validation ROC AUC:', val_roc_auc)
# print('Validation Accuracy Score:', val_accuracy)
# print('Validation F1 Score:', val_f1)
# print('Validation Precision Score:', val_precision)
# print('Validation Recall Score:', val_recall)


Train ROC AUC: 0.8352929383264309
Train Accuracy Score: 0.7526189130488876
Train F1 Score: 0.7562493651688739
Train Precision Score: 0.7453089402830514
Train Recall Score: 0.7675157650772435

Validation ROC AUC: 0.7932213084485933
Validation Accuracy Score: 0.716762820854617
Validation F1 Score: 0.7234600807963016
Validation Precision Score: 0.7067559045507252
Validation Recall Score: 0.7409729775844051
