Summary:

Main experimentation notebook. The code is modularized so that a dataloader and a vectorizer can be selected, and various models can then be tested with cross-validation. Additional results and statistics are generated once a classifier has been selected.

In [1]:
from DataLoaders import AbstractDataLoader, OriginalDataLoader, StemDataLoader, NoSWLoader, GPTCleanedLoader, MatchLoader
from Vectorizers import AbstractVectorizer, BOWVectorizer, W2VVectorizer, PreW2V, FastTextVectorizer, AdaVectorizer
from type_utils import ProcessedData, MatchedData, UnprocessedData
from sklearn.model_selection import train_test_split
from typing import List
import numpy as np

Load and vectorize data:

In [2]:
# Seed NumPy's global RNG before anything else runs.
np.random.seed(17)

# Every loader is pointed at the same data_path; the GPT loader additionally
# takes a cleaned_path.
xml_dir = '../william_data/test_xml/'
cleaned_json_dir = '../william_data/cleanedjson/'

dataloaders: List[AbstractDataLoader] = [
    OriginalDataLoader(data_path=xml_dir),
    StemDataLoader(data_path=xml_dir),
    NoSWLoader(data_path=xml_dir),  # loader without stopwords
    GPTCleanedLoader(data_path=xml_dir, cleaned_path=cleaned_json_dir),
    MatchLoader(data_path=xml_dir),
]

# Position 3 in the list above is the GPT loader.
dataloader = dataloaders[3]
# For every loader except the GPT one, annotate the result as ProcessedData instead:
# data: ProcessedData = dataloader.load_and_preprocess_data()
data: UnprocessedData = dataloader.load_and_preprocess_data()  # type: ignore # GPT loader

d63cabe5b80fe79fd3fb6b1072c7642b


In [3]:
# Print the number of entries stored under each label, 'good' first.
for label in ('good', 'bad'):
    print(len(data[label]))

328
2290


In [4]:
# Available vectorizers; only the uncommented entries are instantiated.
vectorizers: List[AbstractVectorizer] = [
    # BOWVectorizer(),
    # W2VVectorizer(),
    # PreW2V('frWac_non_lem_no_postag_no_phrase_200_skip_cut100'),
    # PreW2V('fr_w2v_web_w5', always_reload=True, strategy=0), # private vectorizer, fewest words out of cache atm
    # FastTextVectorizer("cc.fr.300", always_reload=True, strategy=0),
    AdaVectorizer(),
]

vectorizer = vectorizers[0]  # ada

# TODO: make sure the same data is used the whole time. When vectors are loaded
# from the local cache, nothing checks whether the underlying input data (or
# the preprocessing method that produced it) has changed.
X, y = vectorizer.vectorize(data)
for array in (X, y):
    print(array.shape)

loading from memory...
(2618, 1536)
(2618,)


Data Splitting:

In [5]:
# Hold out 20% of the samples as a test split, stratified on y, with a fixed seed.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=17,
)

Data Preprocessing:

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_tr)
X_tr_sc = scaler.transform(X_tr)
X_te_sc = scaler.transform(X_te)

Model Set-Up

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from CustomModels import AlwaysBad
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the name printed in the cross-validation report.
models = {
    'Bad': AlwaysBad(),  # from CustomModels; presumably a constant-prediction baseline (confirm)
    'LR': LogisticRegression(),
    'SVM': SVC(),
    # SGD shuffles the training data between epochs; pin the seed (same value
    # used elsewhere in this notebook) so re-running the CV cell gives the
    # same scores.
    'SGD-SVM': SGDClassifier(random_state=17),
    'NB': GaussianNB(),
    'xgboost': XGBClassifier(colsample_bytree=0.6, subsample=0.7, n_estimators=100, max_depth=3, learning_rate=0.1),
    'xgboost-best': XGBClassifier(colsample_bytree=0.8, learning_rate=0.2, max_depth=4, n_estimators=300, subsample=0.8),
    # NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`, yet
    # this entry passes both (0.2 and 0.05). Confirm which value actually takes
    # effect and drop the other.
    'xgboost-nooverfit': XGBClassifier(colsample_bytree=0.4, learning_rate=0.2, max_depth=2, n_estimators=50, subsample=0.5, gamma=0.7, min_child_weight=1, eta=0.05, reg_alpha=0.1, reg_lambda=0.1),
}

Model Experimentation

In [None]:
# Silence all warnings for the rest of the session, in two ways:
#   1. install a blanket 'ignore' filter, and
#   2. replace warnings.warn itself with a no-op, so callers that go through
#      warnings.warn emit nothing regardless of the active filters.
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``; accepts and discards any arguments."""
    return None


warnings.filterwarnings('ignore')
warnings.warn = warn

In [None]:
# Cross-validate every candidate model on the training split and report mean
# accuracy, precision, recall and F1. The linear / kernel models are evaluated
# a second time with standardisation. For that run the scaler is placed inside
# a Pipeline, so it is re-fit on the training part of each fold and never sees
# that fold's validation samples (scaling the whole training split up front
# would leak validation statistics into every fold).
from sklearn.pipeline import make_pipeline

CV_FOLDS = 10  # an integer cv is stratified automatically for classifiers
CV_METRICS = ['accuracy', 'precision', 'recall', 'f1']
SCALED_VARIANTS = ('SVM', 'SGD-SVM', 'LR')  # models that also get a scaled run


def _cross_validate_and_report(name, estimator):
    """Cross-validate ``estimator`` on (X_tr, y_tr) and print its mean scores.

    Args:
        name: Label printed in the report header.
        estimator: Classifier or pipeline handed to ``cross_validate``.

    Returns:
        dict mapping each metric in ``CV_METRICS`` to its mean across folds.
    """
    print(f'Cross validating {name}...')
    cv_results = cross_validate(estimator, X_tr, y_tr, cv=CV_FOLDS, scoring=CV_METRICS)
    means = {metric: np.mean(cv_results[f'test_{metric}']) for metric in CV_METRICS}
    print(f'Accuracy: {means["accuracy"]}')
    print(f'Precision: {means["precision"]}')
    print(f'Recall: {means["recall"]}')
    print(f'F1: {means["f1"]}')
    print()
    return means


best_model = None
best_score = 0
best_acc_model = None
best_acc_score = 0
for model_name, model in models.items():
    # Each model is always run unscaled; selected models get a second, scaled run.
    candidates = [(model_name, model)]
    if model_name in SCALED_VARIANTS:
        candidates.append((model_name + ' (scaled)', make_pipeline(StandardScaler(), model)))

    for candidate_name, estimator in candidates:
        means = _cross_validate_and_report(candidate_name, estimator)

        # Strict '>' means the earliest candidate wins a tie.
        if means['f1'] > best_score:
            best_model = candidate_name
            best_score = means['f1']

        if means['accuracy'] > best_acc_score:
            best_acc_model = candidate_name
            best_acc_score = means['accuracy']


print(f'Best model F1: {best_model}')
print(f'Best score F1: {best_score}')
print(f'Best model accuracy: {best_acc_model}')
print(f'Best score accuracy: {best_acc_score}')

Notes: \
As of May 19, with the original preprocessing strategies: \
xgboost best on public vectors does 0.953 F1, 0.082 test error, ~50/50 true good -> 95.4% F1 \
xgboost best on private vectors does 0.952 F1, 0.084 test error, 45/55 -> 0.953 F1 \
xgboost on our trained mini w2v does 0.937 F1, 0.12 test error, 20/80

Eval on best model:

In [None]:
# Key into `models` for the classifier to evaluate. This is set by hand; it is
# not read from the best_model / best_acc_model values computed during CV.
BEST = 'xgboost-best'
clf = models[BEST]
# Fit on the unscaled training split (X_tr, not X_tr_sc).
clf.fit(X_tr, y_tr)

Evaluation:

In [None]:
# Error on each split, computed as 1 - clf.score(...).
train_error = 1 - clf.score(X_tr, y_tr)
test_error = 1 - clf.score(X_te, y_te)
print('Train error:', train_error)
print('Test error:', test_error)

# Precision, recall and F1 on the held-out test split.
y_pred = clf.predict(X_te)
for label, metric in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(label, metric(y_te, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the test split again so this cell does not rely on y_pred from an
# earlier cell, then tabulate true labels against predictions.
y_pred = clf.predict(X_te)
cm = confusion_matrix(y_true=y_te, y_pred=y_pred)

print("Confusion Matrix:\n", cm)

In [None]:
from util import provide_confusion_matrix

# Project helper imported from util, called with the test labels and the test
# predictions. What it renders or returns is not visible from this notebook.
provide_confusion_matrix(y_te, y_pred)

In [None]:
# Recompute the test-split confusion matrix and F1, then pass the same labels
# and predictions to the project helper.
cm = confusion_matrix(y_true=y_te, y_pred=y_pred)
test_f1 = f1_score(y_te, y_pred)
print('F1:', test_f1)

provide_confusion_matrix(y_te, y_pred)

In [None]:
# Same helper on the training split, for comparison with the test-split view.
y_tr_pred = clf.predict(X_tr)
provide_confusion_matrix(y_tr, y_tr_pred)

priv vectors, no stopwords:\
avg: 51/48; 2/98\
min: 42; 1.3\
max: 38; 2