In [None]:
from os import path

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import ComplementNB, BernoulliNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics

from functions import load_bad_words, load_ethnic_slurs, build_data_path, print_report, run_on_test_data
from constants import LABEL_COLS

In [None]:
# De-duplicated word lists; used further down as fixed vocabularies for the
# CountVectorizer feature extractors.
BAD_WORDS = {*load_bad_words()}
ETHNIC_SLURS = {*load_ethnic_slurs()}

In [None]:
# Location of the augmented training CSV, as produced by the project helper
# build_data_path (presumably resolves the name against the project's data
# directory -- confirm in functions.py).
training_data_path = build_data_path('augmented_train.csv')

In [None]:
# Load the full training table from disk.
df = pd.read_csv(training_data_path)

# X: the 'comment_text' column (model input).
# y: the columns listed in LABEL_COLS (prediction targets).
X, y = df['comment_text'], df[LABEL_COLS]

In [None]:
# Hold out 33% of the rows for validation. The shuffle is seeded so that a
# fresh "Restart & Run All" reproduces the same split (and therefore the same
# reported metrics); without random_state every run partitions differently.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Feature extraction: TF-IDF over the comment text, plus two counters whose
# vocabularies are fixed to BAD_WORDS and ETHNIC_SLURS respectively.
tfidf = TfidfVectorizer()
bad_word_counter = CountVectorizer(vocabulary=BAD_WORDS)
slur_counter = CountVectorizer(vocabulary=ETHNIC_SLURS)
union = make_union(
    tfidf,
    bad_word_counter,
    slur_counter,
)

# One Complement Naive Bayes classifier per target column (one-vs-rest).
clf = OneVsRestClassifier(ComplementNB())

# Full model: concatenated features fed into the classifier.
pipeline = make_pipeline(union, clf)

# By default the plain pipeline is what gets fitted; it is swapped for a
# GridSearchCV wrapper below if the user opts into hyperparameter tuning.
optimizer = pipeline

# Ask whether to run the exhaustive grid search. The answer is normalised
# (surrounding whitespace stripped, lower-cased) once, at the point of input,
# so that the validation loop and the branch below test the same value.
# Previously only the loop lower-cased the answer, so "Yes"/"YES" passed
# validation but then failed the == 'yes' check and the search was silently
# skipped.
print('Would you like to perform an exhaustive search? NOTE: This will take several hours.')
autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()

# Re-prompt until a recognised answer is given.
while autotune_hyperparameters not in ('yes', 'no'):
    autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()

# Auto-tune hyperparameters
if autotune_hyperparameters == 'yes':
    # Keys follow the step names that make_pipeline / make_union derive from
    # the lower-cased class names of the estimators.
    parameters = {
        'featureunion__tfidfvectorizer__lowercase': [True, False],
        'onevsrestclassifier__estimator__alpha': [0.001, 0.01, 0.1, 1.0],
        'onevsrestclassifier__estimator__norm': [True, False],
        'featureunion__tfidfvectorizer__max_features': [1000, 5000, 10000, None]
    }
    # Replace the plain pipeline with a cross-validated search over the grid,
    # ranked by ROC AUC.
    optimizer = GridSearchCV(pipeline, parameters, scoring='roc_auc', verbose=3)

# Train on the training split and predict labels for the validation split.
# fit() returns the fitted estimator itself, so the two calls are chained.
y_predictions = optimizer.fit(X_train, y_train).predict(X_valid)

# NOTE(review): best_estimator_ is only available when optimizer is the
# GridSearchCV wrapper, not the plain pipeline.
# best_estimator_ = optimizer.best_estimator_

In [None]:
# Compare the validation targets with the predictions using the project's
# print_report helper (presumably prints per-label metrics -- see functions.py).
print_report(y_valid, y_predictions)

In [None]:
# run_on_test_data(optimizer)