In [1]:
from os import path

from matplotlib import pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import ComplementNB, BernoulliNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics

from functions import load_bad_words, load_ethnic_slurs, add_indirect_features, build_data_path, print_report, run_on_test_data
from constants import LABEL_COLS

In [2]:
# Lexicons used as fixed vocabularies by the count vectorizers further down.
# Unpacking into a set literal drops duplicate entries from the loaders.
BAD_WORDS = {*load_bad_words()}
ETHNIC_SLURS = {*load_ethnic_slurs()}

In [3]:
# Resolve the location of the labelled training CSV via the project helper.
# NOTE(review): build_data_path is opaque from here — presumably it joins the
# file name onto the project's data directory; confirm in functions.py.
training_data_path = build_data_path('train.csv')

In [4]:
# Load the labelled training set. `df` is rebuilt from disk every time this
# cell runs, so re-running the cell does not stack features on top of an
# already-augmented frame.
df = pd.read_csv(training_data_path)

# NOTE(review): columns_of_interest is a str here but a list in the "yes"
# branch, and nothing in this notebook reads it afterwards — confirm whether
# it is still needed.
columns_of_interest = 'comment_text'
print('Would you like to include indirect features?')
yes_no = input('Please enter either "yes" or "no"').strip().lower()

# Re-prompt until the answer is unambiguous. Previously anything other than
# "yes" (including typos or trailing whitespace) was silently treated as "no".
while yes_no not in ('yes', 'no'):
    yes_no = input('Please enter either "yes" or "no"').strip().lower()

if yes_no == 'yes':
    # NOTE(review): the saved output of this cell shows
    # "NameError: name 'eng_stopwords' is not defined" after answering "yes".
    # That name is not used in this notebook, so the failure presumably
    # originates inside add_indirect_features — fix it in functions.py.
    df, additional_columns = add_indirect_features(df)
    columns_of_interest = ['comment_text'] + additional_columns

# The model below consumes raw comment text only; the indirect feature columns
# (if any) stay on `df` but are not part of X.
X = df['comment_text']
y = df[LABEL_COLS]

Would you like to include indirect features?
Please enter either "yes" or "no"yes


NameError: name 'eng_stopwords' is not defined

In [None]:
# Hold out 33% of the rows for validation. The split is seeded so that
# Restart & Run All reproduces the same partition, and therefore the same
# validation metrics, on every run.
RANDOM_SEED = 42
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, random_state=RANDOM_SEED
)

In [None]:
# Classifier: Complement Naive Bayes wrapped in one-vs-rest so that each label
# column in y gets its own binary model.
clf = OneVsRestClassifier(ComplementNB(alpha=0.001))

# Features: TF-IDF over the comment text, plus raw counts restricted to the
# two fixed lexicons. The slur counter looks at 1- to 5-grams
# (presumably because some lexicon entries are multi-word — TODO confirm).
tfidf = TfidfVectorizer(lowercase=True, max_features=1000)
bad_word_counter = CountVectorizer(vocabulary=BAD_WORDS, lowercase=True)
slur_counter = CountVectorizer(vocabulary=ETHNIC_SLURS, ngram_range=(1, 5), lowercase=True)
union = make_union(tfidf, bad_word_counter, slur_counter)

pipeline = make_pipeline(union, clf)

# Default: fit the pipeline as configured above. Replaced by a grid search
# below if the user opts in.
optimizer = pipeline

print('Would you like to perform an exhaustive search? NOTE: This will take several hours.')
autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()

# Normalise the answer once so validation and the branch below agree.
# Previously "Yes"/"YES" passed the case-insensitive validation but failed the
# case-sensitive `== 'yes'` test, silently skipping the search.
while autotune_hyperparameters not in ('yes', 'no'):
    autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()

# Auto-tune hyperparameters
if autotune_hyperparameters == 'yes':
    shared_grid = {
        'featureunion__tfidfvectorizer__lowercase': [True, False],
        'onevsrestclassifier__estimator__alpha': [0.001, 0.01, 0.1, 1.0],
        'onevsrestclassifier__estimator__norm': [True, False],
        'featureunion__tfidfvectorizer__max_features': [1000, 5000, 10000, None],
    }
    # make_union de-duplicates step names by suffixing "-1", "-2", ..., so the
    # two CountVectorizer steps are "countvectorizer-1" (bad words) and
    # "countvectorizer-2" (slurs); a bare "countvectorizer" key is rejected as
    # an invalid parameter. The binary flag is tied across both counters so
    # the grid keeps its intended size (one True/False axis, not two).
    parameters = [
        {
            **shared_grid,
            'featureunion__countvectorizer-1__binary': [binary],
            'featureunion__countvectorizer-2__binary': [binary],
        }
        for binary in (True, False)
    ]
    optimizer = GridSearchCV(pipeline, parameters, scoring='f1_macro', verbose=3)

optimizer.fit(X_train, y_train)
y_predictions = optimizer.predict(X_valid)

# If the grid search ran, the refitted winner is available afterwards as
# optimizer.best_estimator_ (the plain pipeline has no such attribute).

In [None]:
# Compare the validation-set predictions against the true labels.
# NOTE(review): print_report is a project helper — presumably it prints
# per-label classification metrics; confirm in functions.py.
print_report(y_valid, y_predictions)

In [None]:
# Disabled by default: uncomment to pass the fitted `optimizer` to run_on_test_data.
# run_on_test_data(optimizer)