In [1]:
from os import path

from matplotlib import pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics

from functions import load_bad_words, build_data_path, print_report
from constants import LABEL_COLS

In [2]:
BAD_WORDS = load_bad_words()

In [3]:
training_data_path = build_data_path('train.csv')

In [4]:
df = pd.read_csv(training_data_path)
X = df['comment_text']
y = df[LABEL_COLS]

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33)

In [6]:
clf = OneVsRestClassifier(ComplementNB(norm=True))

tfidf = TfidfVectorizer(lowercase=True, stop_words='english')
bad_word_counter = CountVectorizer(vocabulary=BAD_WORDS)

union = make_union(tfidf, bad_word_counter)

pipeline = make_pipeline(union, clf)

optimizer = pipeline

print('Would you like to perform an exhaustive search? NOTE: This will take several hours.')
autotune_hyperparameters = input('Please enter "yes" or "no".')

# Auto-tune hyperparameters
while autotune_hyperparameters.lower() not in ['yes', 'no']:
    autotune_hyperparameters = input('Please enter "yes" or "no".')
if autotune_hyperparameters == 'yes':
    parameters = {
        'featureunion__tfidfvectorizer__lowercase': [True, False],
        'onevsrestclassifier__estimator__alpha': [0.001, 0.01, 0.1, 1.0],
        'onevsrestclassifier__estimator__norm': [True, False],
        'featureunion__tfidfvectorizer__max_features': [1000, 5000, 10000, None],
        'featureunion__countvectorizer__binary': [True, False]
    }
    optimizer = GridSearchCV(pipeline, parameters, scoring='f1_weighted', verbose=3)

optimizer.fit(X_train, y_train)
y_predictions = optimizer.predict(X_valid)

# best_estimator_ = optimizer.best_estimator_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 128 candidates, totalling 384 fits
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True, score=0.6237248907084658, total=   9.6s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.5s remaining:    0.0s


[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True, score=0.6187187376895471, total=   9.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   30.7s remaining:    0.0s


[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True, score=0.6228410754453554, total=   9.4s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False, score=0.6141144947757075, total=   9.1s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__es

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True, score=0.6507395843474645, total=   8.5s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True, score=0.6550016511348316, total=   8.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__n

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True, score=0.4304431520102115, total=  39.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True, score=0.4234112100449148, total=  17.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__n

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False, score=0.6121799848038734, total=  36.6s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=True, score=0.24250513523909448, total=  38.1s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__e

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=False, score=0.5839238104492691, total=  39.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=False, score=0.5792227149804436, total=  36.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__esti

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=False, score=0.6404497816920005, total=  40.2s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=False, score=0.6377676534413531, total=  36.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=True, score=0.6096930075007116, total=   8.5s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=False, score=0.6036258603589232, total=   8.4s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__es

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True, score=0.3076089993837577, total=   9.6s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True, score=0.31834047006868, total=   9.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__es

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True, score=0.5962363473960756, total=   9.4s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True, score=0.5906676514166594, total=   9.5s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimato

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=False, score=0.6039582507847557, total=   9.2s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True, score=0.43615831753172124, total=   9.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__e

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False, score=0.6101632347167216, total=   9.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False, score=0.6169263898291388, total=   9.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=False, score=0.6133759491051857, total=   9.5s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=False, score=0.6105278087522087, total=   9.2s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estim

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True, score=0.6541663068968832, total=   8.6s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=False, score=0.6722191324193019, total=   8.6s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estima

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=True, score=0.3696674999978957, total=   8.6s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=True, score=0.38703485979046526, total=   8.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__est

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True, score=0.1943596872478183, total=   9.0s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True, score=0.19182504266037778, total=   8.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassif

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=False, score=0.6765104568418797, total=   9.2s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=False, score=0.6842578826021213, total=   8.9s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__e

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=False, score=0.6805464296736744, total=   8.8s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=False, score=0.6648187972367384, total=   9.0s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__e

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=True, score=0.6239163130439578, total=   8.8s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False, score=0.6498374004046388, total=   8.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassi

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True, score=0.6350206610762098, total=   8.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True, score=0.6369675719754194, total=   8.5s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__esti

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.01, onevsrestclassifier__estimator__norm=False, score=0.6575443016799263, total=   8.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True, score=0.49585489423387724, total=   8.9s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__e

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False, score=0.6493743317308278, total=   9.1s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.001, onevsrestclassifier__estimator__norm=False, score=0.664709816446881, total=   8.9s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=0.01, onevsrestcla

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=True, score=0.6341633692011833, total=   8.9s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=False 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier__estimator__norm=False, score=0.653738336184931, total=   9.2s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, onevsrestclassifier__estimator__alpha=1.0, onevsrestclassifier_

[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True, score=0.6718889589357558, total=   9.2s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__estimator__norm=True, score=0.6607402999191727, total=   9.2s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, onevsrestclassifier__estimator__alpha=0.1, onevsrestclassifier__esti

[Parallel(n_jobs=1)]: Done 384 out of 384 | elapsed: 132.0min finished


In [7]:
print_report(y_valid, y_predictions)

VALIDATION RESULTS:
Hamming loss (lower is better): 0.022702671907935967
Jaccard similarity (higher is better): 0.9317077802464916

               precision    recall  f1-score   support

        toxic       0.80      0.66      0.72      4993
 severe_toxic       0.33      0.65      0.44       530
      obscene       0.76      0.79      0.78      2746
       threat       0.24      0.05      0.08       159
       insult       0.67      0.67      0.67      2586
identity_hate       0.32      0.27      0.30       488

    micro avg       0.70      0.67      0.68     11502
    macro avg       0.52      0.52      0.50     11502
 weighted avg       0.71      0.67      0.68     11502
  samples avg       0.05      0.06      0.05     11502



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [8]:
test_data = build_data_path('test.csv')

data_df = pd.read_csv(test_data)

test_labels = build_data_path('test_labels.csv')
label_df = pd.read_csv(test_labels)

test_df = data_df.set_index('id').join(label_df.set_index('id'))
CONDITIONS = [f'{label} != -1' for label in LABEL_COLS]
QUERY_STRING = ' & '.join(CONDITIONS)
test_df = test_df.query(QUERY_STRING)
X_test = df['comment_text']
y_test = df[LABEL_COLS]

y_predictions = optimizer.predict(X_test)

print_report(y_test, y_predictions, data_type='TESTING')

TESTING RESULTS:
Hamming loss (lower is better): 0.018717895691155242
Jaccard similarity (higher is better): 0.9440518640605122

               precision    recall  f1-score   support

        toxic       0.85      0.74      0.79     15294
 severe_toxic       0.38      0.77      0.51      1595
      obscene       0.80      0.85      0.82      8449
       threat       0.59      0.13      0.21       478
       insult       0.71      0.74      0.72      7877
identity_hate       0.44      0.41      0.42      1405

    micro avg       0.75      0.74      0.74     35098
    macro avg       0.63      0.61      0.58     35098
 weighted avg       0.77      0.74      0.75     35098
  samples avg       0.06      0.07      0.06     35098



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
