In [1]:
from os import path

from matplotlib import pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils.class_weight import compute_sample_weight
from sklearn import metrics

from functions import load_bad_words, build_data_path, print_report
from constants import LABEL_COLS

In [2]:
BAD_WORDS = load_bad_words()
training_data_path = build_data_path('train.csv')

In [3]:
df = pd.read_csv(training_data_path)
X = df['comment_text']
y = df[LABEL_COLS]

In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4)

In [5]:
clf = RandomForestClassifier()

tfidf = TfidfVectorizer(lowercase=True, stop_words='english')
bad_word_counter = CountVectorizer(vocabulary=BAD_WORDS)

union = make_union(tfidf, bad_word_counter)

pipeline = make_pipeline(union, clf)

optimizer = pipeline

print('Would you like to perform an exhaustive search? NOTE: This will take several hours.')
autotune_hyperparameters = input('Please enter "yes" or "no".')


# Auto-tune hyperparameters
while autotune_hyperparameters.lower() not in ['yes', 'no']:
    autotune_hyperparameters = input('Please enter "yes" or "no".')
if autotune_hyperparameters == 'yes':
    parameters = {
        'featureunion__tfidfvectorizer__lowercase': [True, False],
        'featureunion__tfidfvectorizer__max_features': [1000, 5000, 10000, None],
        'featureunion__countvectorizer__binary': [True, False],
        'randomforestclassifier__class_weight': [None, 'balanced'],
    }
    optimizer = GridSearchCV(pipeline, parameters, scoring='f1_weighted', verbose=3)
fit_params = {
    'randomforestclassifier__sample_weights': compute_sample_weight('balanced', y_train)
}
optimizer.fit(X_train, y_train)

Would you like to perform an exhaustive search? NOTE: This will take several hours.
Please enter "yes" or "no".yes


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6460078991624626, total=  46.1s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   54.4s remaining:    0.0s


[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6506001728559206, total=  44.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s


[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6311374354019982, total=  25.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.4728377905142838, total=  34.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.44830982031858835, total=  24.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.45730706000105903, total=  25.1s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.655983981531818, total=  38.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6613924877782911, total=  35.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6479209124234612, total=  42.4s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.4838635412509964, total=  58.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.4458064191072223, total=  32.4s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.46279529706788747, total=  31.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6549282591154602, total=  44.5s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6644291711474666, total=  43.1s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6461615395587061, total=  45.9s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.46646375029933745, total=  37.0s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.45743840859483376, total=  31.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.46070940798058757, total=  37.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.641985438363726, total=  58.9s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6479862228102586, total= 1.2min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6312917748958865, total= 1.1min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.47449103360020034, total= 1.4min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.5005639107443073, total= 1.2min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.4389303502339801, total= 1.1min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6083492134235435, total=  37.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6232980662742857, total=  37.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6074622246015257, total=  29.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.44745719539816525, total=  25.6s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.4233253186587818, total=  30.5s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.4416884889487729, total=  28.0s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6386339457551139, total=  34.3s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6394152133989152, total=  30.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6229280659458416, total=  38.5s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.47610830263232956, total=  34.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.46243089427731887, total=  29.6s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.4586724082441788, total=  29.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6396913547502036, total=  33.9s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6497207763252957, total=  38.2s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6273540649845956, total=  32.7s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.46170685663453126, total=  31.5s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.4377645409265773, total=  29.6s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.4467275597611248, total=  31.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6190938927805254, total=  51.8s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6246649185476956, total= 1.1min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6045848729668517, total= 1.0min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.44522388992400086, total= 1.0min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.4613362811015859, total=  58.6s
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.4579795662782954, total=  57.3s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6429517016145532, total=  28.1s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6518655670956987, total=  26.5s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6351832662119312, total=  27.0s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.48251088904071615, total=  25.8s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.46327221233196025, total=  25.8s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.4430940743995946, total=  25.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6478433407647521, total=  31.4s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6607707173710055, total=  31.8s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6487562317025386, total=  30.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.4651175621829224, total=  29.5s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.434439329600827, total=  28.4s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.43773659901248424, total=  28.2s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.655764664320409, total=  33.0s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6571267853839469, total=  31.4s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6392609154472283, total=  36.9s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.4612988279588064, total=  33.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.4554058663443946, total=  30.1s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.453739773246354, total=  32.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6478538778628238, total=  52.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6476838499214437, total=  50.3s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6330651896518386, total=  52.0s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.47241715721768857, total=  53.0s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.4612099430883905, total=  51.1s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.4885820808058575, total=  51.3s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6114808140192334, total=  24.9s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6165532522073709, total=  25.7s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=None, score=0.6061521718927705, total=  26.3s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.44843952308595353, total=  25.1s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.42446232365700753, total=  28.0s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__class_weight=balanced, score=0.4381518617914897, total=  44.8s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6377552888722484, total=  33.6s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6446853661071985, total=  33.6s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=None, score=0.6251584364842648, total=  32.9s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.4531158039228794, total=  30.4s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.4338767717469834, total=  29.2s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__class_weight=balanced, score=0.447197250467632, total=  32.6s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6404312637046028, total=  36.0s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.640096418708613, total=  32.4s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=None, score=0.6344659235522182, total=  34.0s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.45423483960578076, total=  31.6s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.4531333673973906, total=  32.2s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=10000, randomforestclassifier__class_weight=balanced, score=0.44438288908450335, total=  35.4s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6264736934584856, total= 1.1min
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.623949005745362, total=  59.5s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=None, score=0.6095435719116686, total=  55.3s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.45629814329174206, total=  55.5s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.4624154642399889, total=  54.3s
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced 




[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__class_weight=balanced, score=0.4467570793450672, total= 1.1min


[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 75.1min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'featureunion__tfidfvectorizer__lowercase': [True, False], 'featureunion__tfidfvectorizer__max_features': [1000, 5000, 10000, None], 'featureunion__countvectorizer__binary': [True, False], 'randomforestclassifier__class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=3)

In [6]:
y_predictions = optimizer.predict(X_valid)
print(y_predictions.shape, y_valid.shape)
# best_estimator_ = optimizer.best_estimator_

(63829, 6) (63829, 6)


In [7]:
metrics.roc_auc_score(y_valid, y_predictions)

0.6874419647275994

In [8]:
test_data = build_data_path('test.csv')

data_df = pd.read_csv(test_data)

test_labels = build_data_path('test_labels.csv')
label_df = pd.read_csv(test_labels)

test_df = data_df.set_index('id').join(label_df.set_index('id'))
CONDITIONS = [f'{label} != -1' for label in LABEL_COLS]
QUERY_STRING = ' & '.join(CONDITIONS)
test_df = test_df.query(QUERY_STRING)
X_test = test_df['comment_text']
y_test = test_df[LABEL_COLS]

y_predictions = optimizer.predict(X_test)

print_report(y_test, y_predictions, data_type='TESTING')

TESTING RESULTS:

               precision    recall  f1-score   support

        toxic       0.55      0.78      0.64      6090
 severe_toxic       0.17      0.10      0.12       367
      obscene       0.56      0.72      0.63      3691
       threat       0.29      0.13      0.18       211
       insult       0.52      0.57      0.55      3427
identity_hate       0.59      0.15      0.24       712

    micro avg       0.54      0.66      0.59     14498
    macro avg       0.45      0.41      0.39     14498
 weighted avg       0.53      0.66      0.58     14498
  samples avg       0.07      0.06      0.06     14498

Class-wise AUC-ROC (Kaggle) [0.85496713 0.54635554 0.84020997 0.56345569 0.77155785 0.57455562]
Overall AUC-ROC (Kaggle) 0.6918502981032894


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
