In [1]:
from os import path

from matplotlib import pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

from functions import load_bad_words, build_data_path, print_report
from constants import LABEL_COLS

In [2]:
# Project helpers supply the bad-word list (used further down as a fixed
# CountVectorizer vocabulary) and the path to the training CSV.
BAD_WORDS, training_data_path = (
    load_bad_words(),
    build_data_path('train.csv'),
)

In [3]:
# Read the training CSV once, then separate the raw comment text (model input)
# from the label columns listed in LABEL_COLS (model targets).
df = pd.read_csv(training_data_path)
X, y = df['comment_text'], df[LABEL_COLS]

In [4]:
# Hold out 40% of the rows for validation.
# The split is seeded so that Restart & Run All reproduces the same partition;
# without a seed every run shuffles differently and scores are not comparable.
RANDOM_STATE = 42
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE
)

In [None]:
# Classifier: a 100-tree random forest.
# NOTE(review): warm_start=True looks redundant here, since the forest is fitted
# with a fixed n_estimators - confirm and consider dropping it.
clf = RandomForestClassifier(n_estimators=100, warm_start=True)

# Features: TF-IDF weights over the comment text (lowercased, English stop
# words removed) plus per-comment counts restricted to the bad-word vocabulary.
tfidf = TfidfVectorizer(lowercase=True, stop_words='english')
bad_word_counter = CountVectorizer(vocabulary=BAD_WORDS)

# Both feature blocks are concatenated and fed to the classifier.
union = make_union(tfidf, bad_word_counter)
pipeline = make_pipeline(union, clf)

# Default: fit the plain pipeline. Replaced by a grid search below on request.
optimizer = pipeline

print('Would you like to perform an exhaustive search? NOTE: This will take several hours.')

# Normalise the answer once (trim whitespace, lowercase) so that the validation
# loop and the branch below test exactly the same value. Previously the loop
# accepted e.g. "Yes" via .lower(), but the branch compared the raw string to
# 'yes' and therefore silently skipped the search.
autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()
while autotune_hyperparameters not in ('yes', 'no'):
    autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()

# Auto-tune hyperparameters
if autotune_hyperparameters == 'yes':
    # Keys follow the step names generated by make_union / make_pipeline
    # (lowercased class names), joined with double underscores.
    parameters = {
        'featureunion__tfidfvectorizer__lowercase': [True, False],
        'featureunion__tfidfvectorizer__max_features': [1000, 5000, 10000, None],
        'featureunion__countvectorizer__binary': [True, False],
        'randomforestclassifier__warm_start': [True]
    }
    optimizer = GridSearchCV(pipeline, parameters, scoring='f1_weighted', verbose=3)

# Fits either the plain pipeline or the full grid search on the training split.
optimizer.fit(X_train, y_train)

Would you like to perform an exhaustive search? NOTE: This will take several hours.
Please enter "yes" or "no".yes


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__warm_start=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__warm_start=True, score=0.6561136381213841, total= 2.8min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__warm_start=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.0min remaining:    0.0s


[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__warm_start=True, score=0.6351628280073291, total= 2.9min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__warm_start=True 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  6.1min remaining:    0.0s


[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__warm_start=True, score=0.6663785404025739, total= 2.8min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__warm_start=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__warm_start=True, score=0.6729463934274613, total= 3.6min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier__warm_start=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=5000, randomforestclassifier_

[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__warm_start=True, score=0.651108863450771, total= 5.5min
[CV] featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__warm_start=True 
[CV]  featureunion__countvectorizer__binary=True, featureunion__tfidfvectorizer__lowercase=False, featureunion__tfidfvectorizer__max_features=None, randomforestclassifier__warm_start=True, score=0.6594911579994146, total= 5.4min
[CV] featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassifier__warm_start=True 
[CV]  featureunion__countvectorizer__binary=False, featureunion__tfidfvectorizer__lowercase=True, featureunion__tfidfvectorizer__max_features=1000, randomforestclassif

In [None]:
# Predict labels for the held-out validation comments with whatever was fitted
# in the previous cell (the plain pipeline or the grid-search wrapper).
y_predictions = optimizer.predict(X_valid)

# Left disabled: would pull the tuned pipeline out of the search object.
# NOTE(review): presumably only valid when `optimizer` is a GridSearchCV
# (i.e. the user answered "yes") - confirm before enabling.
# best_estimator_ = optimizer.best_estimator_

In [None]:
# Report validation performance with the project's `print_report` helper
# (imported from `functions`; its exact output is not visible in this notebook).
print_report(y_valid, y_predictions)

In [None]:
# DISABLED CELL - evaluation on the labelled test set. Everything below is
# commented out and does not run. When enabled it would: load test.csv and
# test_labels.csv, join them on `id`, keep only rows where every label column
# is != -1, predict with `optimizer`, and print a report tagged 'TESTING'.
# NOTE(review): presumably -1 marks rows without usable labels - confirm
# against the dataset description before relying on this filter.

# test_data = build_data_path('test.csv')

# data_df = pd.read_csv(test_data)

# test_labels = build_data_path('test_labels.csv')
# label_df = pd.read_csv(test_labels)

# test_df = data_df.set_index('id').join(label_df.set_index('id'))
# CONDITIONS = [f'{label} != -1' for label in LABEL_COLS]
# QUERY_STRING = ' & '.join(CONDITIONS)
# test_df = test_df.query(QUERY_STRING)
# X_test = test_df['comment_text']
# y_test = test_df[LABEL_COLS]

# y_predictions = optimizer.predict(X_test)

# print_report(y_test, y_predictions, data_type='TESTING')