In [1]:
from os import path

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import ComplementNB, BernoulliNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics

from functions import load_bad_words, load_ethnic_slurs, build_data_path, print_report, run_on_test_data
from constants import LABEL_COLS

import nltk
from nltk import word_tokenize
nltk.download('wordnet')
from nltk.stem import PorterStemmer, WordNetLemmatizer

[nltk_data] Downloading package wordnet to /home/quontas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
class LemmaTokenizer(object):
    """Callable that splits a document into tokens and lemmatizes each one.

    An instance holds a single ``WordNetLemmatizer`` and, when called with a
    document string, returns the list of lemmatized tokens in their original
    order.
    """

    def __init__(self):
        # One lemmatizer per tokenizer instance, reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` as a list."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [None]:
class PorterStemmerTokenizer(object):
    """Callable that splits a document into tokens and stems each one.

    An instance holds a single ``PorterStemmer`` and, when called with a
    document string, returns the list of stemmed tokens in their original
    order.
    """

    def __init__(self):
        # One stemmer per tokenizer instance, reused for every document.
        self.ps = PorterStemmer()

    def __call__(self, doc):
        """Return the stemmed tokens of ``doc`` as a list."""
        stems = []
        for token in word_tokenize(doc):
            stems.append(self.ps.stem(token))
        return stems

In [2]:
# Lexicons from the project loaders, de-duplicated into sets.
BAD_WORDS = {*load_bad_words()}
ETHNIC_SLURS = {*load_ethnic_slurs()}

In [3]:
# Location of the augmented training CSV, resolved by the project helper
# `build_data_path` (its base directory is defined outside this notebook).
training_data_path = build_data_path('augmented_train.csv')

In [4]:
# Read the training table, then split it into the raw comment text (model
# input) and the label columns named by LABEL_COLS (model targets).
df = pd.read_csv(training_data_path)
X, y = df['comment_text'], df[LABEL_COLS]

In [5]:
# Hold out 33% of the rows for validation. The seed is fixed so the split,
# and therefore every metric computed from it, is identical after a kernel
# restart; without it each run evaluates on a different random subset.
RANDOM_SEED = 42
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, random_state=RANDOM_SEED)

In [6]:
# One Complement Naive Bayes model per label column (one-vs-rest).
clf = OneVsRestClassifier(ComplementNB())

# Features: TF-IDF over 1- to 3-grams, plus raw counts restricted to the
# bad-word and ethnic-slur lexicons, concatenated side by side.
tfidf = TfidfVectorizer(strip_accents='ascii', stop_words='english', ngram_range=(1, 3), norm='l2')
bad_word_counter = CountVectorizer(vocabulary=BAD_WORDS)
slur_counter = CountVectorizer(vocabulary=ETHNIC_SLURS)
union = make_union(tfidf, bad_word_counter, slur_counter)

pipeline = make_pipeline(union, clf)

# Default: fit the plain pipeline. Replaced by a grid search below on request.
optimizer = pipeline

print('Would you like to perform an exhaustive search? NOTE: This will take several hours.')
# Normalise the answer once so validation and the decision below agree;
# previously "YES" passed validation but silently skipped the search.
autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()

# Keep asking until the answer is one of the two accepted values.
while autotune_hyperparameters not in ('yes', 'no'):
    autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()

# Auto-tune hyperparameters
if autotune_hyperparameters == 'yes':
    # Keys follow the step names generated by make_pipeline / make_union.
    parameters = {
        'featureunion__tfidfvectorizer__lowercase': [True, False],
        'featureunion__tfidfvectorizer__strip_accents': [None, 'ascii', 'unicode'],
        'featureunion__tfidfvectorizer__stop_words': [None, 'english'],
        'featureunion__tfidfvectorizer__norm': [None, 'l1', 'l2'],
        'featureunion__tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
        'featureunion__tfidfvectorizer__tokenizer': [None],
        'featureunion__tfidfvectorizer__max_features': [1000, 5000, 10000, None],
        'onevsrestclassifier__estimator__alpha': [0.001, 0.01, 0.1, 1.0],
        'onevsrestclassifier__estimator__norm': [True, False],
    }
    optimizer = GridSearchCV(pipeline, parameters, scoring='roc_auc', verbose=3)

# Fit on the training split and predict hard labels for the validation split.
optimizer.fit(X_train, y_train)
y_predictions = optimizer.predict(X_valid)

Would you like to perform an exhaustive search? NOTE: This will take several hours.
Please enter "yes" or "no".no


In [7]:
# Print the validation report via the project helper `print_report`.
# NOTE(review): `y_predictions` are hard labels from `predict()`; if the
# helper derives AUC-ROC from them rather than from probabilities, the AUC
# will understate ranking quality. Confirm in `functions.print_report`.
print_report(y_valid, y_predictions)

VALIDATION RESULTS:



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


               precision    recall  f1-score   support

        toxic       0.96      0.55      0.69     20257
 severe_toxic       0.64      0.28      0.39      2099
      obscene       0.92      0.63      0.75     11086
       threat       0.78      0.03      0.06       643
       insult       0.86      0.47      0.61     10326
identity_hate       0.86      0.09      0.16      1924

    micro avg       0.91      0.51      0.66     46335
    macro avg       0.84      0.34      0.45     46335
 weighted avg       0.91      0.51      0.64     46335
  samples avg       0.05      0.04      0.04     46335

Class-wise AUC-ROC (Kaggle) [0.77163422 0.6378582  0.81585634 0.51631542 0.73341973 0.54541109]
Overall AUC-ROC (Kaggle) 0.6700824997884114


In [8]:
# Evaluate the fitted estimator through the project helper `run_on_test_data`.
# NOTE(review): the test data source and scoring are defined inside the
# helper, not in this notebook. Confirm there.
run_on_test_data(optimizer)

TESTING RESULTS:

               precision    recall  f1-score   support

        toxic       0.73      0.51      0.60      6090
 severe_toxic       0.22      0.19      0.20       367
      obscene       0.75      0.53      0.62      3691
       threat       0.00      0.00      0.00       211
       insult       0.70      0.34      0.46      3427
identity_hate       0.42      0.09      0.15       712

    micro avg       0.70      0.44      0.54     14498
    macro avg       0.47      0.28      0.34     14498
 weighted avg       0.69      0.44      0.53     14498
  samples avg       0.05      0.04      0.04     14498

Class-wise AUC-ROC (Kaggle) [0.74552029 0.59078017 0.75730547 0.49944329 0.66788759 0.54424834]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Overall AUC-ROC (Kaggle) 0.6341975255217371


In [9]:
def print_top10(vectorizer, clf, class_labels):
    """Print the ten features with the highest coefficient for each class.

    Args:
        vectorizer: fitted transformer exposing ``get_feature_names_out()``
            (newer scikit-learn) or the legacy ``get_feature_names()``.
        clf: fitted classifier exposing ``coef_``, indexable by class
            position so that ``coef_[i]`` holds one weight per feature.
        class_labels: iterable of labels; the i-th label is paired with
            ``clf.coef_[i]``.

    Features are printed in ascending coefficient order, so the strongest
    feature of each class is listed last. Any ``<transformer>__`` prefix on a
    feature name is stripped before printing.
    """
    # Prefer the current accessor; fall back for scikit-learn releases that
    # only provide the older method name.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("%s:\n\t%s\n" % (class_label,
              "\n\t".join(feature_names[j].split('__')[-1] for j in top10)))

In [10]:
# `optimizer` is either the fitted Pipeline itself or, when the exhaustive
# search ran, a GridSearchCV whose refitted pipeline is `best_estimator_`.
fitted_pipeline = getattr(optimizer, 'best_estimator_', optimizer)
transformers = fitted_pipeline.named_steps.featureunion
classifier = fitted_pipeline.named_steps.onevsrestclassifier

# Pass the classifier taken from the fitted pipeline, not the module-level
# `clf`, which is left unfitted when GridSearchCV fits clones of it.
print_top10(transformers, classifier, LABEL_COLS)

toxic:
	violate rules tard
	pedia look
	pedia hate wikipedia
	pedia hate
	pedia group illiterate
	pedia group
	pedia greek geak
	pedia greek
	pedia fully
	zigabo

severe_toxic:
	appreciate chinese
	books weirdest
	hoax hahahahahahahaha
	ho wikipedia shove
	books working
	books working biggest
	books working largest
	appreciate china
	books weirdest okay
	zigabo

obscene:
	disrupt valuable
	inch dick asshole
	inch dick gonna
	sources article particular
	phenomenon bound racist
	phenomenon bound
	anti seMites palestinians
	inch gay
	little man arse
	zigabo

threat:
	kidnap rape family
	willing password
	willing password going
	boy boy think
	fucking face family
	fucking fag fag
	negro heil
	negro heil hitler
	fucking 40
	zigabo

insult:
	looking oit shut
	looking oit
	looking nigger cares
	looking nigger
	looking naw going
	looking naw
	looking motherfucking pin
	looking motherfucking nip
	designthesyline clearly
	zigabo

identity_hate:
	food language
	food stop climing
	food shit nigger