In [1]:
from os import path

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import ComplementNB, BernoulliNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics

from functions import load_bad_words, load_ethnic_slurs, build_data_path, print_report, run_on_test_data
from constants import LABEL_COLS

import nltk
from nltk import word_tokenize
nltk.download('wordnet')
from nltk.stem import PorterStemmer, WordNetLemmatizer

[nltk_data] Downloading package wordnet to /home/quontas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the project word lists and de-duplicate them into sets; both sets are
# later handed to CountVectorizer as its fixed `vocabulary`.
BAD_WORDS = set(load_bad_words())
ETHNIC_SLURS = set(load_ethnic_slurs())

In [3]:
# Location of the augmented training CSV as returned by the project helper
# (presumably resolved against the project's data directory — confirm in functions.py).
training_data_path = build_data_path('augmented_train.csv')

In [4]:
# Read the augmented training set, then separate the comment text (model
# input) from the label columns (targets).
df = pd.read_csv(training_data_path)
X, y = df['comment_text'], df[LABEL_COLS]

In [5]:
# Hold out a third of the data for validation. The split is seeded so that the
# metrics reported below are reproducible on Restart & Run All.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
# One Complement Naive Bayes model per label column (one-vs-rest).
clf = OneVsRestClassifier(ComplementNB())

# Features: TF-IDF over the comment text, plus raw term counts restricted to
# the bad-word and ethnic-slur vocabularies.
tfidf = TfidfVectorizer(strip_accents='ascii', stop_words='english', ngram_range=(1, 1), norm='l2')
bad_word_counter = CountVectorizer(vocabulary=BAD_WORDS)
slur_counter = CountVectorizer(vocabulary=ETHNIC_SLURS)
union = make_union(tfidf, bad_word_counter, slur_counter)

pipeline = make_pipeline(union, clf)

# By default the pipeline is fitted as configured above; it is replaced by a
# grid search below only when the user asks for one.
optimizer = pipeline

print('Would you like to perform an exhaustive search? NOTE: This will take several hours.')
# Normalise the answer once, so the validation loop and the branch below agree:
# previously "YES"/"Yes" passed validation but silently skipped the search.
autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()
while autotune_hyperparameters not in ('yes', 'no'):
    autotune_hyperparameters = input('Please enter "yes" or "no".').strip().lower()

# Auto-tune hyperparameters
if autotune_hyperparameters == 'yes':
    # Keys follow the step names generated by make_pipeline / make_union.
    parameters = {
        'featureunion__tfidfvectorizer__lowercase': [True, False],
        'featureunion__tfidfvectorizer__strip_accents': [None, 'ascii', 'unicode'],
        'featureunion__tfidfvectorizer__stop_words': [None, 'english'],
        'featureunion__tfidfvectorizer__norm': [None, 'l1', 'l2'],
        'featureunion__tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
        'featureunion__tfidfvectorizer__max_features': [1000, 5000, 10000, None],
        'onevsrestclassifier__estimator__alpha': [0.001, 0.01, 0.1, 1.0],
        'onevsrestclassifier__estimator__norm': [True, False],
    }
    optimizer = GridSearchCV(pipeline, parameters, scoring='roc_auc', verbose=3)

optimizer.fit(X_train, y_train)
# Hard label predictions for the validation split (not probabilities).
y_predictions = optimizer.predict(X_valid)

Would you like to perform an exhaustive search? NOTE: This will take several hours.
Please enter "yes" or "no".no


In [7]:
# Report metrics for the held-out validation split.
# NOTE(review): y_predictions comes from predict(), i.e. hard class labels. If
# print_report derives its AUC-ROC from these rather than from predict_proba
# scores, the reported AUC understates ranking quality — confirm in functions.py.
print_report(y_valid, y_predictions)

VALIDATION RESULTS:



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


               precision    recall  f1-score   support

        toxic       0.80      0.67      0.73     20216
 severe_toxic       0.27      0.75      0.40      2113
      obscene       0.71      0.79      0.74     11118
       threat       0.04      0.13      0.07       651
       insult       0.61      0.69      0.65     10351
identity_hate       0.19      0.32      0.24      1867

    micro avg       0.61      0.68      0.64     46316
    macro avg       0.44      0.56      0.47     46316
 weighted avg       0.68      0.68      0.67     46316
  samples avg       0.05      0.06      0.05     46316

Class-wise AUC-ROC (Kaggle) [0.82422703 0.8625404  0.88341197 0.56086954 0.83522456 0.65600357]
Overall AUC-ROC (Kaggle) 0.770379510275617


In [8]:
# Evaluate the fitted estimator on the test set via the project helper
# (loading and scoring of the test data happen inside run_on_test_data and are
# not visible in this notebook).
run_on_test_data(optimizer)

TESTING RESULTS:

               precision    recall  f1-score   support

        toxic       0.50      0.75      0.60      6090
 severe_toxic       0.10      0.77      0.18       367
      obscene       0.44      0.79      0.57      3691
       threat       0.01      0.07      0.02       211
       insult       0.39      0.68      0.50      3427
identity_hate       0.15      0.41      0.22       712

    micro avg       0.38      0.72      0.49     14498
    macro avg       0.27      0.58      0.35     14498
 weighted avg       0.42      0.72      0.53     14498
  samples avg       0.06      0.07      0.06     14498

Class-wise AUC-ROC (Kaggle) [0.83572849 0.86679081 0.86317551 0.52361712 0.81060037 0.69029651]
Overall AUC-ROC (Kaggle) 0.7650348029135587


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [9]:
def _per_class_weights(clf):
    """Return per-class feature weights as a (n_classes, n_features) array-like."""
    if hasattr(clf, 'coef_'):
        return clf.coef_
    # scikit-learn >= 1.1 removed `coef_` from OneVsRestClassifier and from the
    # naive Bayes estimators. Rebuild the same matrix from the fitted binary
    # sub-estimators: the last row of `feature_log_prob_` is the positive class,
    # which is what the old binary `coef_` exposed.
    return np.vstack([np.asarray(est.feature_log_prob_)[-1] for est in clf.estimators_])


def print_top10(vectorizer, clf, class_labels):
    """Prints features with the highest coefficient values, per class.

    Args:
        vectorizer: fitted transformer exposing ``get_feature_names_out()``
            (or the legacy ``get_feature_names()`` on older scikit-learn).
        clf: fitted classifier exposing ``coef_``, or a one-vs-rest wrapper
            whose ``estimators_`` expose ``feature_log_prob_``.
        class_labels: iterable of label names; the i-th label is matched with
            the i-th row of the weight matrix.

    For each label, the ten highest-weighted feature names are printed in
    ascending order of weight (strongest last). Any ``<transformer>__`` prefix
    added by a FeatureUnion is stripped from the printed names.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    weights = _per_class_weights(clf)
    for i, class_label in enumerate(class_labels):
        # argsort is ascending, so the final ten indices are the largest weights
        top10 = np.argsort(weights[i])[-10:]
        print("%s:\n\t%s\n" % (class_label,
              "\n\t".join(feature_names[j].split('__')[-1] for j in top10)))

In [10]:
# When the grid search ran, `optimizer` is a GridSearchCV, which has no
# `named_steps`; the refit pipeline lives on `best_estimator_`. Otherwise
# `optimizer` is already the fitted pipeline.
fitted_pipeline = getattr(optimizer, 'best_estimator_', optimizer)
transformers = fitted_pipeline.named_steps.featureunion
classifier = fitted_pipeline.named_steps.onevsrestclassifier

# Use the classifier taken from the fitted pipeline: the module-level `clf` is
# left unfitted when GridSearchCV clones the pipeline.
print_top10(transformers, classifier, LABEL_COLS)

toxic:
	2123145146
	kundad
	kunstruktive
	kunt
	kupla
	kurang
	yammer
	follarte
	fuckyourself
	crackhead

severe_toxic:
	stomes
	stikin
	caspa
	anastal1111you
	ancest
	ancestryearly
	ancestryerigate
	ada_at
	cartuchos
	homelan

obscene:
	achivements
	achmed
	achsehole
	kcik
	sexmist
	britch
	britbarb
	katzrin
	zigabo
	follarte

threat:
	m45terbate
	ma5terb8
	ma5terbate
	master-bate
	masterb8
	masterbat*
	masterbat3
	teeeccccctooooniiiiiicccccc
	hawkinghttp
	zigabo

insult:
	faggots129
	islantic
	snigbrook
	furfag
	fortuijn
	66185192207
	libtard
	onanizing
	crackhead
	suberbia

identity_hate:
	gomnna
	closerlookonsyria
	nawmean
	goddammed
	clubz
	goains
	nebracka
	negrate
	uos
	zigabo

