In [2]:
import math
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from tqdm import tqdm
from nltk.corpus import stopwords  # Import the stop word list
import re

from sklearn.metrics import accuracy_score 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Flatten, Dense, Activation, Dropout
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.callbacks import Callback
from keras.layers.advanced_activations import LeakyReLU
from keras.callbacks import TensorBoard  
from keras import backend as K
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier, XGBRegressor

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

%matplotlib inline
# Global plotting defaults for the notebook: white axes background and a
# 12x9 default figure size.
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams["figure.figsize"] = (12, 9)
# NOTE(review): sns.set() runs after the two rcParams assignments above, and
# its 'darkgrid' style / 'paper' context presets may overwrite them (the
# darkgrid style defines its own axes facecolor) -- TODO confirm the intended
# look and reorder if the rcParams values are supposed to win.
sns.set(context='paper', style='darkgrid', rc={'figure.facecolor':'white'}, font_scale=1.2)

# import logging       
# logging.basicConfig(filename='./my_framework.txt', filemode='a', datefmt='%H:%M:%S', level=logging.DEBUG,
#                     format='%(asctime)s %(levelname)s %(message)s')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


# Dataset preparation

In [3]:
# Read both splits first, then normalise the text column in one place:
# some comment_text values are parsed as float, so every value is passed
# through str() to guarantee a string column.
dftrain = pd.read_csv("input/train.csv")
dftest = pd.read_csv("input/test.csv")
for _frame in (dftrain, dftest):
    _frame['comment_text'] = _frame['comment_text'].apply(str)

In [4]:
# Build the English stop-word collection once, as a set, so preprocess_line
# can test each word for membership against it.
english_stops = set(stopwords.words("english"))
# Patterns are compiled once instead of on every call. Raw strings keep the
# regex escapes (\b, \d, \-, \[, \]) away from Python's own string-escape
# handling; the previous non-raw literals relied on invalid escape sequences.
_STANDALONE_NUMBER_RE = re.compile(r'\b\d+\b')
_PUNCTUATION_RE = re.compile(r'[\';:.,<>#*"\-=/?!№\[\]()«»_|\\…•+%]')


def preprocess_line(raw_line, stop_words=None):
    """Normalise one comment into a space-separated string of kept words.

    Steps, in order:
      1. replace stand-alone digit runs (e.g. ``42``, but not ``abc123``)
         with a space;
      2. replace every character of the punctuation class with a space;
      3. lower-case and split on whitespace;
      4. drop every word found in ``stop_words``.

    Parameters
    ----------
    raw_line : str
        The text to clean.
    stop_words : collection of str, optional
        Words to discard after lower-casing. When omitted (``None``) the
        notebook-level ``english_stops`` set is used, which matches the
        previous behaviour.

    Returns
    -------
    str
        The remaining words joined by single spaces (``""`` if none remain).
    """
    if stop_words is None:
        # Resolved at call time so re-running the english_stops cell is honoured.
        stop_words = english_stops
    without_numbers = _STANDALONE_NUMBER_RE.sub(' ', raw_line)
    without_punctuation = _PUNCTUATION_RE.sub(' ', without_numbers)
    words = without_punctuation.lower().split()
    return " ".join(w for w in words if w not in stop_words)

In [5]:
# TODO: check whether this cleaning step is worth doing at all (does it
# improve the scores compared with the raw comment_text?).
# Adds a cleaned copy of the text next to the raw column in both splits.
for _frame in (dftrain, dftest):
    _frame['comment_text_preprocessed'] = _frame['comment_text'].apply(preprocess_line)

In [6]:
# Preview the first rows, including the new comment_text_preprocessed column.
dftrain.head()
# To compare the cleaned and the raw text of a single row instead, run:
# dftrain['comment_text_preprocessed'][0], dftrain['comment_text'][0]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_preprocessed
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,nonsense kiss geek said true account terminated
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,please vandalize pages edit w merwin continue ...
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,points interest removed points interest sectio...
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,asking nationality racial offence wow aware bl...
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,reader going say ethereal vocal style dark lyr...


# Dummy pass

In [17]:
# Baseline inputs: the 'toxic' label as target, and TF-IDF features built
# from the cleaned comments with the vocabulary capped at 60000 terms.
y = dftrain['toxic']
vectorizer = TfidfVectorizer(max_features=60000)
X = vectorizer.fit_transform(dftrain['comment_text_preprocessed'])

In [18]:
# Fit a multinomial naive Bayes baseline with default settings.
# NOTE: X must be the TF-IDF matrix from the cell above. The grid-search
# section later rebinds X to the raw-text Series, so re-running this cell
# after that section needs the vectorizer cell to be re-run first.
clf = MultinomialNB()
clf.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# Predicted class probabilities for the same X the classifier was fitted on.
prediction = clf.predict_proba(X)

In [22]:
# Log loss on the rows the model was trained on (no held-out split), so this
# is an optimistic sanity-check figure rather than a validation score.
log_loss(y, prediction)

0.18122484797361807

# Grid search

In [7]:
# Inputs for the grid search. X is the cleaned text itself (not a feature
# matrix): the pipeline's own TfidfVectorizer step does the vectorising, so
# it is re-fitted inside every cross-validation fold.
X = dftrain['comment_text_preprocessed']
y = dftrain['toxic']

In [8]:
def gscv_summary(grid, print_all=False, name='UKNWN'):
    """Print a short report for a fitted search object.

    Always prints one headline with ``name``, ``grid.best_score_`` and
    ``grid.best_params_``. When ``print_all`` is true, also prints one line
    per candidate taken from ``grid.cv_results_``: the mean test score, twice
    the standard deviation of the test score, and the candidate's parameters.

    Parameters
    ----------
    grid : object
        Anything exposing ``best_score_``, ``best_params_`` and, if
        ``print_all`` is set, a ``cv_results_`` mapping with the keys
        ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``.
    print_all : bool, default False
        Whether to list every evaluated candidate after the headline.
    name : str, default 'UKNWN'
        Label shown in square brackets at the start of the headline.

    Returns
    -------
    None
    """
    headline = "[%s] Best score/params: %s %s" % (name, grid.best_score_, grid.best_params_)
    print(headline)
    # logging.debug(headline)
    if print_all:
        print("All params:")
        results = grid.cv_results_
        candidates = zip(results['mean_test_score'],
                         results['std_test_score'],
                         results['params'])
        for avg_score, score_std, candidate_params in candidates:
            # The spread is reported as two standard deviations.
            print("%0.3f (+/-%0.03f) for %r"
                  % (avg_score, score_std * 2, candidate_params))

In [9]:
# Text -> TF-IDF features -> multinomial naive Bayes. The step names
# ('tfidf', 'mnb') are the prefixes used by the grid-search parameter keys.
tfidf_mnb_steps = [
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB()),
]
pipeline = Pipeline(steps=tfidf_mnb_steps)

In [18]:
%%time

# Candidate settings for the 'tfidf' step of the pipeline. Single-value
# lists pin a setting; the commented entries are alternatives that are
# currently switched off.
param_grid = dict(
    tfidf__max_features=[6400, 6500, 6600],
    # tfidf__stop_words=['english', None],
    tfidf__binary=[True, False],
    tfidf__norm=['l2'],
    tfidf__smooth_idf=[True, False],
    tfidf__sublinear_tf=[True],
    tfidf__use_idf=[True],
    # mnb__alpha=[0.0, 1.0, 2.0],
    # mnb__fit_prior=[True, False],
)

# 5-fold cross-validation scored by negative log loss, using every core.
search_options = dict(cv=5, scoring='neg_log_loss', n_jobs=-1, verbose=2)
grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, **search_options)
grid.fit(X, y)
gscv_summary(grid, name='MNB')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] tfidf__binary=True, tfidf__max_features=6400, tfidf__norm=l2, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV] tfidf__binary=True, tfidf__max_features=6400, tfidf__norm=l2, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV] tfidf__binary=True, tfidf__max_features=6400, tfidf__norm=l2, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV] tfidf__binary=True, tfidf__max_features=6400, tfidf__norm=l2, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV] tfidf__binary=True, tfidf__max_features=6400, tfidf__norm=l2, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV] tfidf__binary=True, tfidf__max_features=6400, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV] tfidf__binary=True, tfidf__max_features=6400, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_t

[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   10.2s


[CV] tfidf__binary=True, tfidf__max_features=6600, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV] tfidf__binary=True, tfidf__max_features=6600, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV] tfidf__binary=True, tfidf__max_features=6600, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV]  tfidf__binary=True, tfidf__max_features=6500, tfidf__norm=l2, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, tfidf__use_idf=True, total=   5.5s
[CV] tfidf__binary=True, tfidf__max_features=6600, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV]  tfidf__binary=True, tfidf__max_features=6500, tfidf__norm=l2, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, tfidf__use_idf=True, total=   6.1s
[CV] tfidf__binary=True, tfidf__max_features=6600, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True 

[CV] tfidf__binary=False, tfidf__max_features=6600, tfidf__norm=l2, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV]  tfidf__binary=False, tfidf__max_features=6500, tfidf__norm=l2, tfidf__smooth_idf=True, tfidf__sublinear_tf=True, tfidf__use_idf=True, total=   5.4s
[CV] tfidf__binary=False, tfidf__max_features=6600, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV]  tfidf__binary=False, tfidf__max_features=6400, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True, total=   6.2s
[CV]  tfidf__binary=False, tfidf__max_features=6400, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True, total=   6.2s
[CV] tfidf__binary=False, tfidf__max_features=6600, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True, tfidf__use_idf=True 
[CV] tfidf__binary=False, tfidf__max_features=6600, tfidf__norm=l2, tfidf__smooth_idf=False, tfidf__sublinear_tf=True

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   37.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   37.5s finished


[MNB] Best score/params: -0.14088461628285465 {'tfidf__binary': False, 'tfidf__max_features': 6500, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': False, 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}
CPU times: user 6.18 s, sys: 1.39 s, total: 7.57 s
Wall time: 41.2 s
