In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
# Read the training comments from disk and preview the first five rows.
train_path = "dataset/train.csv"
df = pd.read_csv(train_path)
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
# Character-level TF-IDF over 1- to 6-character n-grams.
# `stop_words` is deliberately not passed: scikit-learn only applies stop
# words when analyzer='word', so with analyzer='char' it has no effect
# (and a set such as {'english'} would not select the built-in English list
# anyway -- that requires the plain string 'english').
# `max_features` is left unset here; it is supplied by the parameter grid.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,        # scale term frequency as 1 + log(tf)
    analyzer='char',
    ngram_range=(1, 6),
    strip_accents='unicode',
    binary=False,
)

In [10]:
# Model inputs: the raw comment text is the feature, the 'toxic' column the target.
X, y = df['comment_text'], df['toxic']

In [11]:
# TF-IDF features fed into a logistic-regression classifier.
pl = Pipeline([
    ('tfidf', vectorizer),
    # The 'sag' solver shuffles the data while fitting, so random_state is
    # pinned to make the fit (and the resulting CV scores) repeatable.
    ('clf', LogisticRegression(C=4, solver='sag', random_state=0)),
])

# Hyper-parameter grid: only the TF-IDF vocabulary size is searched.
params = {
    'tfidf__max_features': [45000, 60000],
}

In [12]:
# 3-fold grid search over `params`, ranked by ROC AUC, using all CPU cores.
model = GridSearchCV(
    estimator=pl,
    param_grid=params,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=3,
)
# Leave fit() as the last expression so the fitted search object is displayed.
model.fit(X, y)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] tfidf__max_features=45000 .......................................
[CV] tfidf__max_features=45000 .......................................
[CV] tfidf__max_features=45000 .......................................
[CV] tfidf__max_features=60000 .......................................
[CV] tfidf__max_features=60000 .......................................
[CV] tfidf__max_features=60000 .......................................
[CV]  tfidf__max_features=45000, score=0.9748924999968186, total= 4.0min
[CV]  tfidf__max_features=60000, score=0.9759851852029502, total= 4.0min
[CV]  tfidf__max_features=45000, score=0.9743378263944023, total= 4.0min


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  6.1min remaining:  6.1min


[CV]  tfidf__max_features=45000, score=0.9752543287774861, total= 4.1min
[CV]  tfidf__max_features=60000, score=0.9749902717679175, total= 4.1min
[CV]  tfidf__max_features=60000, score=0.9752530045965581, total= 4.3min


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  6.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  6.4min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=True,
 ... penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'tfidf__max_features': [45000, 60000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [13]:
# Label columns of the training frame that are scored individually below.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Same character n-gram TF-IDF configuration as the grid-searched pipeline.
# `stop_words` is omitted because scikit-learn ignores it unless
# analyzer='word'. The vocabulary size is taken from the grid search result
# instead of being copied by hand, so a re-run stays consistent with `model`.
vect = TfidfVectorizer(
    sublinear_tf=True,        # scale term frequency as 1 + log(tf)
    analyzer='char',
    ngram_range=(1, 6),
    strip_accents='unicode',
    binary=False,
    max_features=model.best_params_['tfidf__max_features'],
)

# NOTE: the vectorizer is fitted on the full comment set once, so any
# cross-validation run on `features` sees vocabulary/IDF statistics computed
# from its own validation folds.
features = vect.fit_transform(X)

In [19]:
# Cross-validate one logistic-regression classifier per label column.
# cv_score maps label name -> {'scores': per-fold ROC AUC, 'avg_score': their mean}.
cv_score = {}
for class_name in class_names:
    label = df[class_name]
    # The 'sag' solver shuffles the data while fitting; a fixed random_state
    # keeps the per-fold scores repeatable between runs.
    clf = LogisticRegression(C=4, solver='sag', random_state=0)

    cross_val_scores = cross_val_score(clf, features, label, cv=3, n_jobs=-1, scoring='roc_auc')

    cv_score[class_name] = {
        'scores': cross_val_scores,
        'avg_score': np.mean(cross_val_scores),
    }

In [20]:
# Report the mean cross-validation score for each label, one line per label.
for label_name, result in cv_score.items():
    print('The average CV score for {} is {}'.format(label_name, result['avg_score']))

{'toxic': {'scores': array([0.97584863, 0.97505944, 0.97536933]), 'avg_score': 0.9754257977064623}, 'severe_toxic': {'scores': array([0.98928075, 0.98518122, 0.98758576]), 'avg_score': 0.987349242075651}, 'obscene': {'scores': array([0.98760519, 0.98812706, 0.98769189]), 'avg_score': 0.9878080491919244}, 'threat': {'scores': array([0.99134392, 0.9822508 , 0.98884206]), 'avg_score': 0.9874789244795981}, 'insult': {'scores': array([0.98146974, 0.98005235, 0.9795856 ]), 'avg_score': 0.9803692309858038}, 'identity_hate': {'scores': array([0.98184024, 0.97936732, 0.98367542]), 'avg_score': 0.981627660517233}}
