In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv("dataset/train.csv")
df.head()

X = df['comment_text']

In [3]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

vect = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(1,6),
    strip_accents='unicode',
    binary=False,
    stop_words={'english'},
    max_features=60000
)

char_features = vect.fit_transform(X)

In [4]:
char_tfidf_lr_cv_score = dict()
for class_name in class_names:
    label = df[class_name]
    clf = LogisticRegression(C=4, solver='sag')
    
    cross_val_scores = cross_val_score(clf, char_features, label, cv=3, n_jobs=-1, scoring='roc_auc')
    avg_cv_score = np.mean(cross_val_scores)
    
    char_tfidf_lr_cv_score[class_name] = {'scores': cross_val_scores, 'avg_score': avg_cv_score}

In [5]:
print('Accuracy for Char TfidfVectorizer => Logistic Regression model:')
for key in char_tfidf_lr_cv_score.keys():
    print('The average CV score for {} is {}'.format(key, char_tfidf_lr_cv_score[key]['avg_score']))

Accuracy for Char TfidfVectorizer => Logistic Regression model:
The average CV score for toxic is 0.9754256481533711
The average CV score for severe_toxic is 0.9873492063346941
The average CV score for obscene is 0.9878080656466953
The average CV score for threat is 0.9874788849473234
The average CV score for insult is 0.980369286215482
The average CV score for identity_hate is 0.9816276875363413


In [6]:
word_vect = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1,2),
    strip_accents='ascii',
    binary=False,
    stop_words={'english'},
    max_features=60000,
    min_df=5,
    max_df=0.8
)

word_features = word_vect.fit_transform(X)

In [7]:
word_tfidf_lr_cv_score = dict()
for class_name in class_names:
    label = df[class_name]
    clf = LogisticRegression(C=4, solver='sag')
    
    cross_val_scores = cross_val_score(clf, word_features, label, cv=3, n_jobs=-1, scoring='roc_auc')
    avg_cv_score = np.mean(cross_val_scores)
    
    word_tfidf_lr_cv_score[class_name] = {'scores': cross_val_scores, 'avg_score': avg_cv_score}

In [8]:
print('Accuracy for Word TfidfVectorizer => Logistic Regression model:')
for key in word_tfidf_lr_cv_score.keys():
    print('The average CV score for {} is {}'.format(key, word_tfidf_lr_cv_score[key]['avg_score']))

Accuracy for Word TfidfVectorizer => Logistic Regression model:
The average CV score for toxic is 0.9729955423180443
The average CV score for severe_toxic is 0.9849428734325918
The average CV score for obscene is 0.9846870113496786
The average CV score for threat is 0.988220900430969
The average CV score for insult is 0.9781798862665024
The average CV score for identity_hate is 0.9744560490604087


In [9]:
char_tfidf_multinomNB_cv_score = dict()
for class_name in class_names:
    label = df[class_name]
    clf = MultinomialNB()
    
    cross_val_scores = cross_val_score(clf, char_features, label, cv=3, n_jobs=-1, scoring='roc_auc')
    avg_cv_score = np.mean(cross_val_scores)
    
    char_tfidf_multinomNB_cv_score[class_name] = {'scores': cross_val_scores, 'avg_score': avg_cv_score}

In [10]:
print('Accuracy for Char TfidfVectorizer => Multinomial Naive Bayes model:')
for key in char_tfidf_multinomNB_cv_score.keys():
    print('The average CV score for {} is {}'.format(key, char_tfidf_multinomNB_cv_score[key]['avg_score']))

Accuracy for Char TfidfVectorizer => Multinomial Naive Bayes model:
The average CV score for toxic is 0.9469668327475107
The average CV score for severe_toxic is 0.9668873121325303
The average CV score for obscene is 0.9579034731671313
The average CV score for threat is 0.8751377419340619
The average CV score for insult is 0.9579588607108812
The average CV score for identity_hate is 0.9176980189378297


In [11]:
word_tfidf_multinomNB_cv_score = dict()
for class_name in class_names:
    label = df[class_name]
    clf = MultinomialNB()
    
    cross_val_scores = cross_val_score(clf, word_features, label, cv=3, n_jobs=-1, scoring='roc_auc')
    avg_cv_score = np.mean(cross_val_scores)
    
    word_tfidf_multinomNB_cv_score[class_name] = {'scores': cross_val_scores, 'avg_score': avg_cv_score}

In [12]:
print('Accuracy for Word TfidfVectorizer => Multinomial Naive Bayes model:')
for key in word_tfidf_multinomNB_cv_score.keys():
    print('The average CV score for {} is {}'.format(key, word_tfidf_multinomNB_cv_score[key]['avg_score']))

Accuracy for Word TfidfVectorizer => Multinomial Naive Bayes model:
The average CV score for toxic is 0.9451445091173203
The average CV score for severe_toxic is 0.9270879922622292
The average CV score for obscene is 0.9393252448901995
The average CV score for threat is 0.8204057527445183
The average CV score for insult is 0.936402306481306
The average CV score for identity_hate is 0.8483597897476142
