In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Label columns of the training frame; the modelling cells fit one classifier
# per entry, in this order.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [3]:
# Read both CSV files from the working directory and replace every missing
# value with a single space.
train, test = (pd.read_csv(csv_path).fillna(' ') for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Preview the first rows of the training frame (rendered as the cell's output).
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Pull the raw comment column out of each frame, then stack train on top of
# test to form the combined corpus the vectorizers are fitted on.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [6]:
# Word-level TF-IDF configuration.
word_vectorizer = TfidfVectorizer(
    # Tokenisation: single words made of one or more word characters.
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    # Normalisation: drop English stop words and fold accented characters.
    stop_words='english',
    strip_accents='unicode',
    # Weighting: sublinear (log-scaled) term frequency.
    sublinear_tf=True,
    # Vocabulary size cap.
    max_features=10000,
)


In [7]:
# Learn the word vocabulary and IDF weights from the combined train + test text.
# NOTE(review): the test comments (and every cross-validation fold) take part in
# this fit, so the CV scores computed later are not a fully held-out estimate —
# confirm this is intended.
word_vectorizer.fit(all_text)

In [8]:
# Project both text sets onto the fitted word vocabulary (train first, then test).
train_word_features, test_word_features = (
    word_vectorizer.transform(texts) for texts in (train_text, test_text)
)

In [9]:
# Character-level TF-IDF over n-grams of 2 to 6 characters, with accents folded,
# sublinear (log-scaled) term frequency and a 50,000-feature vocabulary cap.
# `stop_words` is deliberately not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it had no effect on the features
# and only produced an "unused parameter" warning.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)

In [10]:
# Learn the character n-gram vocabulary and IDF weights from the combined
# train + test text.
# NOTE(review): as with the word vectorizer, the test comments are part of this
# fit — confirm that is intended.
char_vectorizer.fit(all_text)



In [11]:
# Project both text sets onto the fitted character n-gram vocabulary
# (train first, then test).
train_char_features, test_char_features = (
    char_vectorizer.transform(texts) for texts in (train_text, test_text)
)

In [12]:
# Concatenate the character and word TF-IDF blocks column-wise (char columns
# first). CSR is requested explicitly: without a format argument hstack may
# return a COO matrix, which cannot be row-sliced and would have to be converted
# again by every cross-validation, fit and predict call that consumes it.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')


In [23]:
# For each label: estimate the mean 3-fold ROC AUC of a logistic-regression
# model, then refit it on all training rows and print its test-set probabilities.
scores = []
for class_name in class_names:
    train_target = train[class_name]
    # The 'sag' solver shuffles the data while optimising, so the seed is pinned
    # (same value the random-forest cell uses) to make the scores and
    # predictions repeatable from run to run.
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=20)

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training set; column 1 of predict_proba is the
    # probability of the second class (label 1 for 0/1 targets).
    classifier.fit(train_features, train_target)
    x = classifier.predict_proba(test_features)[:, 1]
    print(class_name,x)
# Average of the per-label mean AUCs.
print('Total CV score is {}'.format(np.mean(scores)))


CV score for class toxic is 0.9692180619098826
toxic [0.98223394 0.02755577 0.02681067 ... 0.02033657 0.0313028  0.63751008]
CV score for class severe_toxic is 0.9875921293776967
severe_toxic [0.08763965 0.005406   0.0053023  ... 0.00328207 0.00298357 0.00576103]
CV score for class obscene is 0.9838684171098375
obscene [0.95649981 0.0171574  0.01581882 ... 0.01477404 0.01481027 0.27257813]
CV score for class threat is 0.9833771690667303
threat [0.0122031  0.00253318 0.00247995 ... 0.00200114 0.00196325 0.00488419]
CV score for class insult is 0.9774236755510657
insult [0.86488065 0.01612004 0.01387111 ... 0.01178212 0.01509783 0.14477389]
CV score for class identity_hate is 0.9739429113023542
identity_hate [0.08794679 0.00681705 0.00570982 ... 0.00483715 0.00955941 0.00908184]
Total CV score is 0.9792370607195946


In [13]:
# Rebuild the test feature matrix from `test_text` with the already-fitted
# vectorizers.
test_text = test['comment_text']
# NOTE(review): presumably a scratch override for scoring ad-hoc text instead of
# the test file (the single-value prediction outputs below suggest it was
# active when they were produced) — confirm, or delete it.
# test_text = pd.Series([''])
test_word_features = word_vectorizer.transform(test_text)
test_char_features = char_vectorizer.transform(test_text)
# Char columns first, matching the training matrix layout. CSR is requested so
# the matrix does not need converting on every predict call.
test_features = hstack([test_char_features, test_word_features], format='csr')


In [25]:
# Fit one logistic-regression model per label on the full training set and
# print its probabilities for the current test features.
for class_name in class_names:
    train_target = train[class_name]
    # 'sag' shuffles the data while optimising; pin the seed so repeated runs
    # of this cell print the same probabilities.
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=20)
    classifier.fit(train_features, train_target)
    # Column 1 of predict_proba is the probability of the second class
    # (label 1 for 0/1 targets).
    x = classifier.predict_proba(test_features)[:, 1]
    print(class_name,x)


toxic [0.08114755]
severe_toxic [0.00930101]
obscene [0.0353081]
threat [0.00359846]
insult [0.03326498]
identity_hate [0.00986155]


In [26]:
# Fit one logistic-regression model per label and keep the fitted models as
# [label, model] pairs so they can be reused without refitting.
classifiers = []
for class_name in class_names:
    train_target = train[class_name]
    # 'sag' shuffles the data while optimising; pin the seed so the stored
    # models are the same on every run.
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=20)
    classifier.fit(train_features, train_target)
    classifiers.append([class_name, classifier])

In [35]:
# Score the current test features with each stored [label, model] pair and
# print the probabilities taken from column 1 of predict_proba.
for clf in classifiers:
    label, model = clf
    x = model.predict_proba(test_features)[:, 1]
    print(label, x)
    

toxic [0.4757797]
severe_toxic [0.02278432]
obscene [0.08434676]
threat [0.1871434]
insult [0.0666247]
identity_hate [0.01809599]


In [14]:
# Per-label evaluation with a random forest: report the mean 3-fold ROC AUC,
# then refit on all training rows and print the test-set probabilities taken
# from column 1 of predict_proba.
scores = []
for class_name in class_names:
    train_target = train[class_name]
    classifier = RandomForestClassifier(
        n_estimators=1000,
        max_leaf_nodes=18,
        random_state=20,
    )

    # One ROC AUC value per fold; the reported score is their mean.
    fold_aucs = cross_val_score(
        classifier,
        train_features,
        train_target,
        cv=3,
        scoring='roc_auc',
    )
    cv_score = np.mean(fold_aucs)
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    x = classifier.predict_proba(test_features)[:, 1]
    print(class_name, x)

# Average of the per-label mean AUCs.
print('Total CV score is {}'.format(np.mean(scores)))


CV score for class toxic is 0.9241190331488213
toxic [0.55072522 0.08000855 0.10163209 ... 0.072159   0.04842988 0.26369423]
CV score for class severe_toxic is 0.9818540064835566
severe_toxic [0.08541845 0.00757622 0.00671074 ... 0.00539884 0.00420882 0.01311185]
CV score for class obscene is 0.9767070985377856
obscene [0.50582021 0.03642126 0.04276366 ... 0.03414272 0.02219139 0.23223231]
CV score for class threat is 0.9657470411970183
threat [0.0073551  0.00213375 0.00218697 ... 0.00214697 0.00212645 0.0035866 ]
CV score for class insult is 0.9598064358861662
insult [0.42228612 0.03282708 0.03994883 ... 0.03425413 0.02167108 0.09030411]
CV score for class identity_hate is 0.9628909268950571
identity_hate [0.05572403 0.00624707 0.00634249 ... 0.00617684 0.00595651 0.01031566]
Total CV score is 0.9618540903580676
