In [12]:
import os

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from scipy.special import logit, expit
from tqdm import tqdm


In [5]:
%%time

# Target label columns; one binary classifier is trained per label further down.
class_names = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Read both splits, replacing every missing value with a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('input/train.csv', 'input/test.csv')
)

# Raw comment text for each split, plus the train-then-test concatenation
# that the vectorizers are fitted on.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])


In [21]:
%%time

# Build TF-IDF features from the comment text.
# Both vectorizers are fitted on `all_text` (train + test comments), then
# applied to each split separately.

print("Doing word vectorizer")
# Word-level unigrams; the token pattern keeps single-character tokens too.
word_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            token_pattern=r'\w{1,}',
            ngram_range=(1, 1),
            max_features=100000)

word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)


print("Doing char vectorizer")
# Character n-grams of length 1 through 5.
char_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='char',
            ngram_range=(1, 5),
            max_features=100000)
char_vectorizer.fit(all_text)

train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

# Stack char and word features column-wise. hstack returns COO by default,
# which does not support row indexing; asking for CSR here means the matrix
# is converted once instead of on every downstream cross-validation / fit.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')


Doing word vectorizer
Doing char vectorizer
Wall time: 10min 48s


In [22]:
%%time

# Train one logistic-regression model per label, report its 3-fold CV ROC-AUC,
# then refit on the full training set and predict test probabilities.

# Fail fast: make sure the output directory exists *before* the long training
# loop, so the whole run is not lost on the final to_csv call.
os.makedirs('submissions', exist_ok=True)

# NOTE: despite its name, `losses` collects per-class CV ROC-AUC scores
# (higher is better). The name is kept so later cells that read it still work.
losses = []
predictions = {'id': test['id']}
for class_name in tqdm(class_names):
    train_target = train[class_name]
    # A fixed seed makes the SAG solver's results repeatable between runs.
    classifier = LogisticRegression(solver='sag', random_state=0)

    cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))

    # Refit on all training rows; column 1 of predict_proba is P(label == 1).
    classifier.fit(train_features, train_target)
    predictions[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(losses)))

# One column per label plus the test ids, written without the index column.
submission = pd.DataFrame.from_dict(predictions)
submission.to_csv('submissions/logistic_start.csv', index=False)

  0%|                                                                                            | 0/6 [00:00<?, ?it/s]

CV score for class toxic is 0.9788316471982434


 17%|█████████████▊                                                                     | 1/6 [02:33<12:45, 153.17s/it]

CV score for class severe_toxic is 0.9886078733914744


 33%|███████████████████████████▋                                                       | 2/6 [05:14<10:28, 157.08s/it]

CV score for class obscene is 0.990175713630126


 50%|█████████████████████████████████████████▌                                         | 3/6 [07:39<07:39, 153.27s/it]

CV score for class threat is 0.9889047468902628


 67%|███████████████████████████████████████████████████████▎                           | 4/6 [10:52<05:26, 163.23s/it]

CV score for class insult is 0.9829270756824352


 83%|█████████████████████████████████████████████████████████████████████▏             | 5/6 [13:30<02:42, 162.13s/it]

CV score for class identity_hate is 0.9827410917839409


100%|███████████████████████████████████████████████████████████████████████████████████| 6/6 [16:11<00:00, 161.97s/it]


Total CV score is 0.9853646914294139
Wall time: 16min 13s


In [23]:
# Ridge-regression baseline for a single label, scored with 3-fold CV ROC-AUC.
# NOTE(review): `class_name` is whatever the training loop in the previous
# cell left behind (its last label); set it explicitly to evaluate another one.
print(class_name)
y = train[class_name]
rdg = Ridge(alpha=1.0, fit_intercept=True)
# Score against `y`, the target selected just above, rather than the
# `train_target` variable leaked from the previous cell's loop.
cv_score = cross_val_score(rdg, train_features, y, cv=3, scoring='roc_auc')

print(np.mean(cv_score), cv_score)


identity_hate
0.950899192572 [ 0.94447788  0.94815301  0.96006669]
