In [1]:
import numpy as np
import pandas as pd

In [2]:
import pickle

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [4]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [5]:
train = pd.read_csv('train.csv').fillna(' ')
test = pd.read_csv('test.csv').fillna(' ')

In [6]:
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [7]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(3, 6),
    max_features=50000)

In [7]:
char_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(3, 6), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [8]:
import pickle
f = open('char_vectorizer.pkl', 'wb')
pickle.dump(char_vectorizer, f)
f.close()

In [21]:
feature_labels

{'identity_hate': array([   25,  1483,  2122,  2321,  2322,  2359,  2360,  2362,  2658,
         3087,  3297,  3302,  3927,  3933,  6155,  6159,  6160,  6161,
         6202,  6301,  7878,  8955,  9457, 11040, 13319, 13343, 20252,
        20561, 21082, 21242, 21531, 21665, 21735, 21760, 21761, 21945,
        22005, 22377, 23496, 23586, 24200, 24819, 25809, 27083, 29552,
        31422, 32453, 32454, 32758, 38792, 40513, 40792, 41977, 42251,
        42266, 44267, 45333, 45334, 46302, 46631, 46649, 47197, 49883,
        50016, 50070, 50112, 50115, 50377, 50380, 50522, 50627, 50657,
        50688, 50707, 50736, 50737, 50744, 50798, 50802, 50860, 50863,
        50924, 50930, 50931, 50948, 50959, 50976, 51115, 51130, 51143,
        51147, 51158, 51192, 51193, 51280, 51296, 51324, 51325, 51328,
        51341, 51361, 51370, 51411, 51445, 51446, 51448, 51454, 51537,
        51550, 51559, 51560, 51617, 51675, 51696, 51838, 51841, 51842,
        51906, 51959, 51985, 52083, 52223, 52332, 52390, 524

In [9]:
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [10]:
with open('./identity_hate_train_matrix.pkl', 'rb') as f:
    X_train = pickle.load(f)

In [27]:
with open('./identity_hate_test_matrix.pkl', 'rb') as f:
    X_test = pickle.load(f)

In [9]:
X_train.shape

(159571, 380)

In [13]:
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag')

    cv_score = np.mean(cross_val_score(classifier, train_char_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_char_features, train_target)
    submission[class_name] = classifier.predict_proba(test_char_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission.csv', index=False)

CV score for class toxic is 0.9729069992661149
CV score for class severe_toxic is 0.9874013208313278
CV score for class obscene is 0.9856102206766034
CV score for class threat is 0.9839026965488363
CV score for class insult is 0.9798944530250471
CV score for class identity_hate is 0.9804904405244007
Total CV score is 0.9817010218120551


In [47]:
values = np.r_[[0.1, 1.0, 1.5], np.linspace(2.0, 10.0, 9)]
print('C values in:', values)
for C in values:
    train_target = X_train
    classifier = LogisticRegression(solver='saga', tol=1e-4, max_iter=200, C=C)

    cv_score = np.mean(cross_val_score(classifier, X=train_target, y=train['identity_hate'], cv=3, 
                                   scoring='roc_auc'))
    print('CV score for class identity_hate for C = {} is: {}'.format(C, cv_score))

C values in: [ 0.1  1.   1.5  2.   3.   4.   5.   6.   7.   8.   9.  10. ]
CV score for class identity_hate for C = 0.1 is: 0.9731002395994834
CV score for class identity_hate for C = 1.0 is: 0.9807745550602197
CV score for class identity_hate for C = 1.5 is: 0.9816719456277725
CV score for class identity_hate for C = 2.0 is: 0.9821737474152678
CV score for class identity_hate for C = 3.0 is: 0.9826243825586761
CV score for class identity_hate for C = 4.0 is: 0.9828450520804303
CV score for class identity_hate for C = 5.0 is: 0.9829430175194808
CV score for class identity_hate for C = 6.0 is: 0.9829779808216407
CV score for class identity_hate for C = 7.0 is: 0.9829987047080989
CV score for class identity_hate for C = 8.0 is: 0.9829737914196128
CV score for class identity_hate for C = 9.0 is: 0.9829386317248304
CV score for class identity_hate for C = 10.0 is: 0.982920355112824


In [48]:
print('Optimal C value is: 7.0')

Optimal C value is: 7.0


При больших параметрах C результаты на cv несильно отличаются, так что для остальных классов подбор параметров можно не проводить

# ExtraTreesRegression

In [7]:
from sklearn.ensemble import ExtraTreesRegressor

In [33]:
for num_f in range(5, 41, 5):
    train_target = X_train
    #classifier = LogisticRegression(solver='sag')
    classifier = ExtraTreesRegressor(max_depth=5, max_features=num_f, n_estimators=50)

    cv_score = np.mean(cross_val_score(classifier, X=train_target, y=train['identity_hate'], cv=3, 
                                   scoring='roc_auc'))
    print('CV score for class identity_hate for {} max_features is {}'.format(num_f, cv_score))

CV score for class identity_hate for 5 max_features is 0.9552150309755415
CV score for class identity_hate for 10 max_features is 0.9585089574441321
CV score for class identity_hate for 15 max_features is 0.9524322778975104
CV score for class identity_hate for 20 max_features is 0.9543314804227746
CV score for class identity_hate for 25 max_features is 0.9481725652209549
CV score for class identity_hate for 30 max_features is 0.9460289821727766
CV score for class identity_hate for 35 max_features is 0.9414149441979224
CV score for class identity_hate for 40 max_features is 0.9316432086942253


In [27]:
for depth in range(3, 30, 2):
    train_target = X_train
    #classifier = LogisticRegression(solver='sag')
    classifier = ExtraTreesRegressor(max_depth=depth, max_features=10, n_estimators=50)

    cv_score = np.mean(cross_val_score(classifier, X=train_target, y=train['identity_hate'], cv=3, 
                                   scoring='roc_auc'))
    print('CV score for class identity_hate for {} depth is {}'.format(depth, cv_score))

CV score for class identity_hate for 3 depth is 0.9493649655148243
CV score for class identity_hate for 5 depth is 0.9541263996740131
CV score for class identity_hate for 7 depth is 0.9569196968451656
CV score for class identity_hate for 9 depth is 0.9621008939014727
CV score for class identity_hate for 11 depth is 0.9649993412916351
CV score for class identity_hate for 13 depth is 0.9657365016485288
CV score for class identity_hate for 15 depth is 0.9672766036873425
CV score for class identity_hate for 17 depth is 0.9694513168365363
CV score for class identity_hate for 19 depth is 0.9698232591097492
CV score for class identity_hate for 21 depth is 0.9703291831269739
CV score for class identity_hate for 23 depth is 0.9722308766179198
CV score for class identity_hate for 25 depth is 0.9728644167991757
CV score for class identity_hate for 27 depth is 0.9700937039487915
CV score for class identity_hate for 29 depth is 0.9702557993305173


### Roc_auc для n_estimators = 2000: 0.977. Модель показала себя не очень хорошо

# Градиентный бустинг (недоделано)

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

In [22]:
for value in [1, 3, 5, 10, 15, 20]:
    train_target = X_train
    #classifier = LogisticRegression(solver='sag')
    classifier = GradientBoostingRegressor(max_depth=5, max_features=value, n_estimators=50)

    cv_score = np.mean(cross_val_score(classifier, X=train_target, y=train['identity_hate'], cv=3, 
                                   scoring='roc_auc'))
    print('CV score for class identity_hate for {} max_features is {}'.format(value, cv_score))

CV score for class identity_hate for 1 max_features is 0.9549866260616277
CV score for class identity_hate for 3 max_features is 0.9611634044493965
CV score for class identity_hate for 5 max_features is 0.9607212388121905
CV score for class identity_hate for 10 max_features is 0.962324985375722
CV score for class identity_hate for 15 max_features is 0.9504973849900216
CV score for class identity_hate for 20 max_features is 0.9491200942497932


In [24]:
for value in [3, 5, 7, 9, 11, 15]:
    train_target = X_train
    #classifier = LogisticRegression(solver='sag')
    classifier = GradientBoostingRegressor(max_depth=value, max_features=10, n_estimators=50)

    cv_score = np.mean(cross_val_score(classifier, X=train_target, y=train['identity_hate'], cv=5, 
                                   scoring='roc_auc'))
    print('CV score for class identity_hate for {} max_depth is {}'.format(value, cv_score))

CV score for class identity_hate for 1 max_depth is 0.8996935212875462
CV score for class identity_hate for 3 max_depth is 0.9541602448395373
CV score for class identity_hate for 5 max_depth is 0.9540233660853575
CV score for class identity_hate for 7 max_depth is 0.962598820425725
CV score for class identity_hate for 9 max_depth is 0.963011249475989
CV score for class identity_hate for 11 max_depth is 0.9651724341287156


In [14]:


train_target = X_train
#classifier = LogisticRegression(solver='sag')
classifier = GradientBoostingRegressor(max_depth=3, n_estimators=100, max_features=10)

%time cv_score = np.mean(cross_val_score(classifier, X=train_target, y=train['identity_hate'], cv=3,\
                                   scoring='roc_auc'))
print('CV score for class identity_hate is {}'.format(cv_score))



CPU times: user 7.13 s, sys: 7.89 ms, total: 7.14 s
Wall time: 7.14 s
CV score for class identity_hate is 0.9582371790839845
