In [89]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy


In [90]:
import scipy

In [109]:
# Read the train/test CSVs and replace every missing cell with the literal string "unknown".
train = pd.read_csv('train.csv').fillna("unknown")
test = pd.read_csv('test.csv').fillna("unknown")

In [110]:
# Hold out 20% of the rows for validation; random_state is fixed so the split is repeatable.
# The first argument carries the comment text, the second the six label columns.
train_mes, valid_mes, train_l, valid_l = train_test_split(
    train['comment_text'],
    train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
    test_size=0.2,
    random_state=2,
)


In [111]:
# Peek at the first two comments of the training split.
train_mes.head(2)

91312    "\n\nTemplate:Shortcut\n\nOut of curiosity, wh...
86103    I always got the impression he was a real cunt...
Name: comment_text, dtype: object

In [112]:
# Peek at the first two comments of the validation split.
valid_mes.head(2)

57791    In reference to the formula 1-(-1) = 2 you pos...
16982    Sir William Lawrence, 1st Baronet \n\nObviousl...
Name: comment_text, dtype: object

In [113]:
# Peek at the first two label rows of the training split.
train_l.head(2)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
91312,0,0,0,0,0,0
86103,0,0,1,0,1,0


In [114]:
# Peek at the first two label rows of the validation split.
valid_l.head(2)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
57791,0,0,0,0,0,0
16982,0,0,0,0,0,0


In [115]:
import re, string

# The punctuation is escaped before it is placed inside the character class.
# Unescaped, the `\]` pair in string.punctuation is parsed as an escaped `]`,
# so a literal backslash is never matched, and the bare `[` makes `re` emit a
# "possible nested set" FutureWarning.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into whitespace-separated tokens, with punctuation isolated.

    Every character matched by ``re_tok`` (ASCII punctuation plus the extra
    quote/symbol characters listed in the pattern) is padded with a space on
    both sides, then the string is split on whitespace. Each such character
    therefore becomes its own token.

    Args:
        s: The text to tokenize.

    Returns:
        A list of token strings; empty when ``s`` is empty or whitespace-only.
    """
    return re_tok.sub(r' \1 ', s).split()


In [116]:
# TF-IDF over character 1- to 3-grams.
# * No `tokenizer` is passed: scikit-learn only uses a tokenizer when
#   analyzer='word', so with analyzer='char' it was ignored and merely
#   triggered an "unused parameter" warning.
# * The on/off switches are real booleans rather than the int 1, which
#   newer scikit-learn releases reject during parameter validation.
# NOTE(review): the vectorizer is fitted on every row of `train`, which
# includes the rows held out for validation — confirm that is intended.
transform_com = TfidfVectorizer(
    ngram_range=(1, 3),
    analyzer='char',
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
).fit(train['comment_text'])

In [117]:
# Turn the training and validation comments into TF-IDF matrices using the fitted vectorizer.
comments_train = transform_com.transform(train_mes)
comments_valid = transform_com.transform(valid_mes)

In [118]:
# Apply the same fitted vectorizer to the test comments.
comments_test = transform_com.transform(test['comment_text'])

In [119]:
# Force a garbage-collection pass; the value shown as cell output is gc.collect()'s return value.
gc.collect()


73

In [120]:
# Wrap the comment Series of each split in a DataFrame so that feature columns can be added to it.
# Note: this rebinds train_mes / valid_mes, which were Series up to this point.
train_mes = pd.DataFrame(train_mes)
valid_mes = pd.DataFrame(valid_mes)

In [121]:
# The three frames that share a 'comment_text' column and receive the same hand-crafted features.
data = [train_mes, valid_mes, test]

In [122]:
# Add simple per-comment statistics as new columns to every frame in `data`.
# The frames are modified in place.
for element in data:
    comments = element['comment_text']

    # Character-level counts.
    element['total_length'] = comments.apply(len)
    element['capitals'] = comments.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Column-wise division instead of a row-by-row apply(axis=1): same values,
    # far less Python overhead. An empty comment gives 0/0 = NaN here rather
    # than raising ZeroDivisionError; that ratio is defined as 0.0.
    element['caps_vs_length'] = (element['capitals'] / element['total_length']).fillna(0.0)

    # Punctuation and symbol counts.
    element['num_exclamation_marks'] = comments.apply(lambda comment: comment.count('!'))
    element['num_question_marks'] = comments.apply(lambda comment: comment.count('?'))
    element['num_punctuation'] = comments.apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
    element['num_symbols'] = comments.apply(lambda comment: sum(comment.count(w) for w in '*&$%'))

    # Word-level counts (words are whitespace-separated chunks).
    element['num_words'] = comments.apply(lambda comment: len(comment.split()))
    element['num_unique_words'] = comments.apply(lambda comment: len(set(comment.split())))
    # A whitespace-only comment has zero words; its 0/0 ratio is set to 0.0
    # so no NaN ends up in the feature matrix.
    element['words_vs_unique'] = (element['num_unique_words'] / element['num_words']).fillna(0.0)

    # Occurrences of a few ASCII smileys.
    element['num_smilies'] = comments.apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))

In [123]:
# Names of the hand-crafted feature columns; the list order fixes the column
# order wherever `col` is used to select them.
col = [
    'total_length',
    'capitals',
    'caps_vs_length',
    'num_exclamation_marks',
    'num_question_marks',
    'num_punctuation',
    'num_symbols',
    'num_words',
    'num_unique_words',
    'words_vs_unique',
    'num_smilies',
]

In [124]:
# Replace each frame with a CSR sparse matrix holding only the columns listed in `col`.
# NOTE(review): this rebinds train_mes, valid_mes and `test`, so the DataFrames are no longer
# reachable afterwards and this cell cannot be re-run on its own. The training loop further
# down reads comments_train / comments_valid / comments_test rather than these matrices —
# confirm whether the hand-crafted features were meant to be combined with the TF-IDF ones.
train_mes = scipy.sparse.csr_matrix(train_mes[col].values)
valid_mes = scipy.sparse.csr_matrix(valid_mes[col].values)
test = scipy.sparse.csr_matrix(test[col].values)


In [125]:
import xgboost as xgb


In [126]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    """Train a binary-logistic XGBoost model and return the fitted booster.

    Args:
        train_X: Training feature matrix, passed to ``xgb.DMatrix``.
        train_y: Training labels.
        test_X: Evaluation feature matrix; only used when ``test_y`` is given.
        test_y: Evaluation labels. When provided, the evaluation set is
            monitored during training and training stops once its AUC has
            not improved for 20 rounds. When ``None``, the model is trained
            for exactly ``num_rounds`` rounds with no evaluation set.
        feature_names: Optional feature names attached to the DMatrix objects
            built here. Left at ``None`` the matrices are unnamed, as before.
            If names are supplied, any DMatrix later passed to
            ``model.predict`` will presumably need the same names — verify
            against the xgboost version in use.
        seed_val: Random seed handed to XGBoost.
        num_rounds: Maximum number of boosting rounds.

    Returns:
        The trained model returned by ``xgb.train``.
    """
    param = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'eval_metric': 'auc',
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is None:
        # No labelled evaluation set: plain fixed-length training. The
        # evaluation matrix is not needed here, so it is not built.
        return xgb.train(plst, xgtrain, num_rounds)

    xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
    # The last watchlist entry ('test') is the one early stopping tracks.
    watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
    return xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)

In [127]:
# Label column names, one model is trained per entry.
# Note: this rebinds `col`, which previously held the feature column names.
col = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]


In [128]:
# Prediction buffer: one row per row of `test`, one column per entry of `col`, initialised to zero.
preds = np.zeros((test.shape[0], len(col)))

In [129]:
# The test matrix is the same for every label, so it is wrapped in a DMatrix
# once instead of being rebuilt on each iteration.
xgtest_all = xgb.DMatrix(comments_test)

# Train one model per label column and store its test predictions in the
# matching column of `preds`.
for label_idx, label in enumerate(col):
    print('fit '+label)
    model = runXGB(comments_train, train_l[label], comments_valid, valid_l[label])
    # Predict with the tree count reported as best by early stopping.
    preds[:, label_idx] = model.predict(xgtest_all, ntree_limit=model.best_ntree_limit)

fit toxic
[0]	train-auc:0.774743	test-auc:0.771865
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
[1]	train-auc:0.818378	test-auc:0.811372
[2]	train-auc:0.847807	test-auc:0.840707
[3]	train-auc:0.853439	test-auc:0.845658
[4]	train-auc:0.855188	test-auc:0.847146
[5]	train-auc:0.85909	test-auc:0.850699
[6]	train-auc:0.86262	test-auc:0.854869
[7]	train-auc:0.882407	test-auc:0.874645
[8]	train-auc:0.889507	test-auc:0.880181
[9]	train-auc:0.892054	test-auc:0.88317
[10]	train-auc:0.894995	test-auc:0.886343
[11]	train-auc:0.897552	test-auc:0.889041
[12]	train-auc:0.899423	test-auc:0.89044
[13]	train-auc:0.90369	test-auc:0.894621
[14]	train-auc:0.905154	test-auc:0.896713
[15]	train-auc:0.907516	test-auc:0.898973
[16]	train-auc:0.913106	test-auc:0.904034
[17]	train-auc:0.915634	test-auc:0.906192
[18]	train-auc:0.91729	test-auc:0.907331
[19]	train-auc:0.919532	test-auc:0.909415
[20]	train-auc:0.920782	t

In [130]:
# Force another garbage-collection pass after training; the cell output is gc.collect()'s return value.
gc.collect()

223

In [131]:
# Assemble the submission: the `id` column from the sample submission file,
# followed by one prediction column per entry of `col`, written to xgb.csv.
subm = pd.read_csv('sample_submission.csv')
submid = subm[['id']].copy()
submission = pd.concat(
    [submid, pd.DataFrame(preds, columns=col)],
    axis=1,
)
submission.to_csv('xgb.csv', index=False)

In [None]:
1