In [1]:
import numpy as np

In [2]:
path_train='dataset/train.csv'
path_test='dataset/test.csv'

In [3]:
import pandas as pd
train = pd.read_csv(path_train)
test = pd.read_csv(path_test)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train_x, val_x, train_y, val_y = train_test_split(train['comment_text'],train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']], test_size=0.2, random_state=2)

In [6]:
import re, string,gc
symbols = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return symbols.sub(r' \1 ', s).split()

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
transform_function = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1).fit(train['comment_text'])



In [8]:
comments_train = transform_function.transform(train_x)
comments_val = transform_function.transform(val_x)
comments_test = transform_function.transform(test['comment_text'])

In [9]:
col = ['total_length', 'capitals', 'caps_vs_length','num_exclamation_marks', 'num_question_marks', 'num_punctuation','num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique','num_smilies']

In [11]:
combined = [train_x, val_x, test]

for data in combined:
    data['total_length'] = data['comment_text'].apply(len)
    data['capitals'] = data['comment_text'].apply(lambda x: sum(1 for c in x if c.isupper()))
    data['caps_vs_length'] = data.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    data['num_exclamation_marks'] = data['comment_text'].apply(lambda x: x.count('!'))
    data['num_question_marks'] = data['comment_text'].apply(lambda x: x.count('?'))
    data['num_punctuation'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '.,;:'))
    data['num_symbols'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '*&$%'))
    data['num_words'] = data['comment_text'].apply(lambda x: len(x.split()))
    data['num_unique_words'] = data['comment_text'].apply(lambda x: len(set(w for w in x.split())))
    data['words_vs_unique'] = data['num_unique_words'] / data['num_words']
    data['num_smilies'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in (':-)', ':)', ';-)', ';)')))

KeyError: 'comment_text'

In [10]:
import scipy
train_x = scipy.sparse.csr_matrix(train_x[col].values)
val_x = scipy.sparse.csr_matrix(val_x[col].values)
test = scipy.sparse.csr_matrix(test[col].values)

KeyError: "None of [Index(['total_length', 'capitals', 'caps_vs_length', 'num_exclamation_marks',\n       'num_question_marks', 'num_punctuation', 'num_symbols', 'num_words',\n       'num_unique_words', 'words_vs_unique', 'num_smilies'],\n      dtype='object')] are in the [index]"

In [14]:
comments_train = scipy.sparse.hstack([train_x.tocsr(),comments_train.tocsr()])
comments_val = scipy.sparse.hstack([val_x,comments_val])
comments_test = scipy.sparse.hstack([test,comments_test])

In [15]:
import xgboost as xgb

In [16]:
def run(train_X, train_y, test_X, test_y=None, feature_names=None):
    dic = {}
    dic['objective'] = 'binary:logistic'
    dic['eta'] = 0.1
    dic['max_depth'] = 6
    dic['silent'] = 1
    dic['eval_metric'] = 'auc'
    dic['min_child_weight'] = 1
    dic['subsample'] = 0.7
    dic['colsample_bytree'] = 0.7
    num = 100
    list_dic = list(dic.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgtest = xgb.DMatrix(test_X, label=test_y)

    model = xgb.train(list_dic, xgtrain, num, [ (xgtrain,'train'), (xgtest, 'test') ], early_stopping_rounds=10)

    return model 

In [17]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = run(comments_train, train_y[j], comments_val,val_y[j])
    preds[:,i] = model.predict(xgb.DMatrix(comments_test), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic




Parameters: { "silent" } are not used.

[0]	train-auc:0.69918	test-auc:0.69861
[1]	train-auc:0.71597	test-auc:0.71605
[2]	train-auc:0.75598	test-auc:0.75801
[3]	train-auc:0.76397	test-auc:0.76431
[4]	train-auc:0.76690	test-auc:0.76740
[5]	train-auc:0.82699	test-auc:0.82094
[6]	train-auc:0.85755	test-auc:0.84995
[7]	train-auc:0.85698	test-auc:0.85013
[8]	train-auc:0.85867	test-auc:0.85116
[9]	train-auc:0.86382	test-auc:0.85564
[10]	train-auc:0.87147	test-auc:0.86218
[11]	train-auc:0.87592	test-auc:0.86619
[12]	train-auc:0.88396	test-auc:0.87416
[13]	train-auc:0.88761	test-auc:0.87731
[14]	train-auc:0.89162	test-auc:0.88209
[15]	train-auc:0.89253	test-auc:0.88239
[16]	train-auc:0.89931	test-auc:0.89070
[17]	train-auc:0.90224	test-auc:0.89346
[18]	train-auc:0.90466	test-auc:0.89581
[19]	train-auc:0.90597	test-auc:0.89692
[20]	train-auc:0.90828	test-auc:0.89936
[21]	train-auc:0.91003	test-auc:0.90095
[22]	train-auc:0.91256	test-auc:0.90405
[23]	train-auc:0.91431	test-auc:0.90561
[24]	train



NameError: name 'gc' is not defined

In [None]:
labels=pd.read_csv('dataset/test_labels.csv')
labels=np.array(labels.iloc[:,1:])
sum_labels=np.sum(labels,axis=1)
idx=sum_labels>=0

In [None]:
preds_consider=preds[idx]
labels_consider= labels[idx]
preds_consider.shape,labels_consider.shape

((63978, 6), (63978, 6))

In [None]:
from sklearn.metrics import roc_auc_score
scores=[]
for i in range(6):
  scores.append(roc_auc_score(labels_consider[:,i],preds_consider[:,i]))
np.mean(scores)

0.9635699477762086