In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [9]:
import re

from catboost import CatBoostClassifier, Pool
from catboost.text_processing import Tokenizer, Dictionary

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [3]:
import catboost
catboost.__version__

'1.0.4'

# Constants

In [12]:
SEED = 42
y_label = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load Data

In [6]:
src = pd.read_csv('data_train_clean.csv')
test_src = pd.read_csv('data_test_clean.csv')

# Preprocessing

In [7]:
def cleanString(comment: str) -> str:
    comment = re.sub('n\'t', '', comment)
    comment = re.sub('\'m', '', comment)
    comment = re.sub('\'ve', '', comment)
    # comment = re.sub(' to', '', comment)
    # comment = re.sub('the', '', comment)
    comment = re.sub('\'s', '', comment)
    comment = re.sub(' is', '', comment)
    comment = re.sub(' are', '', comment)
    comment = re.sub(' have', '', comment)
    comment = re.sub(' has', '', comment)
    comment = re.sub(' a', '', comment)
    comment = re.sub(' the', '', comment)


    comment = comment.replace('\n', ' \n ')
    comment = comment.replace(r'([*!?\'])\1\1{2,}',r'\1\1\1')    
    comment = comment.replace(r'[0-9]', '')
    # remove punctuation and numbers
    # comment = re.sub('[^a-zA-Z!?%]', ' ', comment)
    comment = re.sub('[^a-zA-Z%]', ' ', comment)
    # del %
    comment = re.sub('%', '', comment)
    # remove multiple spaces
    comment = re.sub(r' +', ' ', comment)
    # remove newline
    comment = re.sub(r'\n', ' ', comment)
    # remove digits
    # comment = ''.join(i for i in comment if not i.isdigit())
    comment = re.sub(r' +', ' ', comment)
    comment = comment.strip()
    return comment

In [10]:
src.clean_comment = src.clean_comment.map(cleanString)
test_src.clean_comment = test_src.clean_comment.map(cleanString)

In [13]:
src.sample(5, random_state=SEED)

Unnamed: 0.1,Unnamed: 0,id,clean_comment,toxic,severe_toxic,obscene,threat,insult,identity_hate
96708,92945,f887d88d2f290304,b nswer to m xc xbctze here m xc xbctze questi...,0,0,0,0,0,0
28033,67099,b3859ebf00f82cb9,b yes yes we want to make more episodes for th...,0,0,0,0,0,0
96254,120877,86b66414765e1fe6,b rudeccusations you one making falseccusation...,0,0,0,0,0,0
1841,86236,e6a18182c0af24ea,b valigntop june straits chinese recreation cl...,0,0,0,0,0,0
47528,24539,40d4bc848e533b85,b wow ericbtw dont you like hentai too eww low...,0,0,0,0,0,0


# Tokenizer

In [15]:
src.iloc[13164].clean_comment

'b although igree with block i do not thinkrchiving within couple of hours would be wise it resulted in turmoil before maybefter few more hours or something'

In [10]:
# tokenizer = Tokenizer(lowercasing=True,
#                       number_process_policy=None,
#                       separator_type='BySense',
#                       skip_empty=True,
#                       token_types=['Word'],
#                       sub_tokens_policy='SeveralTokens')

In [11]:
# tokenizer.tokenize(src.iloc[131631].comment_text)

In [16]:
text_proc_param = {
    'tokenizers': [{'tokenizer_id': 'Sense',
                    'separator_type': 'BySense',
                    'lowercasing': 'True',
                    'token_types':['Word'],
                    'sub_tokens_policy':'SeveralTokens'}],
    'dictionaries': [{'dictionary_id': 'Word',
                      'max_dictionary_size': '4000'}],
    'feature_calcers': ['BoW:top_tokens_count=3000']
}

# Train test split

In [17]:
df_train, df_test = train_test_split(src, test_size=0.15, random_state=SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=SEED)

print('train shape:', df_train.shape)
print('valid shape:', df_val.shape)
print('test  shape:', df_test.shape)

train shape: (94944, 9)
valid shape: (8377, 9)
test  shape: (8378, 9)


In [18]:
train_pools = dict()
val_pools = dict()
oos_pools = dict()
for label in y_label:
    train_pools[label] = Pool(df_train[['clean_comment']], label=df_train[label], text_features=['clean_comment'])
    val_pools[label] = Pool(df_val[['clean_comment']], label=df_val[label], text_features=['clean_comment'])
    oos_pools[label] = Pool(df_test[['clean_comment']], label=df_test[label], text_features=['clean_comment'])

In [19]:
train_pools

{'toxic': <catboost.core.Pool at 0x24e9da02be0>,
 'severe_toxic': <catboost.core.Pool at 0x24e9da02ca0>,
 'obscene': <catboost.core.Pool at 0x24e9da02760>,
 'threat': <catboost.core.Pool at 0x24e9da02fa0>,
 'insult': <catboost.core.Pool at 0x24e9fcadb80>,
 'identity_hate': <catboost.core.Pool at 0x24e9da02820>}

# Training

In [16]:
models = dict()
print('===Start Train===')
for label in y_label:
    print('Train model for label', label)
    models[label] = CatBoostClassifier(learning_rate=0.3,
                                       task_type='GPU',
                                       iterations=5000,
                                       eval_metric='AUC',
                                       od_type='Iter',
                                       od_wait=500,
                                       random_state=SEED,
                                       **text_proc_param)
    models[label].fit(train_pools[label], eval_set=val_pools[label], early_stopping_rounds=500, 
                      verbose=100, use_best_model=True)

===Start Train===
Train model for label toxic
0:	learn: 0.6685019	test: 0.6761781	best: 0.6761781 (0)	total: 16.7ms	remaining: 1m 23s
100:	learn: 0.9421952	test: 0.9413802	best: 0.9413802 (100)	total: 1.12s	remaining: 54.5s
200:	learn: 0.9537013	test: 0.9472930	best: 0.9473977 (191)	total: 2.17s	remaining: 51.7s
300:	learn: 0.9601055	test: 0.9484918	best: 0.9486518 (268)	total: 3.21s	remaining: 50.1s
400:	learn: 0.9650146	test: 0.9498805	best: 0.9498805 (400)	total: 4.26s	remaining: 48.8s
500:	learn: 0.9683781	test: 0.9508457	best: 0.9509888 (496)	total: 5.29s	remaining: 47.5s
600:	learn: 0.9713670	test: 0.9518200	best: 0.9518308 (599)	total: 6.38s	remaining: 46.7s
700:	learn: 0.9739548	test: 0.9529701	best: 0.9529701 (700)	total: 7.44s	remaining: 45.6s
800:	learn: 0.9761288	test: 0.9535662	best: 0.9536126 (788)	total: 8.52s	remaining: 44.7s
900:	learn: 0.9780705	test: 0.9534857	best: 0.9536126 (788)	total: 10s	remaining: 45.5s
1000:	learn: 0.9795243	test: 0.9534761	best: 0.9537410 (92

In [17]:
y_pred_oos = np.zeros((df_test.shape[0], len(y_label)))
for idx, label in enumerate(y_label):
    y_pred_oos[:, idx] = models[label].predict_proba(oos_pools[label])[:, 1]
print(roc_auc_score(df_test[y_label], y_pred_oos))

0.9574696026863109


# Predict and save

In [18]:
test_pool = Pool(test_src[['clean_comment']], text_features=['clean_comment'])

In [19]:
y_pred_test = np.zeros((test_src.shape[0], len(y_label)))
for idx, label in enumerate(y_label):
    y_pred_test[:, idx] = models[label].predict_proba(test_pool)[:, 1]

In [20]:
y_pred_test

In [None]:
prediction = pd.DataFrame(y_pred_test, columns=y_label)

In [None]:
prediction.to_csv('submission.csv', index=False)

In [22]:
ss.to_csv('submission.csv', index=False)

In [None]:
prediction.head()