In [24]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostClassifier
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from tqdm.notebook import tqdm

In [25]:
df = pd.read_csv('ru_toxic_dataset.csv')

In [26]:
df['toxic'] = df['toxic'].apply(lambda x: int(x))

In [27]:
df

Unnamed: 0,comment,toxic
0,дворника надо тоже уничтожить!,1
1,"моя старшая неделю шипела, не принимала подкид...",0
2,полностью с вами согласна!,0
3,"хоть ногу вверх, ничего не изменится",0
4,а что значит - левого ребенка?,0
...,...,...
163182,Вонючий совковый скот прибежал и ноет. А вот и...,1
163183,А кого любить? Гоблина тупорылого что-ли? Или ...,1
163184,"Посмотрел Утомленных солнцем 2. И оказалось, ч...",0
163185,КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...,1


In [28]:
X_train, X_test, y_train, y_test = train_test_split(df['comment'], df['toxic'])

In [29]:
text_features = ['comment']

In [30]:
learn_pool = Pool(
        X_train, 
        y_train, 
        text_features=text_features,
        feature_names=['comment']
    )

In [38]:
model = CatBoostClassifier(iterations=3000, learning_rate=0.03, sampling_frequency='PerTree', class_weights=[0.2, 0.8])
model.fit(learn_pool, verbose=True)

0:	learn: 0.6697777	total: 28.6ms	remaining: 1m 25s
1:	learn: 0.6479109	total: 62.2ms	remaining: 1m 33s
2:	learn: 0.6266855	total: 97ms	remaining: 1m 36s
3:	learn: 0.6070558	total: 132ms	remaining: 1m 38s
4:	learn: 0.5900219	total: 166ms	remaining: 1m 39s
5:	learn: 0.5725177	total: 203ms	remaining: 1m 41s
6:	learn: 0.5574590	total: 237ms	remaining: 1m 41s
7:	learn: 0.5420014	total: 274ms	remaining: 1m 42s
8:	learn: 0.5296426	total: 309ms	remaining: 1m 42s
9:	learn: 0.5171915	total: 342ms	remaining: 1m 42s
10:	learn: 0.5055210	total: 379ms	remaining: 1m 42s
11:	learn: 0.4954250	total: 415ms	remaining: 1m 43s
12:	learn: 0.4850783	total: 453ms	remaining: 1m 44s
13:	learn: 0.4769638	total: 487ms	remaining: 1m 43s
14:	learn: 0.4690319	total: 520ms	remaining: 1m 43s
15:	learn: 0.4625383	total: 553ms	remaining: 1m 43s
16:	learn: 0.4552958	total: 587ms	remaining: 1m 43s
17:	learn: 0.4478774	total: 623ms	remaining: 1m 43s
18:	learn: 0.4417388	total: 657ms	remaining: 1m 43s
19:	learn: 0.4356461	

<catboost.core.CatBoostClassifier at 0x265c7228040>

In [39]:
y_pred = []
y_pred_proba = []
for el in tqdm(X_test):
    y_pred.append(model.predict([el]))
    y_pred_proba.append(model.predict_proba([el]))

  0%|          | 0/40797 [00:00<?, ?it/s]

In [40]:
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred), roc_auc_score(y_test, y_pred)

(0.9084491506728436, 0.7746334399324203, 0.8710061740365932)

In [34]:
model.classes_

array([0, 1], dtype=int64)

In [17]:
from collections import Counter

In [18]:
Counter(y_train)

Counter({0.0: 98754, 1.0: 23636})

In [41]:
model.save_model('toxic_clf.cbm')