In [3]:
import pandas as pd
from catboost import CatBoostClassifier
import re
import nltk
from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer
from typing import List
from tqdm import tqdm
import random
import numpy as np
import os

In [4]:
tqdm.pandas()

In [5]:
def set_all_seeds(seed=42):
    # python's seeds
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
set_all_seeds()
seed=42

In [6]:


# Загружаем стоп-слова NLTK (для русского и английского)
nltk.download("stopwords")
stop_words = set(stopwords.words("russian")) | set(stopwords.words("english"))

# Инициализация морфологического анализатора pymorphy3 для лемматизации
morph = MorphAnalyzer()

def preprocess_text(text: str) -> str:
    """
    Предобрабатывает текст для модели CatBoost.
    
    Шаги обработки:
    1. Приведение к нижнему регистру.
    2. Удаление всех символов, кроме букв и пробелов.
    3. Токенизация (разделение на слова).
    4. Удаление стоп-слов.
    5. Лемматизация слов.
    
    :param text: Входной текст.
    :return: Предобработанный текст.
    """
    
    # 1. Приводим текст к нижнему регистру
    text = text.lower()
    
    # 2. Удаляем спецсимволы, цифры и оставляем только буквы и пробелы
    text = re.sub(r"[^a-zа-яё\s]", " ", text)
    
    # 3. Токенизация: разделение текста на слова
    words = text.split()
    
    # 4. Удаление стоп-слов
    words = [word for word in words if word not in stop_words]
    
    # 5. Лемматизация
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]
    
    # Собираем обратно в строку и возвращаем результат
    processed_text = " ".join(lemmatized_words)
    
    return processed_text


[nltk_data] Downloading package stopwords to C:\Users\nsk-
[nltk_data]     adm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
df_train = pd.read_csv(r'train.csv')
df_train.drop(columns=['id'], inplace=True)

In [8]:
df_train['comment_text'] = df_train['comment_text'].progress_apply(preprocess_text)

100%|██████████| 159571/159571 [01:00<00:00, 2629.73it/s]


In [9]:
cb1 = CatBoostClassifier(iterations=500, text_features=['comment_text'], random_seed=seed)
cb2 = CatBoostClassifier(iterations=500 ,text_features=['comment_text'], random_seed=seed)
cb3 = CatBoostClassifier(iterations=500 ,text_features=['comment_text'], random_seed=seed)
cb4 = CatBoostClassifier(iterations=500 ,text_features=['comment_text'], random_seed=seed)
cb5 = CatBoostClassifier(iterations=500 ,text_features=['comment_text'], random_seed=seed)
cb6 = CatBoostClassifier(iterations=500 ,text_features=['comment_text'], random_seed=seed)

In [10]:
X = df_train[['comment_text']]
y1 = df_train['toxic']
y2 = df_train['severe_toxic']
y3 = df_train['obscene']
y4 = df_train['threat']
y5 = df_train['insult']
y6 = df_train['identity_hate']

In [11]:
cb1.fit(X, y1)

Learning rate set to 0.169684
0:	learn: 0.4819422	total: 243ms	remaining: 2m 1s
1:	learn: 0.3492592	total: 330ms	remaining: 1m 22s
2:	learn: 0.2748612	total: 416ms	remaining: 1m 8s
3:	learn: 0.2343488	total: 503ms	remaining: 1m 2s
4:	learn: 0.2012141	total: 601ms	remaining: 59.5s
5:	learn: 0.1838809	total: 688ms	remaining: 56.6s
6:	learn: 0.1718166	total: 783ms	remaining: 55.1s
7:	learn: 0.1631222	total: 875ms	remaining: 53.8s
8:	learn: 0.1572059	total: 966ms	remaining: 52.7s
9:	learn: 0.1529291	total: 1.05s	remaining: 51.7s
10:	learn: 0.1510148	total: 1.14s	remaining: 50.7s
11:	learn: 0.1484995	total: 1.23s	remaining: 50.1s
12:	learn: 0.1472267	total: 1.32s	remaining: 49.4s
13:	learn: 0.1461096	total: 1.4s	remaining: 48.8s
14:	learn: 0.1446561	total: 1.5s	remaining: 48.4s
15:	learn: 0.1432822	total: 1.58s	remaining: 47.9s
16:	learn: 0.1424080	total: 1.68s	remaining: 47.6s
17:	learn: 0.1418347	total: 1.76s	remaining: 47.1s
18:	learn: 0.1412305	total: 1.85s	remaining: 46.7s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x28e6b9ef740>

In [12]:
cb2.fit(X, y2)

Learning rate set to 0.169684
0:	learn: 0.3846984	total: 73.4ms	remaining: 36.6s
1:	learn: 0.1898762	total: 160ms	remaining: 39.8s
2:	learn: 0.1076155	total: 251ms	remaining: 41.5s
3:	learn: 0.0710361	total: 343ms	remaining: 42.6s
4:	learn: 0.0518030	total: 435ms	remaining: 43s
5:	learn: 0.0416103	total: 528ms	remaining: 43.5s
6:	learn: 0.0375685	total: 619ms	remaining: 43.6s
7:	learn: 0.0337569	total: 707ms	remaining: 43.5s
8:	learn: 0.0305431	total: 795ms	remaining: 43.4s
9:	learn: 0.0295051	total: 889ms	remaining: 43.6s
10:	learn: 0.0288567	total: 977ms	remaining: 43.4s
11:	learn: 0.0282176	total: 1.06s	remaining: 43.2s
12:	learn: 0.0275054	total: 1.16s	remaining: 43.3s
13:	learn: 0.0272018	total: 1.24s	remaining: 43.2s
14:	learn: 0.0269594	total: 1.33s	remaining: 43s
15:	learn: 0.0267519	total: 1.42s	remaining: 42.9s
16:	learn: 0.0266551	total: 1.5s	remaining: 42.7s
17:	learn: 0.0264722	total: 1.59s	remaining: 42.5s
18:	learn: 0.0264113	total: 1.67s	remaining: 42.2s
19:	learn: 0.02

<catboost.core.CatBoostClassifier at 0x28e1f0b4e00>

In [13]:
cb3.fit(X, y3)

Learning rate set to 0.169684
0:	learn: 0.4092822	total: 153ms	remaining: 1m 16s
1:	learn: 0.2702323	total: 331ms	remaining: 1m 22s
2:	learn: 0.1965366	total: 514ms	remaining: 1m 25s
3:	learn: 0.1442764	total: 708ms	remaining: 1m 27s
4:	learn: 0.1174350	total: 906ms	remaining: 1m 29s
5:	learn: 0.1012159	total: 1.09s	remaining: 1m 29s
6:	learn: 0.0950331	total: 1.28s	remaining: 1m 30s
7:	learn: 0.0871090	total: 1.51s	remaining: 1m 33s
8:	learn: 0.0840739	total: 1.69s	remaining: 1m 32s
9:	learn: 0.0799298	total: 1.86s	remaining: 1m 31s
10:	learn: 0.0783154	total: 2.02s	remaining: 1m 29s
11:	learn: 0.0762908	total: 2.19s	remaining: 1m 29s
12:	learn: 0.0750699	total: 2.35s	remaining: 1m 27s
13:	learn: 0.0740441	total: 2.52s	remaining: 1m 27s
14:	learn: 0.0726392	total: 2.67s	remaining: 1m 26s
15:	learn: 0.0718844	total: 2.84s	remaining: 1m 26s
16:	learn: 0.0712104	total: 3.01s	remaining: 1m 25s
17:	learn: 0.0705688	total: 3.17s	remaining: 1m 24s
18:	learn: 0.0697719	total: 3.36s	remaining:

<catboost.core.CatBoostClassifier at 0x28e1f0b4e90>

In [14]:
cb4.fit(X, y4)

Learning rate set to 0.169684
0:	learn: 0.3203595	total: 87.1ms	remaining: 43.5s
1:	learn: 0.1579111	total: 181ms	remaining: 45s
2:	learn: 0.0840628	total: 279ms	remaining: 46.2s
3:	learn: 0.0456629	total: 378ms	remaining: 46.9s
4:	learn: 0.0292304	total: 478ms	remaining: 47.3s
5:	learn: 0.0216266	total: 570ms	remaining: 46.9s
6:	learn: 0.0172847	total: 668ms	remaining: 47s
7:	learn: 0.0153357	total: 762ms	remaining: 46.8s
8:	learn: 0.0140682	total: 855ms	remaining: 46.7s
9:	learn: 0.0134051	total: 953ms	remaining: 46.7s
10:	learn: 0.0131286	total: 1.04s	remaining: 46.3s
11:	learn: 0.0126478	total: 1.14s	remaining: 46.3s
12:	learn: 0.0122564	total: 1.23s	remaining: 46.3s
13:	learn: 0.0121156	total: 1.32s	remaining: 46s
14:	learn: 0.0119940	total: 1.42s	remaining: 45.8s
15:	learn: 0.0117825	total: 1.52s	remaining: 45.9s
16:	learn: 0.0116758	total: 1.61s	remaining: 45.8s
17:	learn: 0.0115373	total: 1.72s	remaining: 46s
18:	learn: 0.0113807	total: 1.82s	remaining: 46.1s
19:	learn: 0.01134

<catboost.core.CatBoostClassifier at 0x28e1f0b40b0>

In [15]:
cb5.fit(X, y5)

Learning rate set to 0.169684
0:	learn: 0.4217595	total: 89.8ms	remaining: 44.8s
1:	learn: 0.2871109	total: 183ms	remaining: 45.6s
2:	learn: 0.2138790	total: 275ms	remaining: 45.6s
3:	learn: 0.1637628	total: 373ms	remaining: 46.2s
4:	learn: 0.1384536	total: 466ms	remaining: 46.2s
5:	learn: 0.1206519	total: 560ms	remaining: 46.1s
6:	learn: 0.1140385	total: 653ms	remaining: 46s
7:	learn: 0.1061488	total: 751ms	remaining: 46.2s
8:	learn: 0.1003014	total: 855ms	remaining: 46.7s
9:	learn: 0.0982463	total: 949ms	remaining: 46.5s
10:	learn: 0.0954098	total: 1.04s	remaining: 46.3s
11:	learn: 0.0933149	total: 1.14s	remaining: 46.2s
12:	learn: 0.0925014	total: 1.23s	remaining: 46.1s
13:	learn: 0.0916755	total: 1.32s	remaining: 46s
14:	learn: 0.0906831	total: 1.42s	remaining: 45.8s
15:	learn: 0.0895129	total: 1.52s	remaining: 46s
16:	learn: 0.0890206	total: 1.61s	remaining: 45.8s
17:	learn: 0.0886064	total: 1.7s	remaining: 45.5s
18:	learn: 0.0881881	total: 1.79s	remaining: 45.4s
19:	learn: 0.0878

<catboost.core.CatBoostClassifier at 0x28e1f0b6630>

In [16]:
cb6.fit(X, y6)

Learning rate set to 0.169684
0:	learn: 0.3775458	total: 93.3ms	remaining: 46.6s
1:	learn: 0.2166218	total: 188ms	remaining: 46.7s
2:	learn: 0.1316876	total: 287ms	remaining: 47.6s
3:	learn: 0.0832775	total: 391ms	remaining: 48.5s
4:	learn: 0.0599456	total: 492ms	remaining: 48.7s
5:	learn: 0.0476836	total: 597ms	remaining: 49.2s
6:	learn: 0.0411728	total: 700ms	remaining: 49.3s
7:	learn: 0.0373016	total: 811ms	remaining: 49.9s
8:	learn: 0.0351156	total: 918ms	remaining: 50.1s
9:	learn: 0.0334320	total: 1.01s	remaining: 49.7s
10:	learn: 0.0323211	total: 1.11s	remaining: 49.4s
11:	learn: 0.0315082	total: 1.21s	remaining: 49.3s
12:	learn: 0.0310282	total: 1.31s	remaining: 49.1s
13:	learn: 0.0304813	total: 1.41s	remaining: 49.1s
14:	learn: 0.0300698	total: 1.52s	remaining: 49.1s
15:	learn: 0.0298173	total: 1.62s	remaining: 49s
16:	learn: 0.0295804	total: 1.72s	remaining: 49s
17:	learn: 0.0293949	total: 1.82s	remaining: 48.7s
18:	learn: 0.0291139	total: 1.93s	remaining: 48.8s
19:	learn: 0.0

<catboost.core.CatBoostClassifier at 0x28e1f0b4da0>

In [17]:
test = pd.read_csv(r'test.csv')

In [18]:
test['comment_text'] = test['comment_text'].progress_apply(preprocess_text)

100%|██████████| 153164/153164 [01:05<00:00, 2337.81it/s]


In [19]:
test['toxic'] = cb1.predict_proba(test[['comment_text']])[:, 1]
test['severe_toxic'] = cb2.predict_proba(test[['comment_text']])[:, 1]
test['obscene'] = cb3.predict_proba(test[['comment_text']])[:, 1]
test['threat'] = cb4.predict_proba(test[['comment_text']])[:, 1]
test['insult'] = cb5.predict_proba(test[['comment_text']])[:, 1]
test['identity_hate'] = cb6.predict_proba(test[['comment_text']])[:, 1]

In [20]:
test.drop(columns=['comment_text']).to_csv('answer.csv', index=False)

In [21]:
test

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,yo bitch ja rule succesful ever whats hating s...,0.992719,0.149049,0.907984,0.077191,0.863013,0.169093
1,0000247867823ef7,rfc title fine imo,0.016071,0.000828,0.004951,0.000416,0.008251,0.001339
2,00013b17ad220c46,sources zawe ashton lapland,0.025307,0.002818,0.008205,0.001496,0.014832,0.007827
3,00017563c3f7919a,look back source information updated correct f...,0.004744,0.000506,0.001532,0.000628,0.002263,0.000503
4,00017695ad8997eb,anonymously edit articles,0.022693,0.001913,0.006215,0.001325,0.010462,0.004175
...,...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,totally agree stuff nothing long crap,0.660755,0.000759,0.297984,0.000614,0.049526,0.003385
153160,fffd7a9a6eb32c16,throw field home plate get faster throwing cut...,0.015626,0.000567,0.005948,0.000467,0.008938,0.004011
153161,fffda9e8d6fafa9e,okinotorishima categories see changes agree co...,0.003609,0.000450,0.001338,0.000359,0.001832,0.000682
153162,fffe8f1340a79fc2,one founding nations eu germany law return qui...,0.003553,0.000198,0.001483,0.000215,0.002501,0.008073
