In [1]:
import kagglehub
import pandas as pd
import os 

import nltk
from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer 

from transformers import AutoTokenizer, AutoModel
import torch


# Fetch the NLTK stop-word corpus and build the Russian NLP helpers.
nltk.download('stopwords')
morph = MorphAnalyzer()
# NOTE(review): this rebinds the imported `stopwords` corpus module to a plain
# set of Russian stop words. The name is kept for backward compatibility, but
# re-running this line without re-running the import will fail.
stopwords = set(stopwords.words('russian'))

# Download the dataset and locate its CSV file. os.listdir() returns entries in
# arbitrary order, so pick the CSV explicitly (alphabetically first when there
# are several) instead of blindly taking whatever entry happens to come first.
path = kagglehub.dataset_download("blackmoon/russian-language-toxic-comments")
csv_files = sorted(name for name in os.listdir(path) if name.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f'No CSV file found in dataset directory: {path}')
path = os.path.join(path, csv_files[0])
print(path)

df = pd.read_csv(path)

# Load the tokenizer and AutoModel from the Hugging Face model repository.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru").to(device)

# Mean pooling: average the token embeddings, counting only non-padded tokens.
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    Args:
        model_output: indexable model output whose first element holds the
            per-token embeddings (presumably shaped (batch, seq_len, hidden)).
        attention_mask: tensor with one entry per token; it is broadcast over
            the last embedding dimension and used as the averaging weight.

    Returns:
        Tensor with the token dimension (dim 1) averaged out.
    """
    hidden_states = model_output[0]
    # Stretch the mask across the embedding dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row divides by 1e-9 rather than 0.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_counts

[nltk_data] Downloading package stopwords to /home/ars/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  import pkg_resources


/home/ars/.cache/kagglehub/datasets/blackmoon/russian-language-toxic-comments/versions/1/labeled.csv


In [2]:
# Encode every comment (first column of `df`) into a sentence embedding,
# processing the frame in fixed-size batches.
batch_size = 80
# Ceiling division: the number of non-empty batches. Stepping the range by
# `batch_size` (instead of iterating len(df) // batch_size + 1 times) avoids a
# trailing empty batch when len(df) is an exact multiple of batch_size, and the
# progress counter now ends at "n/n" instead of "n+1/n".
n_batches = (len(df) + batch_size - 1) // batch_size
res = []
for ind, start in enumerate(range(0, len(df), batch_size)):
    print(f'{ind + 1}/{n_batches}')
    comments = df.iloc[start:start + batch_size, 0].tolist()

    encoded_input = tokenizer(comments, padding=True, truncation=True, max_length=512, return_tensors='pt')
    # Send the inputs to whichever device the model lives on rather than a
    # hard-coded 'cuda'.
    encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Move each batch to host memory immediately so device memory is not
    # accumulated across iterations.
    res.append(sentence_embeddings.detach().cpu())

# Stack the per-batch tensors into one (n_rows, hidden) NumPy feature matrix.
res2 = torch.cat(res)
res2 = res2.numpy()


1/180
2/180
3/180
4/180
5/180
6/180
7/180
8/180
9/180
10/180
11/180
12/180
13/180
14/180
15/180
16/180
17/180
18/180
19/180
20/180
21/180
22/180
23/180
24/180
25/180
26/180
27/180
28/180
29/180
30/180
31/180
32/180
33/180
34/180
35/180
36/180
37/180
38/180
39/180
40/180
41/180
42/180
43/180
44/180
45/180
46/180
47/180
48/180
49/180
50/180
51/180
52/180
53/180
54/180
55/180
56/180
57/180
58/180
59/180
60/180
61/180
62/180
63/180
64/180
65/180
66/180
67/180
68/180
69/180
70/180
71/180
72/180
73/180
74/180
75/180
76/180
77/180
78/180
79/180
80/180
81/180
82/180
83/180
84/180
85/180
86/180
87/180
88/180
89/180
90/180
91/180
92/180
93/180
94/180
95/180
96/180
97/180
98/180
99/180
100/180
101/180
102/180
103/180
104/180
105/180
106/180
107/180
108/180
109/180
110/180
111/180
112/180
113/180
114/180
115/180
116/180
117/180
118/180
119/180
120/180
121/180
122/180
123/180
124/180
125/180
126/180
127/180
128/180
129/180
130/180
131/180
132/180
133/180
134/180
135/180
136/180
137/180
138/180
139/

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split 

# Hold out 20% of the embeddings for validation. The seed is fixed so the
# split (and therefore every score below) is reproducible across runs.
idf_train, idf_val, y_train, y_val = train_test_split(
    res2, df['toxic'], test_size=0.2, random_state=42
)

# NOTE(review): `model` and `res` rebind names used by the embedding cells
# (the SBERT model and the list of batch embeddings). The names are kept
# because later cells read them, but the embedding cell cannot be re-run
# after this one without reloading the SBERT model first.
model = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)
model.fit(idf_train, y_train)
res = model.predict_proba(idf_val)

# Binarise the positive-class probability at every threshold 0.00 .. 0.99
# (one vectorised comparison per threshold).
res_ = [(res[:, 1] > drop_rate / 100).astype(int) for drop_rate in range(0, 100)]

# Quality is much better than with TF-IDF.
# NOTE(review): both maxima are taken over thresholds on the validation set
# itself, so they are optimistic estimates.
max(map(lambda x: accuracy_score(y_val, x), res_)), max(map(lambda x: f1_score(y_val, x), res_))

In [None]:
# Sweep the decision threshold from 0.00 to 0.99 and record
# [threshold, accuracy, F1] on the validation set for each value.
positive_proba = res[:, 1]
scores = []
for pct in range(100):
    drop_rate = pct / 100
    res_ = [int(p > drop_rate) for p in positive_proba]
    accuracy = accuracy_score(y_val, res_)
    f1 = f1_score(y_val, res_)
    scores.append([drop_rate, accuracy, f1])
# Show the row with the highest F1 (original author's note: 0.42 was best in their run).
max(scores, key=lambda row: row[2])

[0.39, 0.9004509191814083, 0.8565717141429285]

In [60]:
import joblib

# Persist whatever `model` is currently bound to (the RandomForestClassifier
# once the training cell has run) to "model.pkl" in the working directory.
joblib.dump(model, filename="model.pkl")

['model.pkl']