In [1]:
import kagglehub
import pandas as pd
import os 
import numpy as np
from collections import Counter

from lemmatizer import get_normal_series

import nltk
from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer 



# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')
morph = MorphAnalyzer()
# NOTE: this rebinds `stopwords` from the imported nltk corpus module to a plain
# set of Russian stop words; the preprocessing cell passes this set to
# drop_stopwords(), so the name is kept as is.
stopwords = set(stopwords.words('russian'))

# Download the dataset and resolve the path of its CSV file.
dataset_dir = kagglehub.dataset_download("blackmoon/russian-language-toxic-comments")
# os.listdir() returns entries in arbitrary order, so taking element [0] of the
# raw listing is not deterministic; select the CSV explicitly and sort the names.
csv_files = sorted(name for name in os.listdir(dataset_dir) if name.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {dataset_dir}")
path = os.path.join(dataset_dir, csv_files[0])
print(path)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package stopwords to /home/ars/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  import pkg_resources


/home/ars/.cache/kagglehub/datasets/blackmoon/russian-language-toxic-comments/versions/1/labeled.csv


In [2]:
# Load the labelled comments from the CSV located in the setup cell.
df = pd.read_csv(path)

# Number of missing values in every column.
print("Нули", df.isna().sum())

# Mean of the `toxic` label column, printed with two decimals.
toxic_share = df['toxic'].mean()
print(f"{toxic_share:.2f} - доля токсиков")

# Average comment length per class, measured in characters and in
# whitespace-separated words, truncated to integers.
by_toxicity = df.groupby('toxic')
by_toxicity.agg(
    mean_num_letters=('comment', lambda texts: texts.str.len().mean().astype(int)),
    mean_num_words=('comment', lambda texts: texts.str.split().str.len().mean().astype(int)),
)


Нули comment    0
toxic      0
dtype: int64
0.33 - доля токсиков


Unnamed: 0_level_0,mean_num_letters,mean_num_words
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,194,30
1.0,141,22


In [3]:
def tokenizer(text: str) -> list[str]:
    """Split text into lowercase alphabetic tokens.

    Every character is lowercased; 'ё' (in either case) becomes 'е'; any
    non-alphabetic character is treated as a separator.

    Args:
        text: Raw input string.

    Returns:
        The list of tokens, in order of appearance (empty if there are none).
    """
    normalized = []
    for char in text:
        lowered = char.lower()
        if lowered == 'ё':
            normalized.append('е')
        elif char.isalpha():
            normalized.append(lowered)
        else:
            # Digits, punctuation and whitespace all act as token boundaries.
            normalized.append(' ')
    return ''.join(normalized).split()

def drop_stopwords(li: list[str], stopwords) -> list[str]:
    """Return the tokens of `li` that are not contained in `stopwords`.

    Args:
        li: Tokens to filter; order and duplicates are preserved.
        stopwords: Container supporting `in` with the tokens to remove.

    Returns:
        A new list with the surviving tokens.
    """
    kept = []
    for token in li:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# NOTE: this cell overwrites df['comment'] (raw strings -> token lists), so it
# must be run on a freshly loaded frame; re-running it alone would feed token
# lists back into tokenizer().
df['comment'] = df['comment'].apply(tokenizer)
df['comment'] = get_normal_series(df['comment'])
df['comment'] = df['comment'].apply(lambda x: drop_stopwords(x, stopwords))

# Flat list of every remaining token across all comments.
li = [token for tokens in df['comment'] for token in tokens]

# Corpus-wide token frequencies; words seen at most `drop_level` times are dropped.
count = Counter(li)
drop_level = 3
drop_words = [i for i in count if count[i] <= drop_level]
used_words = [i for i in count if count[i] > drop_level]

print(f'''
Уникальных слов: {len(count)}
Слов с частотой больше {drop_level}: {len(used_words)}
''')

# Build the lookup set once: constructing set(drop_words) inside the lambda
# would rebuild it for every row of the frame.
drop_words_set = set(drop_words)
df['comment'] = df['comment'].apply(lambda x: drop_stopwords(x, drop_words_set))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1201), Label(value='0 / 1201'))), …


Уникальных слов: 33809
Слов с частотой больше 3: 8769



In [4]:
def get_tf(words: list[str], documents: list[list[str]]) -> list[list[float]]:
    '''
    Return term frequencies as a nested list of
    shape: (len documents, len words)

    Entry [d][w] is the number of occurrences of words[w] in documents[d]
    divided by the total number of tokens in documents[d]; for an empty
    document the divisor is 1, so its whole row is 0.0.
    '''
    tf = []
    for text in documents:
        occurrences = Counter(text)
        # Guard against division by zero for documents without tokens.
        total = max(sum(occurrences.values()), 1)
        tf.append([occurrences.get(word, 0) / total for word in words])

    return tf


def get_idf(words: list[str], documents: list[list[str]]) -> list[float]:
    '''
    Return smoothed inverse document frequencies as a list of
    shape: (len words)

    For every word the value is log((N + 1) / (df + 1)), where N is the number
    of documents and df is the number of documents containing the word.

    The document frequencies are collected in a single pass over the corpus
    instead of testing every word against every document.
    '''
    n_documents = 0
    document_frequency = Counter()
    for text in documents:
        n_documents += 1
        # set() makes each word count at most once per document.
        document_frequency.update(set(text))

    # Counter returns 0 for words that never occur, without inserting them.
    return [
        np.log((n_documents + 1) / (document_frequency[word] + 1))
        for word in words
    ]
            

In [5]:
# Materialise the tokenised comments once and reuse them for both statistics.
documents = df['comment'].tolist()

tf = np.array(get_tf(used_words, documents))
idf = np.array(get_idf(used_words, documents))
# Broadcast the per-word idf vector across every document row.
tf_idf = tf * idf[np.newaxis, :]

# features   - tf_idf
# target     - df['toxic']
# vocabulary - used_words

In [6]:
from cuml.decomposition import PCA, TruncatedSVD
from cuml.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split 

# Fixed seed so that the split and the forest can be repeated between runs.
# NOTE(review): cuML may still not be bit-for-bit reproducible with a seed;
# confirm against the cuML RandomForestClassifier documentation.
SEED = 42

idf_train, idf_val, y_train, y_val = train_test_split(
    tf_idf, df['toxic'], test_size=0.2, random_state=SEED
)

# Optional dimensionality-reduction experiments, currently disabled.
# pca = PCA(n_components=100)
# idf_train = pca.fit_transform(idf_train)
# idf_val = pca.transform(idf_val)

# svd = TruncatedSVD(n_components=1000)
# idf_train = svd.fit_transform(idf_train)
# idf_val = svd.transform(idf_val)

model = RandomForestClassifier(n_estimators=300, max_depth=100, random_state=SEED)
model.fit(idf_train, y_train)
res = model.predict_proba(idf_val)

# Binarise the positive-class probability at thresholds 0.00, 0.01, ..., 0.99.
res_ = [[int(i > drop_rate/100) for i in res[:,1]] for drop_rate in range(0, 100)]

# NOTE: the two maxima are taken independently (possibly at different
# thresholds) and on the same validation split used to pick the threshold,
# so these numbers are optimistic estimates.
max(accuracy_score(y_val, pred) for pred in res_), max(f1_score(y_val, pred) for pred in res_)

(0.8414845646895595, 0.7424849699398798)

In [9]:
# NOTE(review): the recorded output shows 1000 columns, which matches the
# commented-out TruncatedSVD(n_components=1000) rather than the number of
# words kept in the vocabulary (8769 per the preprocessing output) — presumably
# it comes from an earlier run with SVD enabled; re-run the notebook to confirm.
idf_train.shape

(11529, 1000)