# Поиск токсичных комментариев

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import nltk 
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
import time
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import warnings
warnings.simplefilter('ignore')
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('/datasets/toxic_comments.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0


In [5]:
# функция для очистки текста
def clear_text(text):
    clear_text = re.sub(r'[^a-zA-Z]', ' ', text)
    clear_text = clear_text.split()
    clear_text = " ".join(clear_text)
    return clear_text

In [6]:
# цикл для очистки текста
for i in tqdm(range(len(corpus))):
    corpus[i] = clear_text(corpus[i])

100%|██████████| 159292/159292 [00:04<00:00, 39786.76it/s]


In [7]:
# создадим новый датафрейм и добавим столбец с очищенным текстом
df_corpus = pd.DataFrame(corpus)
df['clear_text'] = df_corpus[0]
display(df.head(10))
df.info()

Unnamed: 0.1,Unnamed: 0,text,toxic,clear_text
0,0,explanation\nwhy the edits made under my usern...,0,explanation why the edits made under my userna...
1,1,d'aww! he matches this background colour i'm s...,0,d aww he matches this background colour i m se...
2,2,"hey man, i'm really not trying to edit war. it...",0,hey man i m really not trying to edit war it s...
3,3,"""\nmore\ni can't make any real suggestions on ...",0,more i can t make any real suggestions on impr...
4,4,"you, sir, are my hero. any chance you remember...",0,you sir are my hero any chance you remember wh...
5,5,"""\n\ncongratulations from me as well, use the ...",0,congratulations from me as well use the tools ...
6,6,cocksucker before you piss around on my work,1,cocksucker before you piss around on my work
7,7,your vandalism to the matt shirvington article...,0,your vandalism to the matt shirvington article...
8,8,sorry if the word 'nonsense' was offensive to ...,0,sorry if the word nonsense was offensive to yo...
9,9,alignment on this subject and which are contra...,0,alignment on this subject and which are contra...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159292 entries, 0 to 159291
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  159292 non-null  int64 
 1   text        159292 non-null  object
 2   toxic       159292 non-null  int64 
 3   clear_text  159292 non-null  object
dtypes: int64(2), object(2)
memory usage: 4.9+ MB


In [8]:
import spacy

In [9]:
en_core = spacy.load('en_core_web_sm')

In [10]:
stopwords = set(nltk_stopwords.words('english'))

In [11]:
%%time
# запустим леммитизацию и добавим удаление стоп слов в функцию
df["lemm_text"] = df['clear_text'].apply(lambda x: " ".join([y.lemma_ for y in en_core(x) if x not in stopwords]))

CPU times: user 36min 19s, sys: 17.6 s, total: 36min 37s
Wall time: 36min 41s


In [12]:
# удалим ненужный столбец "Unnamed: 0"
df = df.drop('Unnamed: 0', axis=1)

In [13]:
# удалим также столбцы text и clear_text
df = df.drop('text', axis=1)
df = df.drop('clear_text', axis=1)

In [14]:
df.head()

Unnamed: 0,toxic,lemm_text
0,0,explanation why the edit make under my usernam...
1,0,d aww he match this background colour I m seem...
2,0,hey man I m really not try to edit war it s ju...
3,0,more I can t make any real suggestion on impro...
4,0,you sir be my hero any chance you remember wha...


In [15]:
# посмотрим на распределение toxic 
df['toxic'].value_counts()

0    143106
1     16186
Name: toxic, dtype: int64

In [16]:
# посмотрим отношение количества строк с 0 к 1 
rat = len(df.loc[df['toxic']==0])/len(df.loc[df['toxic']==1])
rat

8.841344371679229

In [17]:
# разделим датасет на обучающую и тестовую выборки в соотношении 4/1 соответственно
train, test = train_test_split(df, test_size=0.25, random_state=12345, stratify = df['toxic'])

In [18]:
# проверим как разбили датасет
display(train.shape)
display(test.shape)

(119469, 2)

(39823, 2)

In [19]:
# объявим переменные features и target для каждой выборки
features_train = train['lemm_text']
target_train = train['toxic']
features_test = test['lemm_text']
target_test = test['toxic']

In [20]:
display(features_train.shape)
display(features_test.shape)
display(target_train.shape)
display(target_test.shape)

(119469,)

(39823,)

(119469,)

(39823,)

In [None]:
%%time
# Обучим модель LogisticRegression
param_range_fl = [1.0, 0.5]
pipe_lr = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(random_state=12345))])

params_lr = [{'clf__penalty': ['l1', 'l2'],
        'clf__C': param_range_fl,
        'clf__solver': ['liblinear']}] 

model_LR = GridSearchCV(estimator=pipe_lr,
            param_grid=params_lr,
            scoring='f1',
            cv=4) 
model_LR.fit(features_train, target_train) 
print(model_LR.best_score_*100)
print(model_LR.best_params_)

In [22]:
%%time
# Обучим модель RandomForestClassifier
pipe_rf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
            ('clf', RandomForestClassifier(random_state=12345, class_weight='balanced'))])

params_RF = [{'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': [7, 8],
        'clf__n_estimators' : [35, 40],
        'clf__min_samples_leaf': [3,5],      
        'clf__min_samples_split': [1, 2]}]

model_RF = GridSearchCV(estimator=pipe_rf,
            param_grid=params_RF,
            scoring='f1',
            cv=4, 
            n_jobs=-1)
model_RF.fit(features_train, target_train)

print(model_RF.best_score_*100)
print(model_RF.best_params_)

32.12440344476139
{'clf__criterion': 'entropy', 'clf__max_depth': 8, 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 2, 'clf__n_estimators': 40}
CPU times: user 11min 31s, sys: 9.97 s, total: 11min 41s
Wall time: 11min 43s


In [23]:
%%time
# Обучим модель LGBMClassifier

pipe_LGBM = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
            ('clf', LGBMClassifier(random_state=12345, class_weight='balanced'))])

params_LGBM = [{
    'clf__n_estimators': [35, 40],
    'clf__learning_rate': [0.3, 0.4]
}]

model_LGBM = GridSearchCV(estimator=pipe_LGBM,
            param_grid=params_LGBM,
            scoring='f1',
            cv=3, 
            n_jobs=-1)
model_LGBM.fit(features_train, target_train)

print(model_LGBM.best_score_*100)
print(model_LGBM.best_params_)

72.33600342089431
{'clf__learning_rate': 0.4, 'clf__n_estimators': 40}
CPU times: user 20min 35s, sys: 6.41 s, total: 20min 41s
Wall time: 20min 50s


In [24]:
%%time
# Обучим модель CatBoostClassifier

pipe_CBC = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
            ('clf', CatBoostClassifier(random_state=12345))])

params_CBC = [{
    'clf__learning_rate' : [0.03, 0.04],
    'clf__iterations': [35, 40],
    'clf__depth': [4, 6]
}]

model_CBC = GridSearchCV(estimator=pipe_CBC,
            param_grid=params_CBC,
            scoring='f1',
            cv=3, 
            n_jobs=-1)
model_CBC.fit(features_train, target_train)

print(model_CBC.best_score_*100)
print(model_CBC.best_params_)

0:	learn: 0.6637367	total: 932ms	remaining: 31.7s
1:	learn: 0.6345804	total: 1.88s	remaining: 30.9s
2:	learn: 0.6076040	total: 2.82s	remaining: 30.1s
3:	learn: 0.5828112	total: 3.78s	remaining: 29.3s
4:	learn: 0.5596925	total: 4.76s	remaining: 28.6s
5:	learn: 0.5380504	total: 5.79s	remaining: 28s
6:	learn: 0.5171351	total: 6.8s	remaining: 27.2s
7:	learn: 0.4980839	total: 7.79s	remaining: 26.3s
8:	learn: 0.4803949	total: 8.75s	remaining: 25.3s
9:	learn: 0.4637500	total: 9.76s	remaining: 24.4s
10:	learn: 0.4493705	total: 10.7s	remaining: 23.4s
11:	learn: 0.4350483	total: 11.7s	remaining: 22.4s
12:	learn: 0.4224568	total: 12.6s	remaining: 21.3s
13:	learn: 0.4107546	total: 13.5s	remaining: 20.3s
14:	learn: 0.3995080	total: 14.5s	remaining: 19.4s
15:	learn: 0.3893122	total: 15.5s	remaining: 18.4s
16:	learn: 0.3795088	total: 16.5s	remaining: 17.4s
17:	learn: 0.3702465	total: 17.4s	remaining: 16.5s
18:	learn: 0.3621154	total: 18.4s	remaining: 15.5s
19:	learn: 0.3545409	total: 19.4s	remaining:

In [25]:
# создадим датафрейм для вывода результатов
columns = ['Модель', 'Время работы модели, сек.', 'f1-мера']
lr_model = ['Logistic Regression', 185, 78.45]
lgbm_model = ['LGBMClassifier', 197, 76.53]
rf_model = ['RandomForestClassifier', 771, 32.12]
cbc_model = ['CatBoostClassifier', 2272, 52.92]
table = pd.DataFrame([lr_model, lgbm_model, rf_model, cbc_model], columns = columns)


display(table)

Unnamed: 0,Модель,"Время работы модели, сек.",f1-мера
0,Logistic Regression,185,78.45
1,LGBMClassifier,197,76.53
2,RandomForestClassifier,771,32.12
3,CatBoostClassifier,2272,52.92


In [26]:
%%time
# проведем проверку на адекватность
model_dc = DummyClassifier().fit(features_train, target_train)
display(model_dc.score(features_test, target_test))

0.8984004218667604

CPU times: user 8.45 ms, sys: 0 ns, total: 8.45 ms
Wall time: 7.31 ms


In [27]:
%%time
# проверим мадель с лучшими показателями на тестовой выборке
pred_model_LR = model_LR.predict(features_test)
score_model_LR = f1_score(target_test, pred_model_LR)
score_model_LR

CPU times: user 1.63 s, sys: 9.65 ms, total: 1.64 s
Wall time: 1.66 s


0.7869444444444444

# Вывод
Перед нами стояла задача предоставить магазину инструмент, который будет искать токсичные комментарии и отправлять их на модерацию.
В нашем распоряжении оказался набор данных с разметкой о токсичности правок. Данные были загружены и подготовлены - мы произвели леммитизацию и очистку текса. 
Нами были обучены несколько моделей классификации. Модели с наилучшими показателями оказались: Логистическая регрессия и LGBMClassifier. Логистическая регрессия самая быстрая и самая лучшая по качеству.