# 1. Подготовка

In [2]:
#импорт библиотек
import pandas as pd
import nltk
import numpy as np
from pymystem3 import Mystem
from catboost import Pool, cv
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from sklearn.model_selection import  GridSearchCV
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Download the NLTK stop-word corpus so that `nltk_stopwords.words(...)` can be used later.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Read the toxic-comments CSV into a DataFrame, then show its schema
# summary followed by the first five rows.
data = pd.read_csv(filepath_or_buffer='/datasets/toxic_comments.csv')
data.info()
display(data.head(n=5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
text     159571 non-null object
toxic    159571 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [5]:
# Data-quality checks: count of missing values in each column,
# then the number of rows that are exact duplicates of an earlier row.
print(data.isnull().sum(axis=0))
print(data.duplicated(keep='first').sum())

text     0
toxic    0
dtype: int64
0


In [6]:
# Coerce the comment column to unicode strings before vectorisation.
data['text'] = data['text'].astype('U')

# Model input is the comment text; the label is the `toxic` column.
features, target = data['text'], data['toxic']

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=12345,
)

In [7]:
# Collect the English stop words from the NLTK corpus into a set.
stopwords = {word for word in nltk_stopwords.words('english')}

In [8]:
# Convert the raw comments into TF-IDF features.
# scikit-learn documents `stop_words` as 'english', a list, or None; recent
# releases validate the parameter and reject a set, so pass a list explicitly.
count_tf_idf = TfidfVectorizer(stop_words=list(stopwords))

# Fit the vocabulary and IDF weights on the training split only, then apply
# the already-fitted vectoriser to the test split.
# NOTE: this rebinds `features_train` / `features_test` from text to the
# vectorised matrices, so re-run the split cell before re-running this one.
features_train = count_tf_idf.fit_transform(features_train)
features_test = count_tf_idf.transform(features_test)

# 2. Обучение

In [9]:
# Fit a logistic-regression classifier on the TF-IDF features
# (intercept enabled explicitly, fixed random seed).
model_lr = LogisticRegression(random_state=12345, fit_intercept=True)
model_lr.fit(X=features_train, y=target_train)
# `score` reports mean accuracy; here it is measured on the training split.
print(model_lr.score(X=features_train, y=target_train))



0.959115292702084


In [10]:
%%time
# обучим CatBoost
model_cbc = CatBoostClassifier(iterations=120,
                           depth=4,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)

model_cbc.fit(features_train, target_train, verbose = 10)
print(model_cbc.score(features_train, target_train))

0:	learn: 0.2681262	total: 2.92s	remaining: 5m 48s
10:	learn: 0.1805168	total: 29.1s	remaining: 4m 48s
20:	learn: 0.1597081	total: 55.3s	remaining: 4m 20s
30:	learn: 0.1466500	total: 1m 21s	remaining: 3m 54s
40:	learn: 0.1399643	total: 1m 48s	remaining: 3m 29s
50:	learn: 0.1339637	total: 2m 15s	remaining: 3m 2s
60:	learn: 0.1298000	total: 2m 41s	remaining: 2m 36s
70:	learn: 0.1260875	total: 3m 8s	remaining: 2m 9s
80:	learn: 0.1224943	total: 3m 35s	remaining: 1m 43s
90:	learn: 0.1193782	total: 4m 2s	remaining: 1m 17s
100:	learn: 0.1165053	total: 4m 28s	remaining: 50.6s
110:	learn: 0.1136478	total: 4m 55s	remaining: 24s
119:	learn: 0.1115612	total: 5m 19s	remaining: 0us
0.9643376393322081
CPU times: user 6min 49s, sys: 16.2 s, total: 7min 5s
Wall time: 7min 8s


In [11]:
# Evaluate logistic regression on the held-out test split.
predicted = model_lr.predict(features_test)
# f1_score is documented as f1_score(y_true, y_pred): ground truth first,
# predictions second.
print(f1_score(target_test, predicted))

0.7350326022525193


In [12]:
# Evaluate CatBoost on the held-out test split.
predicted_cbc = model_cbc.predict(features_test)
# Show the raw predicted labels for a quick visual check.
print(predicted_cbc)
# f1_score is documented as f1_score(y_true, y_pred): ground truth first,
# predictions second.
print(f1_score(target_test, predicted_cbc))

[1. 0. 0. ... 0. 0. 0.]
0.7535145267104031


# 3. Выводы

В ходе работы мы обучили модели, подходящие для решения этой задачи. Логистическая регрессия показала неплохой результат (F1 ≈ 0.735 на тестовой выборке) и почти справилась с задачей. Пришлось использовать модель CatBoost, которая справилась неплохо (F1 ≈ 0.754 на тестовой выборке), но обучалась долго (120 итераций ≈ 5 минут). Конечно, можно уменьшить количество итераций, чтобы ускорить обучение, но тогда мы, соответственно, потеряем в качестве предсказаний.