#  Поиск токсичных комментариев
Интернет-магазин запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию.  
Надо обучить модель классифицировать комментарии на позитивные и негативные. В вашем распоряжении набор данных с разметкой о токсичности правок.   Метрика качества F1 не меньше 0.75.


# Подготовка

In [1]:
# Импорт необходимых библиотек и компонентов
import pandas as pd 
import nltk 
import re

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer 

from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from catboost import CatBoostClassifier

In [2]:
# Загрузка данных
data = pd.read_csv('toxic_comments.csv') 
stop_words = set(stopwords.words('english'))
corpus = data['text'] 

In [3]:
# Посмотрим данные
data.head(20)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
5,"""\n\nCongratulations from me as well, use the ...",0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,Your vandalism to the Matt Shirvington article...,0
8,Sorry if the word 'nonsense' was offensive to ...,0
9,alignment on this subject and which are contra...,0


In [4]:
# Функции очистки и леммитизации текста
lemmatizer = WordNetLemmatizer() 
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text
def lemmatize_text(text):
    return lemmatizer.lemmatize(clean_text(text))

In [5]:
# Создание столбца с очищенным и леммитизированным текстом
data['clean_text'] = data['text'].apply(lambda x: lemmatize_text(x))

In [6]:
data

Unnamed: 0,text,toxic,clean_text
0,Explanation\nWhy the edits made under my usern...,0,explanation why the edits made under my userna...
1,D'aww! He matches this background colour I'm s...,0,d aww he matches this background colour i am s...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man i am really not trying to edit war it ...
3,"""\nMore\nI can't make any real suggestions on ...",0,more i cannot make any real suggestions on imp...
4,"You, sir, are my hero. Any chance you remember...",0,you sir are my hero any chance you remember wh...
...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,and for the second time of asking when your vi...
159567,You should be ashamed of yourself \n\nThat is ...,0,you should be ashamed of yourself that is a ho...
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,spitzer umm theres no actual article for prost...
159569,And it looks like it was actually you who put ...,0,and it looks like it was actually you who put ...


In [7]:
# Разделение на тренировочную и тестовую выборки
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Обучение

Используем три модели: Мешок слов, TF-IDF, CatBoost 

## Мешок слов

In [8]:
# Формирование фичей и целевого признака train
count_vect = CountVectorizer(stop_words=stop_words)  
corpus_train = train['clean_text'].values.astype('U')
X_train = count_vect.fit_transform(corpus_train) 
y_train = train['toxic'].values 

In [9]:
# Формирование фичей и целевого признака test
corpus_test = test[['clean_text']].values.astype('U') 
X_test = count_vect.transform((corpus_test.ravel())) 
y_test = test['toxic'].values

In [10]:
# Построение модели и вывод результата
model = LogisticRegression(max_iter=1000) 
model.fit(X_train, y_train) 
pred = model.predict(X_test)
f1_bow = f1_score(y_test, pred)
print('F1 на мешке слов:', f1_bow)

F1 на мешке слов: 0.7693888032871083


## TF-IDF

In [11]:
# Формирование фичей и целевого признака train
count_tf_idf = TfidfVectorizer(stop_words=stopwords.words('english')) 
corpus_train = train['clean_text'].values.astype('U')
X_train = count_tf_idf.fit_transform(corpus_train) 
y_train = train['toxic'].values 

# Формирование фичей и целевого признака test
corpus_test = test['clean_text'].values.astype('U') 
X_test = count_tf_idf.transform(corpus_test) 
y_test = test['toxic'].values

# Построение модели и вывод результата
model = LogisticRegression(max_iter=1000) 
model.fit(X_train, y_train) 
pred = model.predict(X_test)
f1_tf_idf = f1_score(y_test, pred)
print('F1 на TF-IDF:', f1_tf_idf)

F1 на TF-IDF: 0.7357997010463377


## CatBoost

In [12]:
X_col = ['text']
y_col = ['toxic']
text_features = ['text']

model = CatBoostClassifier(verbose=100,text_features=text_features, eval_metric = 'F1')
model.fit(train[X_col], train[y_col], eval_set= (test[X_col],test[y_col]))
pred = model.predict(test[X_col])
f1_cb = f1_score(test[y_col], pred)
print('F1 на CatBoost:', f1_cb)

Learning rate set to 0.104979
0:	learn: 0.6525309	test: 0.6788504	best: 0.6788504 (0)	total: 315ms	remaining: 5m 14s
100:	learn: 0.6882322	test: 0.6905845	best: 0.6908263 (99)	total: 17.6s	remaining: 2m 36s
200:	learn: 0.7051434	test: 0.6955911	best: 0.6961094 (182)	total: 34.5s	remaining: 2m 16s
300:	learn: 0.7157885	test: 0.7000176	best: 0.7018775 (290)	total: 50.7s	remaining: 1m 57s
400:	learn: 0.7239167	test: 0.6998947	best: 0.7018775 (290)	total: 1m 6s	remaining: 1m 40s
500:	learn: 0.7318767	test: 0.7028260	best: 0.7030729 (496)	total: 1m 23s	remaining: 1m 22s
600:	learn: 0.7383846	test: 0.7020193	best: 0.7037752 (548)	total: 1m 39s	remaining: 1m 5s
700:	learn: 0.7448524	test: 0.7013718	best: 0.7037752 (548)	total: 1m 55s	remaining: 49.2s
800:	learn: 0.7502519	test: 0.7024648	best: 0.7037752 (548)	total: 2m 11s	remaining: 32.7s
900:	learn: 0.7559269	test: 0.7027978	best: 0.7037752 (548)	total: 2m 27s	remaining: 16.2s
999:	learn: 0.7599388	test: 0.7031963	best: 0.7037752 (548)	tota

# Выводы 

In [13]:
print('F1 на мешке слов:', f1_bow)
print('F1 на TF-IDF:', f1_tf_idf)
print('F1 на CatBoost:', f1_cb)

F1 на мешке слов: 0.7693888032871083
F1 на TF-IDF: 0.7357997010463377
F1 на CatBoost: 0.7037752414398595


Наилучшей моделью является "Мешок слов" с целевым значением F1= 0.77.