 Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию. 

## Подготовка

In [3]:
import warnings
warnings.filterwarnings("ignore") 
import pandas as pd 
import numpy as np 
import time 

import nltk
import spacy


import re 
from nltk.corpus import stopwords as nltk_stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import f1_score,roc_curve 
from sklearn.utils import shuffle 
from sklearn.model_selection import GridSearchCV 

from sklearn.linear_model import LogisticRegression  
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier


In [4]:
lemmatizer = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [5]:
#загрузка данных
df_coment = pd.read_csv('/datasets/toxic_comments.csv') 
df_coment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [6]:
display(df_coment.head(10))

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
5,"""\n\nCongratulations from me as well, use the ...",0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,Your vandalism to the Matt Shirvington article...,0
8,Sorry if the word 'nonsense' was offensive to ...,0
9,alignment on this subject and which are contra...,0


In [7]:
display(df_coment['toxic'].value_counts())
class_ratio = df_coment['toxic'].value_counts()[1] /len(df_coment)
print(f'Доля негативных коментариев =  {class_ratio*100:.4} %' )

0    143346
1     16225
Name: toxic, dtype: int64

Доля негативных коментариев =  10.17 %


In [8]:
class_ratio = df_coment['toxic'].value_counts()[0] / df_coment['toxic'].value_counts()[1]
print(f'Соотношение классов =  {class_ratio:.4}' )

Соотношение классов =  8.835


**Доля негативных коментариев не большая, из-за этого на 1 негативный коментарий приходится 8 положительных таким образом классы не сбалансированны**

In [9]:
def lem_txt(text): #Лемматизация
    text = text.lower()
    doc = lemmatizer(text)
    lemm_text = " ".join([token.lemma_ for token in doc])
    return   lemm_text 

In [10]:
sentence = "The striped bats are hanging on their feet for best"
print(lem_txt(sentence))

the stripe bat be hang on their foot for good


In [11]:
%%time
df_coment['lemm_text'] = df_coment['text'].apply(lem_txt)

CPU times: user 16min 33s, sys: 1.85 s, total: 16min 35s
Wall time: 16min 37s


In [12]:
display(df_coment.head(20))

Unnamed: 0,text,toxic,lemm_text
0,Explanation\nWhy the edits made under my usern...,0,explanation \n why the edit make under my user...
1,D'aww! He matches this background colour I'm s...,0,d'aww ! he match this background colour I be s...
2,"Hey man, I'm really not trying to edit war. It...",0,"hey man , I be really not try to edit war . it..."
3,"""\nMore\nI can't make any real suggestions on ...",0,""" \n more \n I can not make any real suggestio..."
4,"You, sir, are my hero. Any chance you remember...",0,"you , sir , be my hero . any chance you rememb..."
5,"""\n\nCongratulations from me as well, use the ...",0,""" \n\n congratulation from I as well , use the..."
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,cocksucker before you piss around on my work
7,Your vandalism to the Matt Shirvington article...,0,your vandalism to the matt shirvington article...
8,Sorry if the word 'nonsense' was offensive to ...,0,sorry if the word ' nonsense ' be offensive to...
9,alignment on this subject and which are contra...,0,alignment on this subject and which be contrar...


In [13]:
df_coment = df_coment.drop(['text'], axis=1)

In [14]:
target = df_coment['toxic'] 
features = df_coment.drop(['toxic'], axis=1) 

In [15]:
#разделяем на выборку для обучения тестирования и валидации, 60/20/20 
features_train, features_valid, target_train, target_valid = train_test_split(features, 
                                                                              target, 
                                                                              test_size=0.4, 
                                                                              random_state=2504)

features_valid, features_test, target_valid, target_test = train_test_split(features_valid, 
                                                                            target_valid, 
                                                                            test_size=0.5,
                                                                            random_state=2504)

In [16]:
nltk.download('stopwords') 
stopwords = set(nltk_stopwords.words('english')) 

count_tf_idf = TfidfVectorizer(stop_words=stopwords) 

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
features_train_V = count_tf_idf.fit_transform(features_train['lemm_text'].values)
features_valid_V= count_tf_idf.transform(features_valid['lemm_text'].values)
features_test_V= count_tf_idf.transform(features_test['lemm_text'].values)
print(features_train_V.shape)
print(features_valid_V.shape)
print(features_test_V.shape)

(95742, 127194)
(31914, 127194)
(31915, 127194)


## Обучение

In [18]:
cv_counts = 3 # количество блоков кросс валидации

In [33]:
models_scors = pd.DataFrame(columns = ['model', 'f1', 'lirning_time', 'predicting_time']) # таблица для результатов

### Модель логическая классификация

In [34]:
%%time

model_logic = LogisticRegression() 
train_logic_f1 = cross_val_score(model_logic, 
                      features_train_V, 
                      target_train, 
                      cv=cv_counts, 
                      scoring='f1').mean()
print(f'Cреднее гармоническое полноты и точности F1 = {train_logic_f1:.3}, при CV={cv_counts}')

Cреднее гармоническое полноты и точности F1 = 0.687, при CV=3
CPU times: user 39.4 s, sys: 54.1 s, total: 1min 33s
Wall time: 1min 33s


**С балансировкой классов**

In [35]:
%%time

model_logic_b = LogisticRegression(class_weight='balanced')
train_logic_b_f1= cross_val_score(model_logic_b, 
                                    features_train_V, 
                                    target_train, 
                                    cv=cv_counts, 
                                    scoring='f1').mean()
print(f'Cреднее гармоническое полноты и точности F1 = {train_logic_b_f1:.3}, при CV={cv_counts}')

Cреднее гармоническое полноты и точности F1 = 0.744, при CV=3
CPU times: user 36.6 s, sys: 49.8 s, total: 1min 26s
Wall time: 1min 26s


**По результатам видно, что балансировка классов улучшает показатель качества модели, но показатель все еще не достаточен. Модель обучается достаточно долго тюкю данных много, поэтому используем выравнивание классов путем downsempling.** 

In [19]:
def downsample(features, target, fraction): 
    features_zeros = features[target == 0] 
    features_ones = features[target == 1] 
    target_zeros = target[target == 0] 
    target_ones = target[target == 1] 

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=2504)] + [features_ones]) 
    
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=2504)] + [target_ones]) 
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=2504) 
    
    return features_downsampled, target_downsampled 


In [20]:
features_train_down, target_train_down=downsample(features_train, target_train, 0.2)
features_train_down=features_train_down.reset_index(drop=True)
target_train_down=target_train_down.reset_index(drop=True)


In [21]:
features_train_down = count_tf_idf.transform(features_train_down['lemm_text']
                                                   .values)
del count_tf_idf
del stopwords

In [22]:
%%time

model_logic = LogisticRegression() 
train_logic_f1 = cross_val_score(model_logic, 
                      features_train_down, 
                      target_train_down, 
                      cv=cv_counts, 
                      scoring='f1').mean()
print(f'Cреднее гармоническое полноты и точности F1 = {train_logic_f1:.3}, при CV={cv_counts}')

Cреднее гармоническое полноты и точности F1 = 0.831, при CV=3
CPU times: user 28.7 s, sys: 42.6 s, total: 1min 11s
Wall time: 1min 11s


**Проверка наилучшей модели**

In [23]:
%%time

start_time = time.time() 
model_logic.fit(features_train_down, target_train_down)  
lirning_time = time.time() - start_time 

start_time = time.time()
predictions_logic = model_logic.predict(features_valid_V) 
predicting_time = time.time()- start_time


result_logic =f1_score(target_valid, predictions_logic) 
print(f'Cреднее гармоническое полноты и точности F1 = {result_logic:.4}')

Cреднее гармоническое полноты и точности F1 = 0.7619
CPU times: user 6.24 s, sys: 8.28 s, total: 14.5 s
Wall time: 14.5 s


**Сохранение результатов**

In [41]:
models_scors = models_scors.append({'model': 'model_logic', 'f1': result_logic,
                    'lirning_time': lirning_time, 'predicting_time': predicting_time}, ignore_index=True)

### Модель, дерево решений

In [42]:
%%time
model_DTC = DecisionTreeClassifier()
parametrs=[{ 
    'max_depth':range(60,80,1),  
    'random_state':[2504] 
}]
search_best_param=GridSearchCV(model_DTC, 
                               parametrs, 
                               cv=cv_counts, 
                               scoring='f1')  
search_best_param.fit(features_train_down,  target_train_down) 
print(f'Cреднее гармоническое полноты и точности F1, при CV={cv_counts} ')
means =search_best_param.cv_results_['mean_test_score']
for mean, params in zip(means, search_best_param.cv_results_['params']):
    print("%0.6f for %r"% ((mean), params))
print()
print("Наилучшие найденые параметры:")
print()
print(search_best_param.best_params_)


Cреднее гармоническое полноты и точности F1, при CV=3 
0.787324 for {'max_depth': 60, 'random_state': 2504}
0.789692 for {'max_depth': 61, 'random_state': 2504}
0.789821 for {'max_depth': 62, 'random_state': 2504}
0.791598 for {'max_depth': 63, 'random_state': 2504}
0.791423 for {'max_depth': 64, 'random_state': 2504}
0.790479 for {'max_depth': 65, 'random_state': 2504}
0.791272 for {'max_depth': 66, 'random_state': 2504}
0.793128 for {'max_depth': 67, 'random_state': 2504}
0.792280 for {'max_depth': 68, 'random_state': 2504}
0.792322 for {'max_depth': 69, 'random_state': 2504}
0.794068 for {'max_depth': 70, 'random_state': 2504}
0.794470 for {'max_depth': 71, 'random_state': 2504}
0.794261 for {'max_depth': 72, 'random_state': 2504}
0.796022 for {'max_depth': 73, 'random_state': 2504}
0.796437 for {'max_depth': 74, 'random_state': 2504}
0.797156 for {'max_depth': 75, 'random_state': 2504}
0.795744 for {'max_depth': 76, 'random_state': 2504}
0.797511 for {'max_depth': 77, 'random_state

**Проверка наилучшей модели**

In [43]:
%%time

model_DTС = DecisionTreeClassifier(random_state=2504, max_depth=search_best_param.best_params_['max_depth']) 

start_time = time.time() 
model_DTС.fit(features_train_down, target_train_down)
lirning_time = time.time() - start_time 

start_time = time.time()
predictions_DTС = model_DTС.predict(features_valid_V) 
predicting_time = time.time()- start_time

result_DTС =f1_score(target_valid, predictions_DTС) 
print(f'Cреднее гармоническое полноты и точности F1 = {result_DTС:.4}') 

Cреднее гармоническое полноты и точности F1 = 0.6572
CPU times: user 9.26 s, sys: 12 ms, total: 9.27 s
Wall time: 9.28 s


**Сохранение результатов**

In [44]:
models_scors = models_scors.append({'model': 'DecisionTreeClassifier', 'f1': result_DTС,
                    'lirning_time': lirning_time, 'predicting_time': predicting_time}, ignore_index=True)

### Модель случайный лес

In [45]:
%%time
model_RFС = RandomForestClassifier ()
parametrs={
    'n_estimators':range(130,151,5),
    'max_depth':range(13,16,1) 
}
search_best_param_RFC=GridSearchCV(model_RFС,
                                   parametrs,
                                   scoring='f1')
search_best_param_RFC.fit(features_train_down,  target_train_down)

print(f'Cреднее гармоническое полноты и точности F1, при CV={cv_counts}')
means =search_best_param_RFC.cv_results_['mean_test_score']
for mean, params in zip(means, search_best_param_RFC.cv_results_['params']):
    print("%0.6f for %r"% ((mean), params))
print()
print("Наилучшие найденые параметры:")
print()
print(search_best_param_RFC.best_params_)

Cреднее гармоническое полноты и точности F1, при CV=3
0.037250 for {'max_depth': 13, 'n_estimators': 130}
0.032399 for {'max_depth': 13, 'n_estimators': 135}
0.029218 for {'max_depth': 13, 'n_estimators': 140}
0.027249 for {'max_depth': 13, 'n_estimators': 145}
0.031798 for {'max_depth': 13, 'n_estimators': 150}
0.039718 for {'max_depth': 14, 'n_estimators': 130}
0.039944 for {'max_depth': 14, 'n_estimators': 135}
0.037876 for {'max_depth': 14, 'n_estimators': 140}
0.039328 for {'max_depth': 14, 'n_estimators': 145}
0.043614 for {'max_depth': 14, 'n_estimators': 150}
0.058269 for {'max_depth': 15, 'n_estimators': 130}
0.052053 for {'max_depth': 15, 'n_estimators': 135}
0.050398 for {'max_depth': 15, 'n_estimators': 140}
0.045747 for {'max_depth': 15, 'n_estimators': 145}
0.052650 for {'max_depth': 15, 'n_estimators': 150}

Наилучшие найденые параметры:

{'max_depth': 15, 'n_estimators': 130}
CPU times: user 6min 35s, sys: 674 ms, total: 6min 36s
Wall time: 6min 36s


**Проверка наилучшей модели**

In [46]:
%%time

model_RFС = RandomForestClassifier(random_state=2504,
                                   max_depth=search_best_param_RFC.best_params_['max_depth'],
                                  n_estimators=search_best_param_RFC.best_params_['n_estimators'])

start_time = time.time() 
model_RFС.fit(features_train_down, target_train_down)  
lirning_time = time.time() - start_time 

start_time = time.time()
predictions_RFС = model_RFС.predict(features_valid_V) 
predicting_time = time.time()- start_time

result_RFС =f1_score(target_valid, predictions_RFС) 
print(f'Cреднее гармоническое полноты и точности F1 = {result_RFС:.4}') 

Cреднее гармоническое полноты и точности F1 = 0.04204
CPU times: user 7.3 s, sys: 6.07 ms, total: 7.3 s
Wall time: 7.31 s


**Сохранение результатов**

In [47]:
models_scors = models_scors.append({'model': 'RandomForestClassifier', 'f1': result_RFС,
                    'lirning_time': lirning_time, 'predicting_time': predicting_time}, ignore_index=True)

### Модель,  CatBoostClassifier классификация

**Подбор параметров и кросс валидация**

In [48]:
%%time
model_CatBR = CatBoostClassifier()
parametrs=[{'learning_rate':[0.45,0.5,0.55],
                              'iterations': [500], 
                              'depth': [7], 
                              'random_state':[2504], 
                              'verbose':[False] 
           }]
search_best_param_CatBR=GridSearchCV(model_CatBR,parametrs,scoring='f1')
search_best_param_CatBR.fit(features_train_down, target_train_down) 

CPU times: user 1h 52min 38s, sys: 34.7 s, total: 1h 53min 12s
Wall time: 1h 53min 25s


GridSearchCV(estimator=<catboost.core.CatBoostClassifier object at 0x7f08eefbb2e0>,
             param_grid=[{'depth': [7], 'iterations': [500],
                          'learning_rate': [0.45, 0.5, 0.55],
                          'random_state': [2504], 'verbose': [False]}],
             scoring='f1')

In [49]:
print(f'Cреднее гармоническое полноты и точности F1')
means =search_best_param_CatBR.cv_results_['mean_test_score']
for mean, params in zip(means, search_best_param_CatBR.cv_results_['params']):
    print("%0.6f for %r"% ((mean), params))
print()
print("Наилучшие найденые параметры:")
print()
print(search_best_param_CatBR.best_params_)

Cреднее гармоническое полноты и точности F1
0.855149 for {'depth': 7, 'iterations': 500, 'learning_rate': 0.45, 'random_state': 2504, 'verbose': False}
0.854421 for {'depth': 7, 'iterations': 500, 'learning_rate': 0.5, 'random_state': 2504, 'verbose': False}
0.855760 for {'depth': 7, 'iterations': 500, 'learning_rate': 0.55, 'random_state': 2504, 'verbose': False}

Наилучшие найденые параметры:

{'depth': 7, 'iterations': 500, 'learning_rate': 0.55, 'random_state': 2504, 'verbose': False}


**Кросс валидация**

In [50]:
%%time
model_CatBR = CatBoostClassifier(learning_rate= search_best_param_CatBR.best_params_['learning_rate'], 
                              iterations= search_best_param_CatBR.best_params_['iterations'],
                              depth=search_best_param_CatBR.best_params_['depth'],
                              random_state= 2504, 
                              verbose=False 
                              )
result_CV_CatBR=cross_val_score(model_CatBR, 
                    features_train_down,
                    target_train_down, 
                    cv=cv_counts, 
                    scoring='f1').mean()
print(f'Cреднее гармоническое полноты и точности F1 = {result_CV_CatBR:.4}, при CV={cv_counts}') # среднеквадратичная ошибка 

Cреднее гармоническое полноты и точности F1 = 0.8491, при CV=3
CPU times: user 18min 48s, sys: 5.48 s, total: 18min 53s
Wall time: 18min 56s


**Проверка наилучшей модели**

In [51]:
start_time = time.time() 
model_CatBR.fit(features_train_down, target_train_down)
lirning_time = time.time() - start_time 

start_time = time.time()
predictions_CatBR = model_CatBR.predict(features_valid_V) 
predicting_time = time.time()- start_time

result_CatBR =f1_score(target_valid, predictions_CatBR) 
print(f'Cреднее гармоническое полноты и точности F1 = {result_CatBR:.4}')

Cреднее гармоническое полноты и точности F1 = 0.7537


**Сохранение результатов**

In [52]:
models_scors = models_scors.append({'model': 'CatBoostClassifier', 'f1': result_CatBR,
                    'lirning_time': lirning_time, 'predicting_time': predicting_time}, ignore_index=True)

In [53]:
display(models_scors)

Unnamed: 0,model,f1,lirning_time,predicting_time
0,model_logic,0.761947,13.097542,0.005426
1,DecisionTreeClassifier,0.657224,9.25007,0.020988
2,RandomForestClassifier,0.042042,6.703528,0.600744
3,CatBoostClassifier,0.753684,510.451032,0.563545


### Выводы

1. Наилучшее значение метрики f1 было достигнуто моделями  LogisticRegression и CatBoostClassifier, но у логической регрессии меньшее время обучения и предсказания
2. CatBoostClassifier достигает хорошего значения метрики f1, но имеет значительно болшее время обучения. 
3. Для решения задачи выберем модель показавшей наилучшее значение метрики f1, т.е. модель  LogisticRegression
4. Даун семплинг и выравнивание классов помгло улучшить метрику f1 и уменьшить время обучения 

**Тест**

In [24]:
predictions_logic_1 = model_logic.predict(features_test_V)
result_logic_1 =f1_score(target_test, predictions_logic_1)
print(f'Cреднее гармоническое полноты и точности F1 = {result_logic_1:.4}')

Cреднее гармоническое полноты и точности F1 = 0.7531


1. Модель  LogisticRegression позволяет достичь требуемого значения метрики f1
2. Модель  LogisticRegression имеет не большое время обучения и предсказания
3. При более тонкой настройке модель CatBoostClassifier может позволить получить более хорошие результаты в перспективе.