## Загрузка данных

In [1]:
!pip install torch
!pip install transformers
!pip install lightgbm
!pip install torch

In [32]:
import string
import pandas as pd
import numpy as np
import nltk
import re
import torch
import transformers
from tqdm import notebook 
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from torch.nn.utils.rnn import pad_sequence

Загрузим данные и посмотрим на них

In [33]:
data = pd.read_csv('/datasets/toxic_comments.csv')

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


Видим примерно 160000 записей

In [35]:
data['toxic'].value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

Классы несбалансированы

Поскольку вычисления могут идти очень долго, возьмем примерно по 1% записей каждого из классов (пробовал брать 20000 записей, примерно время было 56 часов.)

In [36]:
pos_data = data.query('toxic == 1').sample(170, random_state=42)
neg_data = data.query('toxic == 0').sample(1400, random_state=42)
data_sample = pd.concat([pos_data, neg_data])
data_sample

Unnamed: 0,text,toxic
45045,"""\nI was talking about them running any check ...",1
27198,White Trash\nFuck off you white piece of trash...,1
16664,eat shit get rid of goofs you queers.,1
8158,DUMB ASSES\nIt's was my sockpuppet...joe hazet...,1
142212,"Are all professors of rhetoric pompous, self-c...",1
...,...,...
98074,Images \n\nYou have been uploading copyrighted...,0
157626,"""\n\n Quote \n\nA fairly minor point but do we...",0
151400,'s official site and Hendrick's driver page fo...,0
124061,Yiddish: Asyva (pronunciation) \n\n Asyva (in ...,0


Посмотрим на пример текста из датасета

In [37]:
data_sample.loc[:, 'text'].reset_index(drop=True)[3]

"DUMB ASSES\nIt's was my sockpuppet...joe hazeton... you guys are out of control..... i am making a folder which will be submited to higher athority, one above the GOD KING JIMBO.... I WOULD STRONGLY SUGGEST YOU FIGURE IT OUT FAST...I DON'T MR BH will LIKE HIS FULL NAME PUBLISH ON WIKIPEDIA AS A NON-NOTABLE WITh charges of slander and libel forthwith...  JOEHAZELTON...PS I DON't KNOW BRYAN OR DINO"

Избавимся от цифр, знаков пунктуации и других ненужных символов

In [39]:
def remove_punctuation(text):
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    new_words = tokenizer.tokenize(text)
    result  = ' '.join(new_words)
    result = re.sub(r'[0-9]+', '', result)
    result = result.strip()
    return result

In [40]:
data_sample['text'] = data_sample['text'].apply(remove_punctuation)

Посмотрим на результаты обработки на примере того же текста

In [41]:
data_sample.loc[:, 'text'].reset_index(drop=True)[3]

'DUMB ASSES It s was my sockpuppet joe hazeton you guys are out of control i am making a folder which will be submited to higher athority one above the GOD KING JIMBO I WOULD STRONGLY SUGGEST YOU FIGURE IT OUT FAST I DON T MR BH will LIKE HIS FULL NAME PUBLISH ON WIKIPEDIA AS A NON NOTABLE WITh charges of slander and libel forthwith JOEHAZELTON PS I DON t KNOW BRYAN OR DINO'

Используем встроенный в BERT токенизатор

In [43]:
tokenizer = transformers.BertTokenizer(vocab_file='vocab.txt')

Посколько мы используем уже обученную модель и мы не можем её переобучить, мы должны выбрать те записи, в которых число токенов не превышает 512

In [44]:
max_seq_length = 512

In [45]:
tokenized = data_sample['text'].apply(lambda x: tokenizer.encode(x, 
                                                        truncation=True, 
                                                        max_length=max_seq_length, 
                                                        add_special_tokens=True))

In [46]:
padded = pad_sequence([torch.as_tensor(seq) for seq in tokenized], batch_first=True)

In [48]:
padded = np.array(padded)

In [49]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

Укажем файл конфига и обученной модели

In [52]:
config = transformers.BertConfig.from_pretrained('bert_config.json')
model = transformers.BertModel.from_pretrained('bert-base-uncased', config=config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Сформируем эмбеддинги

In [21]:
batch_size = 2
embeddings = []
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        
        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
        embeddings.append(batch_embeddings[0][:,0,:].numpy())

  0%|          | 0/785 [00:00<?, ?it/s]

объединим признаки в один массив

In [22]:
features = np.concatenate(embeddings)

Разделим данные на обучающую и тестовую выборки

In [23]:
features_train, features_test, y_train, y_test = train_test_split(features, data_sample['toxic'], test_size=0.2)

## Обучение

На данном шаге обучим три различных модели и посмотрим значение метрики F<sub>1</sub>

In [24]:
model = LogisticRegression(max_iter=1000)
lr_score = cross_val_score(model, features, data_sample['toxic'], cv=5, scoring='f1')

In [25]:
print(f'Значение метрики F1 для логистической регрессии: {max(lr_score):.2f}')

Значение метрики F1 для логистической регрессии: 0.76


In [26]:
param_grid_rf = [
    {
        'max_depth': [i for i in range(1, 10, 2)],
        'min_samples_split': [i for i in range(2, 10, 2)]
    }
]

In [27]:
rf_model = GridSearchCV(RandomForestClassifier(random_state=42, n_estimators=100), param_grid_rf, scoring='f1', cv=5)
rf_pipeline = Pipeline(steps=[('classifier', rf_model)])
rf_pipeline.fit(features_train, y_train)

Pipeline(steps=[('classifier',
                 GridSearchCV(cv=5,
                              estimator=RandomForestClassifier(random_state=42),
                              param_grid=[{'max_depth': [1, 3, 5, 7, 9],
                                           'min_samples_split': [2, 4, 6, 8]}],
                              scoring='f1'))])

In [28]:
rf_pred = rf_pipeline.predict(features_test)
rf_model_score = f1_score(y_test, rf_pred)
print(f'Значение метрики F1 для случайного леса: {rf_model_score}')

Значение метрики F1 для случайного леса: 0.20408163265306123


In [29]:
param_grid_lgbm = [
    {
        'max_depth': [i for i in range(1, 6, 2)],
        'learning_rate': [0.1, 0.01, 0.2]
    }
]

In [30]:
lgbm_model = GridSearchCV(LGBMClassifier(random_state=42), param_grid_lgbm, scoring='f1', cv=5)
lgbm_pipeline = Pipeline(steps=[('classifier', lgbm_model)])
lgbm_pipeline.fit(features_train, y_train)

Pipeline(steps=[('classifier',
                 GridSearchCV(cv=5, estimator=LGBMClassifier(random_state=42),
                              param_grid=[{'learning_rate': [0.1, 0.01, 0.2],
                                           'max_depth': [1, 3, 5]}],
                              scoring='f1'))])

In [31]:
lgbm_pred = lgbm_pipeline.predict(features_test)
lgbm_model_score = f1_score(y_test, lgbm_pred)
print(f'Значение метрики F1 для модели из lightGBM: {lgbm_model_score}')

Значение метрики F1 для модели из lightGBM: 0.47457627118644075


## Выводы

Если использовать больше данных из датасета, возможно мы получим немного другие результаты на других моделях, но с текущей выборкой логистическая регрессия обучается быстрее всех и даёт лучший результат.Поэтому выберем модель логистической регрессии.  
Значение метрики F<sub>1</sub>: 0.76
