In [317]:
import pandas as pd
import numpy as np
import torch
import transformers
from sklearn.model_selection import train_test_split
from tqdm import notebook


<h1>Содержание<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Подготовка" data-toc-modified-id="Подготовка-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Подготовка</a></span></li><li><span><a href="#Обучение" data-toc-modified-id="Обучение-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Обучение</a></span></li><li><span><a href="#Выводы" data-toc-modified-id="Выводы-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Выводы</a></span></li><li><span><a href="#Чек-лист-проверки" data-toc-modified-id="Чек-лист-проверки-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Чек-лист проверки</a></span></li></ul></div>

# Проект для «Викишоп» c BERT


Интернет-магазин «Викишоп» запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию. 

Обучите модель классифицировать комментарии на позитивные и негативные. В вашем распоряжении набор данных с разметкой о токсичности правок.

Постройте модель со значением метрики качества *F1* не меньше 0.75. 

**Инструкция по выполнению проекта**

1. Загрузите и подготовьте данные.
2. Обучите разные модели. 
3. Сделайте выводы.

Для выполнения проекта применять *BERT* необязательно, но вы можете попробовать.

**Описание данных**

Данные находятся в файле `toxic_comments.csv`. Столбец *text* в нём содержит текст комментария, а *toxic* — целевой признак.

## Подготовка

In [318]:
try:
    data = pd.read_csv('../datasets/toxic_comments.csv', index_col=[0], parse_dates=[0])

except:
    data = pd.read_csv('https://code.s3.yandex.net/datasets/toxic_comments.csv', index_col=[0], parse_dates=[0])
data

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159446,""":::::And for the second time of asking, when ...",0
159447,You should be ashamed of yourself \n\nThat is ...,0
159448,"Spitzer \n\nUmm, theres no actual article for ...",0
159449,And it looks like it was actually you who put ...,0


In [319]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159292 entries, 0 to 159450
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159292 non-null  object
 1   toxic   159292 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.6+ MB


In [320]:
# features = data.drop('toxic', axis=1)
# target = data['toxic']
data = data.sample(n=400).reset_index(drop=True)
data

Unnamed: 0,text,toxic
0,"""\nIt is speculation it will be at this PPV st...",0
1,"""\n\nI agree with 100-200,000 in infobox, but ...",0
2,"""\nIt probably labelled it because it is a sma...",0
3,"""\nSo, maybe the sentence could be something l...",0
4,"Nonetheless, in this one instance, I would lik...",0
...,...,...
395,"""\n\n Please stop your disruptive editing. If ...",0
396,The more serious perspective obviously wasn't ...,0
397,"""\n\n Help \n\nHello. I need your help with so...",0
398,excuse me? I have been too busy fighting vanda...,0


In [321]:
# features_train, target_train, features_test, target_test = train_test_split(features, target, test_size=0.2, random_state=12345, stratify=target)
# features_train
# data_tmp = data
#
# for index, sentence in enumerate(data_tmp['text']):
#     if len(sentence) < 500:
#         data_tmp = data_tmp.drop(index=index)
#         print(index)
#
# data_tmp


In [322]:
data = data[data['text'].apply(lambda x: len(x) < 501)]
data = data.reset_index(drop=True)
data

Unnamed: 0,text,toxic
0,"""\nIt is speculation it will be at this PPV st...",0
1,"""\n\nI agree with 100-200,000 in infobox, but ...",0
2,"""\nSo, maybe the sentence could be something l...",0
3,"Nonetheless, in this one instance, I would lik...",0
4,"""\nAaaww and Thank you also for the 'Millionai...",0
...,...,...
311,You support the president!?\nDo you like the s...,1
312,"""\n\n Please stop your disruptive editing. If ...",0
313,The more serious perspective obviously wasn't ...,0
314,excuse me? I have been too busy fighting vanda...,0


In [323]:
tokenizer = transformers.BertTokenizer(
    vocab_file='../materials/vocab.txt')

tokenized = data['text'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True))

In [324]:

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
max_len

135

In [325]:
# max_len = 139
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

In [326]:
padded.shape

(316, 135)

In [327]:
config = transformers.BertConfig.from_json_file(
    '../materials/config.json')
model = transformers.BertModel.from_pretrained(
    '../materials/pytorch_model.bin', config=config)

Some weights of the model checkpoint at ../materials/pytorch_model.bin were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [328]:
batch_size = 100
embeddings = []

for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
    batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)])
    attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])

    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    embeddings.append(batch_embeddings[0][:,0,:].numpy())

  0%|          | 0/3 [00:00<?, ?it/s]

In [329]:
features = np.concatenate(embeddings)
features

array([[-0.25125277, -0.28172967,  0.22037637, ..., -0.5721744 ,
         0.6111305 ,  0.59888893],
       [ 0.06367791, -0.02704437,  0.29600024, ..., -0.21421309,
         0.7872639 ,  0.5366682 ],
       [ 0.02140377,  0.05191457, -0.46557865, ..., -0.60925996,
         0.6501118 ,  0.6718733 ],
       ...,
       [-0.32522666, -0.08437072, -0.00996763, ..., -0.73030025,
         0.6971986 ,  0.78085023],
       [-0.3590529 , -0.14590898, -0.08348237, ..., -0.5665538 ,
         0.65368783,  0.21025373],
       [ 0.36745885,  0.01070492, -0.05497965, ..., -0.20728862,
         0.44282463,  0.5527018 ]], dtype=float32)

## Обучение

In [333]:
data['toxic']

0      0
1      0
2      0
3      0
4      0
      ..
311    1
312    0
313    0
314    0
315    0
Name: toxic, Length: 316, dtype: int64

In [334]:
data['toxic']

0      0
1      0
2      0
3      0
4      0
      ..
311    1
312    0
313    0
314    0
315    0
Name: toxic, Length: 316, dtype: int64

In [332]:
features_train, target_train, features_test, target_test = train_test_split(features, data['toxic'], test_size=0.2, random_state=12345, stratify=data['toxic'])


ValueError: Found input variables with inconsistent numbers of samples: [300, 316]

## Выводы

## Чек-лист проверки

- [x]  Jupyter Notebook открыт
- [ ]  Весь код выполняется без ошибок
- [ ]  Ячейки с кодом расположены в порядке исполнения
- [ ]  Данные загружены и подготовлены
- [ ]  Модели обучены
- [ ]  Значение метрики *F1* не меньше 0.75
- [ ]  Выводы написаны