
Надо обучить модель классифицировать комментарии на позитивные и негативные. В нашем распоряжении набор данных с разметкой о токсичности правок.

Постройте модель со значением метрики качества *F1* не меньше 0.75. 


### Описание данных

Данные находятся в файле `toxic_comments.csv`. Столбец *text* в нём содержит текст комментария, а *toxic* — целевой признак.

# 1. Подготовка

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import catboost
from catboost import CatBoostClassifier

In [2]:
from pymystem3 import Mystem
m = Mystem()
import re
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [3]:
import sys
import spacy

In [4]:
df = pd.read_csv('toxic_comments.csv')

In [5]:
df.head()

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [6]:
df['toxic'].describe()

count    159571.000000
mean          0.101679
std           0.302226
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: toxic, dtype: float64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


### очистка

In [8]:
def clear_text(text):
    document = re.sub(r'\W', ' ', text)
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()

    cleared = document.split()
    return " ".join(cleared)

In [9]:
df['cleared']=df['text'].apply(clear_text)

In [10]:
df['cleared']

0         explanation why the edits made under my userna...
1         d aww he matches this background colour m seem...
2         hey man m really not trying to edit war it jus...
3         more can make any real suggestions on improvem...
4         you sir are my hero any chance you remember wh...
                                ...                        
159566    and for the second time of asking when your vi...
159567    you should be ashamed of yourself that is horr...
159568    spitzer umm theres no actual article for prost...
159569    and it looks like it was actually you who put ...
159570    and really don think you understand came here ...
Name: cleared, Length: 159571, dtype: object

### леммизация

In [11]:

# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])
def lemmitize(col):
    doc = nlp(col)
    return " ".join([token.lemma_ for token in doc])


In [12]:
%%time
df['lemmas'] = df['cleared'].apply(lemmitize)

CPU times: user 10min 34s, sys: 567 ms, total: 10min 34s
Wall time: 10min 34s


### разделение данных

In [13]:
df = df.drop(['text','cleared'],axis=1)

In [14]:
train_and_valid,test = train_test_split(df, test_size = 0.1, random_state = 111)
train, valid =train_test_split(train_and_valid, test_size = 0.15, random_state = 111)

In [15]:
print(train.shape, "train")
print(valid.shape,"valid")
print(test.shape, " test")

(122071, 2) train
(21542, 2) valid
(15958, 2)  test


### векторизация текста

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vitaliymalcev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
%%time
stopwords = set(nltk_stopwords.words('english'))
#orpus = df['cleared'].values.astype('U')
count_tf_idf = TfidfVectorizer(stop_words=stopwords, max_features = 3000, min_df=5, max_df=0.7,)

train_features = count_tf_idf.fit_transform(train['lemmas'])
test_features = count_tf_idf.transform(test['lemmas'])
valid_features = count_tf_idf.transform(valid['lemmas'])


#print("Размер матрицы:", tf_idf.shape)

CPU times: user 8.1 s, sys: 108 ms, total: 8.21 s
Wall time: 8.36 s


In [18]:
print(train_features.shape, "train")
print(valid_features.shape,"valid")
print(test_features.shape, " test")

(122071, 3000) train
(21542, 3000) valid
(15958, 3000)  test


# 2. Обучение

In [19]:
train_target = train['toxic']
test_target = test['toxic']
valid_target = valid['toxic']

In [20]:
%%time
model = LogisticRegression()
model.fit(train_features,train_target)
predictions = model.predict(test_features)

CPU times: user 1.85 s, sys: 0 ns, total: 1.85 s
Wall time: 2.05 s


In [21]:
print("F1 score:",f1_score(test_target, predictions)) #маловато F1 score: 0.7478711588300629

F1 score: 0.7478711588300629


In [22]:
%%time
model2 = DecisionTreeClassifier(max_depth= 120)
model2.fit(train_features,train_target)
predictions2 = model2.predict(test_features)

CPU times: user 42 s, sys: 12 ms, total: 42 s
Wall time: 42 s


In [23]:
# max_depth= 120 F1 score: 0.7137521222410868
print("F1 score:",f1_score(test_target, predictions2)) #маловато

F1 score: 0.7145287512759442


In [24]:
%%time
for i in range(25,90, 5):
    model2 = DecisionTreeClassifier(max_depth= i)
    model2.fit(train_features,train_target)
    predictions2 = model2.predict(valid_features)
    print("F1 score:",f1_score(valid_target, predictions2), "max_depth:", i)
  
#F1 score: 0.7202706219099663 max_depth: 65 лучший, не достаточно

F1 score: 0.67152466367713 max_depth: 25
F1 score: 0.6834115373999449 max_depth: 30
F1 score: 0.6883645240032547 max_depth: 35
F1 score: 0.6969127516778524 max_depth: 40
F1 score: 0.7005559968228753 max_depth: 45
F1 score: 0.7025748817656332 max_depth: 50
F1 score: 0.7070548124836087 max_depth: 55
F1 score: 0.7105538140020897 max_depth: 60
F1 score: 0.7136198860693942 max_depth: 65
F1 score: 0.7156582705790704 max_depth: 70
F1 score: 0.7135497166409067 max_depth: 75
F1 score: 0.7143587123147674 max_depth: 80
F1 score: 0.7133367399080225 max_depth: 85
CPU times: user 4min 4s, sys: 7.96 ms, total: 4min 4s
Wall time: 4min 4s


In [25]:
%%time
Catmodel = CatBoostClassifier(
    iterations=10000,
    random_seed=63,
    learning_rate=0.1,
    early_stopping_rounds=150,
    loss_function='Logloss',
)

Catmodel.fit(
    train_features, train_target,
    eval_set=(valid_features, valid_target),
    verbose=True,
    plot=False
)

0:	learn: 0.5920640	test: 0.5922038	best: 0.5922038 (0)	total: 791ms	remaining: 2h 11m 46s
1:	learn: 0.5152103	test: 0.5155210	best: 0.5155210 (1)	total: 975ms	remaining: 1h 21m 13s
2:	learn: 0.4539772	test: 0.4545628	best: 0.4545628 (2)	total: 1.16s	remaining: 1h 4m 20s
3:	learn: 0.4054827	test: 0.4063255	best: 0.4063255 (3)	total: 1.34s	remaining: 56m 1s
4:	learn: 0.3684176	test: 0.3694589	best: 0.3694589 (4)	total: 1.53s	remaining: 50m 56s
5:	learn: 0.3388325	test: 0.3400989	best: 0.3400989 (5)	total: 1.71s	remaining: 47m 34s
6:	learn: 0.3157110	test: 0.3170790	best: 0.3170790 (6)	total: 1.9s	remaining: 45m 6s
7:	learn: 0.2973143	test: 0.2988394	best: 0.2988394 (7)	total: 2.08s	remaining: 43m 15s
8:	learn: 0.2831577	test: 0.2848313	best: 0.2848313 (8)	total: 2.26s	remaining: 41m 49s
9:	learn: 0.2722691	test: 0.2739388	best: 0.2739388 (9)	total: 2.44s	remaining: 40m 40s
10:	learn: 0.2618269	test: 0.2637083	best: 0.2637083 (10)	total: 2.62s	remaining: 39m 43s
11:	learn: 0.2540207	test

<catboost.core.CatBoostClassifier at 0x7f814b67d590>

In [26]:
prediction3 = Catmodel.predict(test_features)
print("f1:",f1_score(test_target, prediction3))

#f1: 0.763597582651973

#ура, нужный результат достигнут!

f1: 0.763597582651973


### попробуем bert

In [27]:
import torch
import transformers
from keras_bert import load_trained_model_from_checkpoint
from tqdm import notebook

Using TensorFlow backend.


In [28]:
df = pd.read_csv('toxic_comments.csv') #начнем сначала

In [29]:
folder = 'wwm_cased_L-24_H-1024_A-16'
#BERT-Large, Uncased (Whole Word Masking): 24-layer, 1024-hidden, 16-heads, 340M parameters
#https://github.com/google-research/bert

config_path = folder+'/bert_config.json'
checkpoint_path = folder+'/bert_model.ckpt'
vocab_path = folder+'/vocab.txt'
model_meta_path = folder+'/bert_model.ckpt.meta'
model_data_path = folder+'/bert_model.ckpt.data-00000-of-00001'


In [30]:
tokenizer = transformers.BertTokenizer(
    vocab_file=vocab_path, max_position_embeddings = 510)

In [31]:
df['cleared']=df['text'].apply(clear_text) # очистим

In [32]:
tokinized= df['cleared'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

In [33]:
maxlen = 0
for i in tokinized:
    if len(i) > maxlen:
        maxlen =len(i)
print(maxlen) #2502

2502


In [34]:
padded = np.array([i + [0]*(maxlen - len(i)) for i in tokinized.values])
attention_mask = np.where(padded != 0, 1, 0)

In [42]:
config = transformers.BertConfig.from_json_file(config_path)

In [80]:
model = transformers.BertForPreTraining.from_pretrained(checkpoint_path, config = config, from_tf = True)

In [78]:
%%time

batch_size = 100
embeddings = []
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
        print(batch.shape)
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        print(attention_mask_batch.shape)
        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
        embeddings.append(batch_embeddings[0][:,0,:].numpy())
        

# IndexError: index out of range in self. непонятно почему не работает(
        

HBox(children=(FloatProgress(value=0.0, max=1595.0), HTML(value='')))

torch.Size([100, 2502])
torch.Size([100, 2502])



IndexError: index out of range in self

# 3. Выводы

мы добились нужного результата в 0.75F1 используя градиентный бустинг. сложнее всего было справляться с постоянно умирающим кернелом, зато это заставило научиться работать на google cloud platform.