<a href="https://colab.research.google.com/github/Radzon/Toxic_comments_detection/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
!pip install tensorflow



In [35]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.utils import resample

In [37]:
def pre_data_processing(filepath):
  Comments = []
  with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
          parts = line.split(' ', 1)
          if len(parts) != 2:
                continue
          label, text = parts
          if '__label__NORMAL' == label:
            Comments.append((text, 0))
          else:
            Comments.append((text, 1))
  return pd.DataFrame(Comments, columns=['text', 'label'])

In [38]:
# обработка данных в формат датафрейм
df = pre_data_processing('./dataset.txt')

In [None]:
df['ladel'].shape()

In [39]:
# уравнивание двух классов по меньшему классу 
df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=len(df_minority),
                                   random_state=42)

df_balanced = pd.concat([df_majority_downsampled, df_minority])

In [None]:
# настройка токенайзера 
maxWordsCount = 3000
tokenizer = Tokenizer(num_words=maxWordsCount, filters='!–"—#$%&amp;()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r«»', lower=True, split=' ', char_level=False)
tokenizer.fit_on_texts(df_balanced['text'])

In [41]:
# превращение комментариев в вектор с 0 место пропусков (если предложение маленькое)
max_text_len = 16
data = tokenizer.texts_to_sequences(df_balanced['text'])
data_pad = pad_sequences(data, maxlen=max_text_len)
print(data_pad)

[[  43   18  119 ...  216  557  553]
 [   0    0    0 ...  405 3341 1040]
 [   0    0    0 ...  501  591   23]
 ...
 [   0    0    0 ... 1891  124  911]
 [   0    0    0 ...  191 2081  286]
 [   0    0    0 ... 2662   16    3]]


In [42]:
# переименование текста и значений
X = data_pad
Y = np.array(df_balanced['label'])

In [43]:
# случайное перемешивание примеров
indeces = np.random.choice(X.shape[0], size=X.shape[0], replace=False)
X = X[indeces]
Y = Y[indeces]

In [44]:
# разбивка на трейн и тест
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)

In [45]:
# создание модели
model = Sequential()
model.add(Embedding(maxWordsCount, 128, input_length=max_text_len))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=Adam(0.0001))

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 16, 128)           512000    
                                                                 
 bidirectional_2 (Bidirecti  (None, 16, 256)           263168    
 onal)                                                           
                                                                 
 dropout_2 (Dropout)         (None, 16, 256)           0         
                                                                 
 bidirectional_3 (Bidirecti  (None, 128)               164352    
 onal)                                                           
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                

In [46]:
# сохранение лучших весов по метрике val_accuracy
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', save_best_only=True, verbose=1)

# предварительная остановка обучения если точность на валидации не изменяется
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, verbose=1, restore_best_weights=True)

In [47]:
# обучение
history = model.fit(X_train, Y_train, batch_size=32, epochs=20, validation_data=(X_test, Y_test), callbacks=[checkpoint, early_stopping])

Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.89344, saving model to best_model.h5
Epoch 2/20


  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.89344 to 0.89516, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_accuracy improved from 0.89516 to 0.89643, saving model to best_model.h5
Epoch 4/20
Epoch 4: val_accuracy improved from 0.89643 to 0.89650, saving model to best_model.h5
Epoch 5/20
Epoch 5: val_accuracy did not improve from 0.89650
Epoch 6/20
Epoch 6: val_accuracy did not improve from 0.89650
Epoch 7/20
Epoch 7: val_accuracy did not improve from 0.89650
Restoring model weights from the end of the best epoch: 4.
Epoch 7: early stopping


In [48]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

def sequence_to_text(list_of_indices):
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

In [49]:
# в переменную t можно  записать свое предложение для проверки
t = "закрой свой рот".lower()
data = tokenizer.texts_to_sequences([t])
t = pad_sequences(data, maxlen=max_text_len)
print( sequence_to_text(data[0]) )

['закрой', 'свой', 'рот']


In [50]:
# проверка предложения
res = model.predict(t)
print(res, np.argmax(res), sep='\n')

[[0.99381876]]
0


In [51]:
# оценка всей моодели на тесте
Y_pred = model.predict(X_test)

Y_pred_classes = (Y_pred > 0.5).astype(int).flatten()

precision = precision_score(Y_test, Y_pred_classes)
recall = recall_score(Y_test, Y_pred_classes)

print(f'Precision: {precision}')
print(f'Recall: {recall}')

Precision: 0.9331928003891682
Recall: 0.8553804994054697
