## Елисеев Е.В.
## Нейросеть классификации характера токсичных комментариев

Данные взяты с <a href="https://www.kaggle.com/datasets/reihanenamdari/youtube-toxicity-data/data">kaggle<a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy 
from tensorflow.keras.metrics import Accuracy
from sklearn.feature_extraction.text import TfidfVectorizer
import sys


data = pd.read_csv("data/youtoxic_english_1000.csv")
data.head()

2023-12-21 22:02:53.708619: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-21 22:02:53.708659: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-21 22:02:53.709251: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-21 22:02:53.714950: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,CommentId,VideoId,Text,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism
0,Ugg2KwwX0V8-aXgCoAEC,04kJtp6pVXI,If only people would just take a step back and...,False,False,False,False,False,False,False,False,False,False,False,False
1,Ugg2s5AzSPioEXgCoAEC,04kJtp6pVXI,Law enforcement is not trained to shoot to app...,True,True,False,False,False,False,False,False,False,False,False,False
2,Ugg3dWTOxryFfHgCoAEC,04kJtp6pVXI,\nDont you reckon them 'black lives matter' ba...,True,True,False,False,True,False,False,False,False,False,False,False
3,Ugg7Gd006w1MPngCoAEC,04kJtp6pVXI,There are a very large number of people who do...,False,False,False,False,False,False,False,False,False,False,False,False
4,Ugg8FfTbbNF8IngCoAEC,04kJtp6pVXI,"The Arab dude is absolutely right, he should h...",False,False,False,False,False,False,False,False,False,False,False,False


In [2]:
# Удалям некоторые стоблцы чтобы облегчить классификацию
data.drop(["CommentId", "VideoId", "IsRadicalism", "IsHomophobic", "IsSexist", "IsNationalist", "IsReligiousHate"], axis = 1, inplace = True)
data

Unnamed: 0,Text,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist
0,If only people would just take a step back and...,False,False,False,False,False,False,False
1,Law enforcement is not trained to shoot to app...,True,True,False,False,False,False,False
2,\nDont you reckon them 'black lives matter' ba...,True,True,False,False,True,False,False
3,There are a very large number of people who do...,False,False,False,False,False,False,False
4,"The Arab dude is absolutely right, he should h...",False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
995,I remember that they sent in the national defe...,False,False,False,False,False,False,False
996,Stats don`t represent the problem. Race baitin...,True,False,False,False,False,True,True
997,The quote from the mother... Wow that hit hard...,False,False,False,False,False,False,False
998,this video is so racist,False,False,False,False,False,False,False


In [3]:
min_df = 8  # Минимальная частота для включения
max_df = 0.8 # Максимальная частота для включения
max_features = 2500 # Максимально возможное число признаков

texts = data["Text"]
labels = data.drop(["Text"], axis = 1)

# Векторизация данных с помощью TF-IDF - оценка важности слова
vectorizer = TfidfVectorizer(max_features=max_features, min_df=min_df, max_df=max_df)
vectorizer.fit(texts)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, 
    labels, 
    test_size=0.2
)

train_texts = vectorizer.transform(train_texts).toarray()
test_texts = vectorizer.transform(test_texts).toarray()
input_shape = [train_texts.shape[1]]

num_classes = labels.shape[1]

# Получение размера изображений в памяти в мб
def get_size(var):
    return round(sys.getsizeof(var) / 1_000_000, 2)


print(f"Количество данных для обучения: {len(train_texts)}, {get_size(train_texts)} мб")
print(f"Количество данных для тестирования: {len(test_texts)}, {get_size(test_texts)} мб")

Количество данных для обучения: 800, 3.47 мб
Количество данных для тестирования: 200, 0.87 мб


In [20]:
from tensorflow.keras.optimizers import RMSprop

# Архитектура сети
model = Sequential([
    # Слой нормализации входных признаков
    BatchNormalization(input_shape=input_shape),
    Dense(256, activation="relu"),
    BatchNormalization(),
    Dropout(0.5),

    Dense(256, activation="relu"),
    BatchNormalization(),
    Dropout(0.5),
    
    Dense(128, activation="relu"),
    BatchNormalization(),
    Dropout(0.5),
    
    Dense(64, activation="relu"),
    BatchNormalization(),
    Dropout(0.5),

    Dense(num_classes, activation="sigmoid")
])

model.compile(optimizer=RMSprop(), loss=BinaryCrossentropy(), metrics=["binary_accuracy"])

epochs=15
batch_size=20
validation_split=0.2

model.fit(train_texts, train_labels, epochs=epochs, batch_size=batch_size, validation_split=validation_split)

evaluation = model.evaluate(test_texts, test_labels)

print("Test Loss:", evaluation[0])
print("Test Accuracy:", evaluation[1])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 0.35645022988319397
Test Accuracy: 0.8557143211364746


In [22]:
# Проверка своего комментария
def check_comment(string):
    prediction = model.predict(vectorizer.transform([string]).toarray())
    
    for i, j in zip(labels.columns.tolist(), prediction[0].tolist()):
        print(i," : " , j > 0.5)

In [25]:
check_comment("Hello guys, my name is Egor")



IsToxic  :  False
IsAbusive  :  False
IsThreat  :  False
IsProvocative  :  False
IsObscene  :  False
IsHatespeech  :  False
IsRacist  :  False


In [26]:
check_comment("Author, fuck yourself")

IsToxic  :  True
IsAbusive  :  True
IsThreat  :  False
IsProvocative  :  False
IsObscene  :  False
IsHatespeech  :  False
IsRacist  :  False


In [33]:
check_comment("Don't like this")

IsToxic  :  False
IsAbusive  :  False
IsThreat  :  False
IsProvocative  :  False
IsObscene  :  False
IsHatespeech  :  False
IsRacist  :  False
