In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from nltk import word_tokenize, WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense, Bidirectional, GlobalMaxPool1D, LSTM

from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

from prepare_data import DataPreprocessor



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Radosz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Radosz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
prepData = DataPreprocessor("jigsaw-toxic-comment-classification-challenge/train.csv")

prepData.load_data()
prepData.preprocess_data()
X, y = prepData.get_XY()


                      id                                       comment_text  \
0       0000997932d777bf  explanation\nwhy the edits made under my usern...   
1       000103f0d9cfb60f  d'aww! he matches this background colour i'm s...   
2       000113f07ec002fd  hey man, i'm really not trying to edit war. it...   
3       0001b41b1c6bb37e  "\nmore\ni can't make any real suggestions on ...   
4       0001d958c54c6e35  you, sir, are my hero. any chance you remember...   
...                  ...                                                ...   
159566  ffe987279560d7ff  ":::::and for the second time of asking, when ...   
159567  ffea4adeee384e90  you should be ashamed of yourself \n\nthat is ...   
159568  ffee36eab5c267c9  spitzer \n\numm, theres no actual article for ...   
159569  fff125370e4aaaf3  and it looks like it was actually you who put ...   
159570  fff46fc426af1f9a  "\nand ... i really don't think you understand...   

        toxic  severe_toxic  obscene  threat  insul

In [3]:
# Zmniana wartości klas w binarne
mlb = LabelBinarizer()
y = mlb.fit_transform(y)

# Tokenizacja danych tekstowych
max_words = 20000
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_len)

# Podział metodą train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)




In [4]:
# CNN
inputs = Input(shape=(max_len,))
embedding = Embedding(input_dim=max_words, output_dim=128, input_length=max_len)(inputs)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
pool1 = MaxPooling1D(pool_size=2)(conv1)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(pool1)
pool2 = GlobalMaxPooling1D()(conv2)
dense1 = Dense(128, activation='relu')(pool2)
dropout = Dropout(0.5)(dense1)
outputs = Dense(6, activation='sigmoid')(dropout)
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define callbacks
checkpoint = ModelCheckpoint('best_model_cnn.keras', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', verbose=1)

# Train the model
history_cnn = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=[checkpoint, early_stop])

# Evaluate the model on the testing set
evaluation_cnn = model.evaluate(X_test, y_test)
test_accuracy_cnn = evaluation_cnn[1]
print("Test Accuracy:", test_accuracy_cnn)

Epoch 1/10




[1m797/798[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - accuracy: 0.6754 - loss: 0.1451
Epoch 1: val_loss improved from inf to 0.05243, saving model to best_model_cnn.keras
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 34ms/step - accuracy: 0.6758 - loss: 0.1450 - val_accuracy: 0.9943 - val_loss: 0.0524
Epoch 2/10
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9891 - loss: 0.0504
Epoch 2: val_loss improved from 0.05243 to 0.05160, saving model to best_model_cnn.keras
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.9891 - loss: 0.0504 - val_accuracy: 0.9943 - val_loss: 0.0516
Epoch 3/10
[1m797/798[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - accuracy: 0.9867 - loss: 0.0409
Epoch 3: val_loss did not improve from 0.05160
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 32ms/step - accuracy: 0.9867 - loss: 0.0409 - val_accu

In [7]:
print(y_train.shape)

(127656, 6)


In [8]:
inp = Input(shape = (max_len, ))
layer = Embedding(input_dim=max_words, output_dim=128, input_length=max_len)(inp)
layer = Bidirectional(LSTM(50, return_sequences = True, recurrent_dropout = 0.15))(layer)
layer = GlobalMaxPool1D()(layer)
layer = Dropout(0.2)(layer)
layer = Dense(50, activation = 'relu')(layer)
layer = Dropout(0.2)(layer)
layer = Dense(6, activation = 'sigmoid')(layer)
model = Model(inputs = inp, outputs = layer)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
#model.summary()

file_path = 'save_best_model_lstm.keras'
checkpoint = ModelCheckpoint(file_path, monitor = 'val_loss', verbose = 1, save_best_only=True)
early_stop = EarlyStopping(monitor = 'val_loss', patience = 1)

history_lstm = model.fit(X_train, y_train, batch_size = 512, epochs = 10, validation_split = 0.2, callbacks = [checkpoint, early_stop])

# Evaluate the model on the testing set
evaluation_lstm = model.evaluate(X_test, y_test)
test_accuracy_lstm = evaluation_cnn[1]
print("Test Accuracy:", test_accuracy_lstm)







Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 254ms/step - accuracy: 0.4203 - loss: 0.2584
Epoch 1: val_loss improved from inf to 0.05852, saving model to save_best_model_lstm.keras
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 273ms/step - accuracy: 0.4212 - loss: 0.2578 - val_accuracy: 0.9943 - val_loss: 0.0585
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step - accuracy: 0.8603 - loss: 0.0590
Epoch 2: val_loss improved from 0.05852 to 0.04991, saving model to save_best_model_lstm.keras
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 267ms/step - accuracy: 0.8604 - loss: 0.0590 - val_accuracy: 0.9943 - val_loss: 0.0499
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step - accuracy: 0.9052 - loss: 0.0485
Epoch 3: val_loss improved from 0.04991 to 0.04937, saving model to save_best_model_lstm.keras
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m