In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Read data
df = pd.read_csv('CommentToxicity-main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv')

# Define input and target variables
X = df['comment_text']
y = df[df.columns[2:]]

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Splitting the dataset into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

In [5]:
# Tokenizing and padding text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [6]:
maxlen = 100  # Adjust according to your maximum sequence length
vocab_size = len(tokenizer.word_index) + 1

X_train_pad = pad_sequences(X_train_seq, padding='post', maxlen=maxlen)
X_val_pad = pad_sequences(X_val_seq, padding='post', maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, padding='post', maxlen=maxlen)

In [8]:
# Creating LSTM model
model = Sequential()
model.add(Embedding(vocab_size, 100)) 
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(y_train.shape[1], activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training LSTM model
model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_val_pad, y_val), callbacks=[EarlyStopping(patience=3)])


Epoch 1/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m522s[0m 297ms/step - accuracy: 0.9863 - loss: 0.1541 - val_accuracy: 0.9923 - val_loss: 0.0899
Epoch 2/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m490s[0m 280ms/step - accuracy: 0.9935 - loss: 0.0720 - val_accuracy: 0.9940 - val_loss: 0.0524
Epoch 3/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m535s[0m 306ms/step - accuracy: 0.9943 - loss: 0.0465 - val_accuracy: 0.9940 - val_loss: 0.0520
Epoch 4/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m462s[0m 264ms/step - accuracy: 0.9940 - loss: 0.0401 - val_accuracy: 0.9940 - val_loss: 0.0513
Epoch 5/5
[1m1746/1746[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m423s[0m 242ms/step - accuracy: 0.9931 - loss: 0.0346 - val_accuracy: 0.9939 - val_loss: 0.0533


<keras.src.callbacks.history.History at 0x185b1956090>

In [11]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [9]:
# Making predictions
input_text = ["you black fuck"]
input_text_seq = tokenizer.texts_to_sequences(input_text)
input_text_pad = pad_sequences(input_text_seq, padding='post', maxlen=maxlen)
predictions = model.predict(input_text_pad)
print("Predictions:", predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step
Predictions: [[0.99584085 0.27822772 0.9673341  0.04030428 0.8151385  0.12813687]]


In [10]:
y_val_pred = model.predict(X_val_pad)
y_test_pred = model.predict(X_test_pad)

y_val_pred_class = (y_val_pred > 0.5).astype(int)
y_test_pred_class = (y_test_pred > 0.5).astype(int)

precision = precision_score(y_test, y_test_pred_class, average='micro')
recall = recall_score(y_test, y_test_pred_class, average='micro')
accuracy = accuracy_score(y_test, y_test_pred_class)
print(f'Test Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}')

[1m1048/1048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 22ms/step
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step
Test Precision: 0.7955719557195572, Recall: 0.6805555555555556, Accuracy: 0.9179083693078959
