In [1]:
import pandas as pd
import tensorflow as tf
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, GRU, LSTM, Dense, Bidirectional, Dropout
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from keras import backend
import re
import string
import nltk
from nltk.corpus import stopwords

I start by importing the dataset that contains eight columns. The first one contains the comments to be preprocessed, the following six are labels indicating the presence or absence of a certain feature of the comment, a “clean” comment will have all labels of value 0. Finally, the last column contains the sum of the labels found in each comment. 

In [2]:
df = pd.read_csv('Filter_Toxic_Comments_dataset.csv')
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,sum_injurious
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0


I assign to X the comments and to y the values of each label belonging to the comment

In [3]:
X = df['comment_text'].values
y = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

Let's create train and test set

In [4]:
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

Let's create functions to preprocess the text, in the end a text will be returned that is free of special characters, stopwords (such as articles and prepositions) and already tokenized. 

In [5]:
def preprocess_text(text):
    """
    Prende in ingresso del testo, rimuove i caratteri speciali e lo converte in minuscolo.
    Restituisce poi il testo preprocessato.
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

def remove_stopwords(tokens):
    """
    Prende in ingresso dei token ed elimina le stopwords. Restituisce i token "filtrati"
    """
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

def preprocess_pipeline(text):
    """
    Applica tutte le funzioni di preprocessing e restituisce dei token "puliti"
    """
    text = preprocess_text(text)
    tokens = nltk.word_tokenize(text)
    filtered_tokens = remove_stopwords(tokens)
    return filtered_tokens

I apply preprocessing separately to the train and test set to avoid data leakage.

In [6]:
X_train_preprocessed = [preprocess_pipeline(text) for text in X_train]

X_test_preprocessed = [preprocess_pipeline(text) for text in X_test]

Let's create a tokenizer to convert tokens into numeric sequences. With the parameter num_words=10000 I indicate that the tokenizer will maintain a vocabulary of the 10,000 most frequent words in the text data

In [7]:
num_words = 10000
tokenizer = Tokenizer(num_words=10000)

In [8]:
tokenizer.fit_on_texts(X_train_preprocessed)

Applying tokenizer on train set and test set

In [9]:
train_sequences= tokenizer.texts_to_sequences(X_train_preprocessed)
test_sequences= tokenizer.texts_to_sequences(X_test_preprocessed)

Here we calculate vocabulary_size which is needed to correctly define the Embedding layer that maps words into dense vectors, and maxlen which is important to perform padding of sequences so that they all have the same length.

In [10]:
vocabulary_size = len(tokenizer.word_index)+1
maxlen = len(max(train_sequences,key=len))

Let's apply padding

In [11]:
padded_train_sequences = pad_sequences(train_sequences,maxlen=maxlen)
padded_test_sequences = pad_sequences(test_sequences,maxlen=maxlen)

I create a callback so that I stop training a model if performance does not improve for a number of epochs. It will go to monitor val_loss and val_accuracy, if they do not improve for two epochs in a row the training ends. Restore_best_weights=True causes the model weights to be restored to the values that got the best val_loss before the termination.

In [12]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2, 
                               restore_best_weights=True)

We will now go on to compare three types of models: GRU (Gated Recurrent Unit), LSTM(Long short-term memory) and Bidirectional LSTM.

GRU

Here I create a sequential model. 
First we add an Embedding layer that projects each word into a dense vector of size 128. 
Then we add a GRU layer with 64 units and tanh activation function. This is the main recurrent layer that will process the embedded text sequences.
Then we add a Dropout layer with a rate of 0.5 to help prevent overfitting during training. Finally we add a dense layer with 6 output nodes and sigmoid activation. We will use this construction pattern for the other two models as well.

In [13]:
backend.clear_session()
model_GRU = Sequential()
model_GRU.add(Embedding(vocabulary_size, 128))
model_GRU.add(GRU(64,activation='tanh'))
model_GRU.add(Dropout(0.5))
model_GRU.add(Dense(6,activation='sigmoid'))
model_GRU.build(input_shape=(None, maxlen))
model_GRU.summary()

2024-05-31 13:43:13.329383: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-05-31 13:43:13.329412: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-05-31 13:43:13.329415: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-05-31 13:43:13.329715: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-31 13:43:13.329730: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Let's compile the model using rmsprop as the optimizer and binary_crossentropy as the loss function since we are dealing with a binary classification, we choose accuracy as the metric to be evaluated during model training.

In [14]:
model_GRU.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])

First let's train the model for five epochs

In [15]:
model_GRU.fit(padded_train_sequences,y_train, 
                      validation_data=(padded_test_sequences,y_test), 
                      epochs= 5, batch_size=256, callbacks=[early_stopping])

Epoch 1/5


2024-05-31 13:43:13.992500: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 543ms/step - accuracy: 0.6535 - loss: 0.2115 - val_accuracy: 0.9867 - val_loss: 0.0737
Epoch 2/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 479ms/step - accuracy: 0.9319 - loss: 0.0705 - val_accuracy: 0.9928 - val_loss: 0.0587
Epoch 3/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 462ms/step - accuracy: 0.9317 - loss: 0.0584 - val_accuracy: 0.9928 - val_loss: 0.0551
Epoch 4/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 466ms/step - accuracy: 0.9433 - loss: 0.0548 - val_accuracy: 0.9897 - val_loss: 0.0542
Epoch 5/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 471ms/step - accuracy: 0.9358 - loss: 0.0521 - val_accuracy: 0.9928 - val_loss: 0.0537


<keras.src.callbacks.history.History at 0x31ea933d0>

Then let's evaluate the model

In [16]:
model_GRU.evaluate(padded_test_sequences, y_test)

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 28ms/step - accuracy: 0.9932 - loss: 0.0526


[0.05373097583651543, 0.9927933812141418]

LSTM

In [17]:
backend.clear_session()
model_LSTM = Sequential()
model_LSTM.add(Embedding(vocabulary_size, 128))
model_LSTM.add(LSTM(64,activation='tanh'))
model_LSTM.add(Dropout(0.5))
model_LSTM.add(Dense(6,activation='sigmoid'))
model_LSTM.build(input_shape=(None, maxlen))
model_LSTM.summary()

In [18]:
model_LSTM.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics = ['accuracy'])

In [19]:
model_LSTM.fit(padded_train_sequences,y_train, 
                      validation_data=(padded_test_sequences,y_test), 
                      epochs= 5, batch_size=256, callbacks=[early_stopping])

Epoch 1/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m377s[0m 754ms/step - accuracy: 0.5725 - loss: 0.1886 - val_accuracy: 0.9869 - val_loss: 0.0817
Epoch 2/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m491s[0m 984ms/step - accuracy: 0.9538 - loss: 0.0815 - val_accuracy: 0.9936 - val_loss: 0.0592


<keras.src.callbacks.history.History at 0x3177881d0>

In [20]:
model_LSTM.evaluate(padded_test_sequences, y_test)

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 28ms/step - accuracy: 0.9873 - loss: 0.0798


[0.08167001605033875, 0.9869027137756348]

BIDIRECTIONAL LSTM

In [21]:
backend.clear_session()
model_BiLSTM = Sequential()
model_BiLSTM.add(Embedding(vocabulary_size, 128))
model_BiLSTM.add(Bidirectional(LSTM(64,activation='tanh')))
model_BiLSTM.add(Dropout(0.5))
model_BiLSTM.add(Dense(6,activation='sigmoid'))
model_BiLSTM.build(input_shape=(None, maxlen))
model_BiLSTM.summary()

In [22]:
model_BiLSTM.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics = ['accuracy'])

In [23]:
model_BiLSTM.fit(padded_train_sequences,y_train, 
                      validation_data=(padded_test_sequences,y_test), 
                      epochs= 5, batch_size=256, callbacks=[early_stopping])

Epoch 1/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m570s[0m 1s/step - accuracy: 0.7176 - loss: 0.1706 - val_accuracy: 0.9936 - val_loss: 0.0886
Epoch 2/5
[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m620s[0m 1s/step - accuracy: 0.9824 - loss: 0.0764 - val_accuracy: 0.9937 - val_loss: 0.0618


<keras.src.callbacks.history.History at 0x38e4eb350>

In [24]:
model_BiLSTM.evaluate(padded_test_sequences, y_test)

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 48ms/step - accuracy: 0.9940 - loss: 0.0867


[0.08855487406253815, 0.993639349937439]

We'll use the last trained model. To test its operation we created a function that prints on screen the comment, the labels assigned to it, and the labels predicted by the model

In [25]:
def print_comment_and_labels(X_test, y_test, y_pred, index):
    print("Commento:")
    print(X_test[index])
    print("\nLabel effettive:")
    print("Toxic: {}, Severe Toxic: {}, Obscene: {}, Threat: {}, Insult: {}, Identity Hate: {}".format(
        y_test[index][0], y_test[index][1], y_test[index][2], y_test[index][3], y_test[index][4], y_test[index][5]))
    print("\nLabel predette:")
    print("Toxic: {}, Severe Toxic: {}, Obscene: {}, Threat: {}, Insult: {}, Identity Hate: {}".format(
        y_pred[index][0], y_pred[index][1], y_pred[index][2], y_pred[index][3], y_pred[index][4], y_pred[index][5]))


In [112]:
y_pred = model_BiLSTM.predict(padded_test_sequences)
y_pred = (y_pred > 0.3).astype(int)

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 50ms/step


In [113]:
print_comment_and_labels(X_test, y_test, y_pred, 7)

Commento:
"::::Doing a word search is not reading the article. Others can take it from me that this is precisely what Coase says. I will be happy to explain to anyone else who is interested (not you either Luke). THF is not capable of engaging in rational debate on these issues. He has a clear conflict of interest, because he is a professional lobbyist, and appears to be well trained in this Karl Rove style of engagement.  

"

Label effettive:
Toxic: 0, Severe Toxic: 0, Obscene: 0, Threat: 0, Insult: 0, Identity Hate: 0

Label predette:
Toxic: 0, Severe Toxic: 0, Obscene: 0, Threat: 0, Insult: 0, Identity Hate: 0


In [114]:
print_comment_and_labels(X_test, y_test, y_pred, 23)

Commento:
YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COCKSUCKER! YOU ARE A MOTHJER FUCKER COC

The model works! The right labels for offensive comments are predicted.