Load the libraries

In [26]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, GRU
from sklearn.model_selection import train_test_split
import re

In [2]:
### Read the dataset

In [4]:
# Load the labelled comments from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="toxic-comments.csv")

In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(159571, 8)

In [6]:
# Column labels of the loaded frame.
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

Define target variables

In [13]:
# Names of the label columns; their order here fixes the order of the
# model's output units further down.
toxicities = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

Clean the text

In [8]:
def clean_text(text):
    """Normalise a raw comment for tokenisation.

    The text is lower-cased and every run of characters other than ASCII
    letters and digits is collapsed into a single space. Leading/trailing
    spaces produced by that substitution are kept.

    Parameters
    ----------
    text : str or scalar
        The raw comment. Missing values (``None``, ``NaN``, ``pd.NA``) are
        treated as an empty comment; any other non-string scalar is converted
        with ``str()`` first.

    Returns
    -------
    str
        The cleaned comment (``""`` for missing input).
    """
    if not isinstance(text, str):
        # An empty CSV field is loaded by pandas as NaN (a float), which has no
        # .lower(); map missing values to an empty comment instead of crashing.
        if pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()   # lowercase
    text = re.sub(r"[^a-zA-Z0-9]+", " ", text)   # remove non-alphanumeric characters
    return text

In [10]:
# Overwrite the raw comments with their cleaned form. Re-running this cell is
# harmless: cleaning an already-cleaned string returns it unchanged.
df["comment_text"] = df["comment_text"].apply(clean_text)

In [11]:
# Show the frame after cleaning (pandas elides the middle rows of a long frame).
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i m se...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i m really not trying to edit war it s...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i can t make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your v...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


## Define input and output variables

In [None]:
# Feature and target preparation
# Features: the cleaned comments as a plain Python list of strings.
comments = df["comment_text"].tolist()
# Targets: the label columns as a 2-D array, one column per entry of `toxicities`.
targets = df[toxicities].to_numpy()

In [15]:
# Inspect the label matrix built from the toxicity columns.
targets

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [16]:
# One row per comment, one column per toxicity label.
targets.shape

(159571, 6)

## Prepare the data

In [18]:
# Tokenizing and padding
# NOTE(review): the vocabulary is fitted on every comment, before the
# train/test split further down, so held-out text influences the word index.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts=comments)

# Turn each comment into a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts=comments)

# Force every sequence to exactly 200 ids: shorter ones are zero-filled at the
# front and longer ones lose their leading ids ("pre" is the library default).
padded_sequences = pad_sequences(
    sequences,
    maxlen=200,
    padding="pre",
    truncating="pre",
)

In [19]:
# (number of comments, padded sequence length)
padded_sequences.shape

(159571, 200)

In [20]:
# Peek at the padded id matrix; short comments are zero-filled on the left.
padded_sequences

array([[   0,    0,    0, ..., 4539, 2252,  972],
       [   0,    0,    0, ...,  980,  577,  185],
       [   0,    0,    0, ...,    1,  732,  464],
       ...,
       [   0,    0,    0, ...,   12, 3463, 4381],
       [   0,    0,    0, ...,  153,   36,   10],
       [   0,    0,    0, ..., 1614, 2037,   89]])

## Split into train and test sets

In [23]:
# Hold out 20% of the rows; random_state pins the shuffle so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    targets,
    test_size=0.2,
    random_state=0,
)

In [24]:
# Shape of the training inputs: (rows, padded sequence length).
X_train.shape

(127656, 200)

In [25]:
# Shape of the held-out inputs: (rows, padded sequence length).
X_test.shape

(31915, 200)

## Build the model

In [27]:
# Model definition: embedding -> GRU -> one sigmoid unit per toxicity label.
model = Sequential([
    # 5000 and 200 mirror the tokenizer's num_words and the padding length.
    Embedding(5000, 128, input_length=200),
    GRU(64),
    # Multi-label classification: sigmoid scores each of the 6 labels independently.
    Dense(6, activation="sigmoid"),
])




In [28]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          640000    
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense (Dense)               (None, 6)                 390       
                                                                 
Total params: 677638 (2.58 MB)
Trainable params: 677638 (2.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
from keras.utils import plot_model

In [36]:
#plot_model(model, show_dtype=True, show_layer_activations=True, show_layer_names=True, show_shapes=True)

## Compile the model 

In [32]:
# Binary cross-entropy treats each of the six sigmoid outputs as its own
# yes/no problem. The metric is named explicitly: the shorthand "accuracy" is
# resolved by Keras from the output shape and, for a 6-unit output, ends up as
# categorical (argmax) accuracy, which is not meaningful for multi-label targets.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"])




## Train the model

In [33]:
# Train for three epochs in mini-batches of 32.
# NOTE(review): the held-out split doubles as validation data here, so there
# is no separate, untouched test set.
model.fit(
    X_train,
    y_train,
    epochs=3,
    batch_size=32,
    validation_data=(X_test, y_test),
)

Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2a7a4b38550>

## Predict on new data

In [49]:
# Example input used to try the trained model on unseen text.
new_comment = "he is a very good person!"

In [50]:
# Apply the same cleaning, tokenising and padding used for the training data.
new_sequence = tokenizer.texts_to_sequences(texts=[clean_text(new_comment)])
padded = pad_sequences(
    new_sequence,
    maxlen=200,
    padding="pre",
    truncating="pre",
)

# predict() returns one row per input; take the single row for this comment.
prediction = model.predict(padded)[0]



In [51]:
# Per-label sigmoid outputs for the new comment, in the order of `toxicities`.
prediction

array([0.10928448, 0.00076029, 0.00890803, 0.00012984, 0.02138797,
       0.00160304], dtype=float32)

In [52]:
# Pair each label name with its predicted score and print it to two decimals.
for label, score in zip(toxicities, prediction):
    print(f"{label}: {score:.2f}")

toxic: 0.11
severe_toxic: 0.00
obscene: 0.01
threat: 0.00
insult: 0.02
identity_hate: 0.00
