In [25]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import Dense , Embedding , LSTM , GRU
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re

In [5]:
df = pd.read_csv("toxic-comments.csv")


In [6]:
df.shape

(159571, 8)

In [7]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [12]:
## Define target variables (adapt based on your needs)
toxicities = ["toxic" , "severe_toxic" ,'obscene', 'threat',
             'insult', 'identity_hate' ]

#### Text Cleaning

In [13]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]+" , " " ,  text)  #remove non alphabetic characters
    return text

In [14]:
df["comment_text"] = df["comment_text"].apply(clean_text)

In [15]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i m se...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i m really not trying to edit war it s...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i can t make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your v...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


### Define input and output variables

In [16]:
## Feature and target preparation
comments = df["comment_text"].tolist()
targets = df[toxicities].values

In [18]:
targets.shape

(159571, 6)

### Prepare the data

In [19]:
### Tokenization and Padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
padded_sequences = pad_sequences(sequences, maxlen=200)

In [22]:
padded_sequences.shape

(159571, 200)

### Cross Validate

In [23]:
X_train , X_test , y_train , y_test = train_test_split(
    padded_sequences , targets , test_size = 0.2 , random_state = 0
)

In [24]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((127656, 200), (31915, 200), (127656, 6), (31915, 6))

#### Build the model

In [27]:
model = Sequential()
model.add(Embedding(5000, 128 , input_length=200))
model.add(GRU(64))           
model.add(Dense(6, activation="sigmoid"))




In [28]:
from keras.utils import plot_model

In [30]:
#plot_model(model , show_dtype=True , show_layer_activations=True ,show_layer_names=True , show_shapes=True )

### Compile the model

In [31]:
model.compile(loss = "binary_crossentropy" , optimizer="adam" , metrics=["accuracy"])




### Train the model

In [32]:
model.fit(X_train ,y_train , epochs=3 , batch_size=32 ,validation_data=(X_test , y_test))

Epoch 1/3


Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2937e9e4f10>

### Predict on the new data

In [143]:
new_comment = "Good to see you back on social media"

In [144]:
new_comment = clean_text(new_comment)

In [145]:
#tokenizer.fit_on_texts(new_comment)
sequences = tokenizer.texts_to_sequences(new_comment)
padded_new_sequences = pad_sequences(sequences, maxlen=200)

In [146]:
prediction = model.predict(padded_new_sequences)[0]



In [147]:
prediction

array([0.03210543, 0.00582975, 0.06107085, 0.0020895 , 0.0162067 ,
       0.00748804], dtype=float32)

In [148]:
for toxicity , prob in zip(toxicities , prediction):
    print(f"{toxicity} : {prob:.2f}")

toxic : 0.03
severe_toxic : 0.01
obscene : 0.06
threat : 0.00
insult : 0.02
identity_hate : 0.01


In [149]:
for toxicity , prob in zip(toxicities , prediction):
    print(f"{toxicity} : {prob:.2f}")

toxic : 0.03
severe_toxic : 0.01
obscene : 0.06
threat : 0.00
insult : 0.02
identity_hate : 0.01


In [150]:
pip install gensim