In [2]:
import os 
import pandas as pd 
import numpy as np 
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import LSTM , Dropout , Bidirectional , Dense ,Embedding




**Importing the data:**

In [3]:
# Load the comments dataset from a CSV file in the working directory.
df = pd.read_csv('comments.csv')

In [4]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
# Inspect the raw text of one example comment (row 2).
df.iloc[2]['comment_text']

"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."

In [8]:
# Show the label columns (everything from column index 2 onward) for row 6.
df[df.columns[2:]].iloc[6]

toxic            1
severe_toxic     1
obscene          1
threat           0
insult           1
identity_hate    0
Name: 6, dtype: int64

**Preprocessing:**

In [10]:
from tensorflow.keras.layers import TextVectorization

In [11]:
# Feature: the raw comment text
X = df['comment_text']

# Labels: every column from index 2 onward, as a NumPy array
# (the six binary toxicity flags: toxic ... identity_hate)
y = df[df.columns[2:]].values



In [12]:
MAX_WORDS = 200000 # number of words: passed to the vectorizer as its vocabulary size cap

In [13]:
# Text-to-integer layer: turns each comment into a fixed-length sequence of token ids.
vectorizer = TextVectorization(max_tokens = MAX_WORDS,
                              output_sequence_length = 1800, 
                              output_mode = 'int')
# output_sequence_length: length, in tokens, that every output sequence is padded or truncated to

In [14]:
# Build the vocabulary from the full set of comments.
vectorizer.adapt(X.values) 

In [15]:
# Sanity check: first five token ids produced for a sample sentence.
vectorizer('Hello world , life is great')[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([286, 261, 305,   9, 275], dtype=int64)>

In [16]:
# Vectorize every comment up front; the result is a single integer tensor
# with one row per comment and 1800 token ids per row.
vectorized_text = vectorizer(X.values)
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [17]:
# Build the input pipeline from the vectorized comments and their labels.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle ONCE, with a fixed order. By default shuffle() reshuffles on every
# iteration, so any take()/skip() partitions derived from this dataset would
# contain different examples each time they are iterated, letting training
# examples leak into the validation/test partitions. Freezing the order
# keeps such partitions disjoint and reproducible.
# (If training for several epochs, re-shuffle the training partition on its own.)
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # prepare upcoming batches while the current one is consumed

In [18]:
# Pull a single batch to inspect the feature and label arrays.
batchX, batchy = next(dataset.as_numpy_iterator())

In [19]:
# Partition by batch count: first 70% -> train, next 20% -> validation,
# remaining 10% (starting at the 90% mark) -> test.
n_batches = len(dataset)
n_train = int(n_batches*.7)
n_val = int(n_batches*.2)
n_test = int(n_batches*.1)

train = dataset.take(n_train)

val = dataset.skip(n_train).take(n_val)

test = dataset.skip(int(n_batches*.9)).take(n_test)
                                              

In [20]:
# NumPy iterator over the training batches, for manual inspection.
train_gen = train.as_numpy_iterator()

In [21]:
# Fetch and display the next (features, labels) training batch.
next(train_gen)

(array([[   425,    651,     14, ...,      0,      0,      0],
        [   170,      1,     41, ...,      0,      0,      0],
        [   459,      4,  70704, ...,      0,      0,      0],
        ...,
        [  3071,    135,    361, ...,      0,      0,      0],
        [ 91306,     28,    123, ...,      0,      0,      0],
        [     3, 142611,      5, ...,      0,      0,      0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0]], dtype=int64))

**Creating our sequential model:**

In [22]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import LSTM , Dropout , Bidirectional , Dense ,Embedding


In [23]:
# Text classifier: embedding -> bidirectional LSTM -> dense stack -> six sigmoid outputs.
model = Sequential([
    # Embedding layer: maps each token id to a 32-dimensional vector
    Embedding(MAX_WORDS + 1, 32),
    # Bidirectional LSTM layer, 32 units per direction
    Bidirectional(LSTM(32, activation='tanh')),
    # Feature-extractor fully connected layers
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Final layer: one independent sigmoid score per label
    Dense(6, activation='sigmoid'),
])


In [24]:
# Binary cross-entropy scores each of the six sigmoid outputs as its own yes/no decision.
model.compile(loss='BinaryCrossentropy', optimizer = 'Adam')


In [33]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

**Note: fitting the model in the next cell takes a long time — about 1 h 30 min in my case.**

In [28]:
# Train for a single epoch, evaluating on the validation split at the end of the epoch.
history = model.fit(train, epochs=1 , validation_data = val) # long-running cell

In [29]:
# Per-epoch training/validation values recorded by fit().
history.history

**Testing some predictions:**

In [34]:
# Vectorize a sample sentence for a manual prediction.
input_text = vectorizer('I am going to kill you ')

In [35]:
# Grab one batch from the test split.
batch = next(test.as_numpy_iterator())

In [33]:
# Display the token ids of the sample sentence.
input_text

In [32]:
# Add a leading batch dimension, predict, and print the scores scaled to percentages.
result = model.predict(np.expand_dims(input_text,0))
print(result*100)

In [31]:
# Label names, in the same order as the columns of y (and so of the model's outputs).
df.columns[2:]

**Evaluating the model with some metrics:**

In [39]:
#Evaluate the model 
from tensorflow.keras.metrics import Precision , Recall , CategoricalAccuracy

In [40]:
# Streaming metrics, to be accumulated batch by batch over the test split.
pre = Precision()
re = Recall()
# The targets are six independent binary flags (multi-label), so per-label
# binary accuracy (prediction thresholded at 0.5) is the meaningful accuracy.
# CategoricalAccuracy compares argmax positions and assumes one-hot targets,
# which does not describe multi-label sigmoid outputs.
acc = tf.keras.metrics.BinaryAccuracy()

In [30]:
# Start from clean accumulators so that re-running this cell does not
# add the test batches to the metrics a second time.
for metric in (pre, re, acc):
    metric.reset_state()

for batch in test.as_numpy_iterator():
    X_true , y_true = batch
    # verbose=0: one progress bar per batch would flood the cell output.
    yhat = model.predict(X_true, verbose=0)

    # Flatten the per-comment label rows so that every label of every
    # comment is scored as one binary sample.
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [25]:
 # Report the values accumulated in the three metric objects.
 print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()},Accuracy:{acc.result().numpy()}')

**Using gradio interface to test our model:**

In [34]:
import gradio as gr

In [35]:
# Persist the trained model to a single .h5 file.
# NOTE(review): `vectorizer` is a separate object and is not one of the model's
# layers, so it is not stored in this file — confirm how it is restored at inference time.
model.save('Toxicity.h5')

In [36]:
# Reload the saved model from disk (lets later cells run without retraining).
model1 = tf.keras.models.load_model('Toxicity.h5')

In [38]:
# Quick check of the reloaded model: vectorize one sentence, add a batch
# dimension, and print the six raw sigmoid scores.
input_str = vectorizer('I hate you')
res = model1.predict(np.expand_dims(input_str,0))
print(res)

[[0.5021187  0.49650365 0.5017468  0.49797675 0.50047195 0.4979084 ]]


In [39]:
# Label names, shown to line the printed scores up with their columns.
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [40]:
def score_comment(comment):
    """Classify one comment and return a text report, one line per label.

    The comment is vectorized with the notebook-level `vectorizer`, scored by
    the reloaded model `model1`, and each label (the columns of `df` from
    index 2 onward) is reported as True when its score is above 0.5.

    Args:
        comment: The raw comment text to score.

    Returns:
        A string made of '<label>: <True|False>' lines, each ending in a newline.
    """
    scores = model1.predict(vectorizer([comment]))[0]
    label_names = df.columns[2:]

    report_lines = [
        '{}: {}\n'.format(name, scores[position]>0.5)
        for position, name in enumerate(label_names)
    ]
    return ''.join(report_lines)

In [42]:
# Create the Gradio interface that exposes score_comment through a simple web form.
# gr.Textbox is the current component API; the older gr.inputs namespace is deprecated.
interface = gr.Interface(fn = score_comment,
                        inputs = gr.Textbox(lines = 2, placeholder = 'Say something to Houssem in english please ! '),
                        outputs = 'text')


IMPORTANT: You are using gradio version 3.9.1, however version 3.14.0 is available, please upgrade.
--------


In [43]:
# share=True requests a temporary public URL in addition to the local server,
# so anyone with the link can reach the app while it is running.
interface.launch(share =True) 

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://9903f51cef4d0b00.gradio.app

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


(<gradio.routes.App at 0x24b4e344490>,
 'http://127.0.0.1:7860/',
 'https://9903f51cef4d0b00.gradio.app')

