# Comment Toxicity detection

dataset : https://www.kaggle.com/datasets/julian3833/jigsaw-toxic-comment-classification-challenge

This contains comments in textual form that are classified on the level of toxicity and it's nature.
We will build an LSTM based model to predict the toxicity of new comments.

The data is multi - output meaning that each record may correspond to more than one output label.

In [11]:
# installing dependencies
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [12]:
# np.expand_dims??

In [13]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [14]:
# Vectorize the text for processing
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization

X = df['comment_text']  # comment text
Y = df[df.columns[2:]]; # drop everything other than labels

# X.head()
# Y.head()
print(X.shape)
print(Y.shape)

(159571,)
(159571, 6)


In [15]:
Y = Y.values # converting the labels to an numpy array

In [16]:
Y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [6]:
# Now we need to vectorize the text, each word maps to a unique id. This will be done by TextVectorization function
# So we need to create a vocabulary, set the maximum number of words allowed in the vocabulary
# help(TextVectorization)

# Some Info on TextVectorization

# The vocabulary for the layer must be either supplied on construction or
#  |  learned via `adapt()`. When this layer is adapted, it will analyze the
#  |  dataset, determine the frequency of individual string values, and create a
#  |  vocabulary from them. This vocabulary can have unlimited size or be capped,
#  |  depending on the configuration options for this layer; if there are more
#  |  unique values in the input than the maximum vocabulary size, the most frequent
#  |  terms will be used to create the vocabulary.

In [19]:
MAX_FEATURES = 200000 # Vocbulary size
# Create the vectorizer
# help(TextVectorization)
vectorizer = TextVectorization(max_tokens = MAX_FEATURES,output_sequence_length = 1800, output_mode = 'int')

In [23]:
type(X)
type(X.values)

numpy.ndarray

In [22]:

vectorizer.adapt(X.values)


In [29]:
# vectorizer.get_vocabulary()
print(vectorizer("Hello world"))

print(vectorizer("Hello my name is Rahul")[:5])


tf.Tensor([286 261   0 ...   0   0   0], shape=(1800,), dtype=int64)
tf.Tensor([  286    28   109     9 24198], shape=(5,), dtype=int64)


In [30]:
# Vectorize all the comments
vectorized_text = vectorizer(X.values)


In [33]:
vectorized_text
# Each comment is allowed a maximum of 1800 words

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

Refer: https://www.tensorflow.org/api_docs/python/tf/data/Dataset

In [36]:
# Now we will create a tensorflow data pipe line
# map, cache, shuffle, batch, prefetch


dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,Y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)   # Buffer-size
dataset = dataset.batch(16)         # Batch size
dataset = dataset.prefetch(16)      # Pre-fetching helps prevent bottlenecks
# dataset input pipelines should end with a call to prefetch. This allows later elements to be prepared while the current 
# element is being processed. This often improves latency and throughput, at the cost of using 
# additional memory to store prefetched elements.

In [41]:
batchX, batchY = dataset.as_numpy_iterator().next()
len(batchX)

16

In [44]:
# first 70% of the dataset is taken for test data
# then 20% is taken for validataion
# 10% for testing
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [56]:
print(len(train)) # This number is actually the number of batches of training data
# Each batch contains 16 comments

train_generator = train.as_numpy_iterator()
# train_generator.next()

6981


# Build the Model

In [57]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [60]:
model = Sequential()
# help(Embedding)
# Embedding Layer
model.add(Embedding(MAX_FEATURES+1,32))
# Biderctional LSTM
model.add(Bidirectional(LSTM(32,activation = 'tanh')))
# Fully Connected layer
model.add(Dense(128,activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))


In [62]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dense_2 (Dense)             (None, 6)                 774       
                                                                 
Total params: 6,442,278
Trainable params: 6,442,278
Non-trainable params: 0
______________________________________________

In [63]:
# Training
history = model.fit(train, epochs=1, validation_data=val)



In [64]:
model.save("Comment_toxicity.h5")

Explore the model perfomance and make some predictions

In [69]:
history.history

{'loss': [0.06431057304143906], 'val_loss': [0.04635939747095108]}

In [119]:
# Let us test the model by using our own string 
# Tokenise the input string and use predict() to get the output
# Then use an threshold of 0.5 for detecting the labels
Text = "You freaking suck! I am going to hit you." 

input_text = vectorizer(Text)

res = model.predict(np.expand_dims(input_text,0))
# print(res)
# print((res > 0.5).astype(int))
labels = list(df.columns[2:])
output = (res > 0.5).astype(int)
print("The Comment is : ",end = ' ')
for i,x in enumerate(output[0]):
    if(x==1):
        print(labels[i],end = ' ')

The Comment is :  toxic obscene insult 

Test the model on testset

In [120]:
batch_X, batch_y = test.as_numpy_iterator().next()
# (model.predict(batch_X) > 0.5).astype(int)


Evaluate

In [122]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [124]:
# Evaluation Metrics
Pre = Precision()
Re = Recall()
Acc = CategoricalAccuracy()

for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    Pre.update_state(y_true, yhat)
    Re.update_state(y_true, yhat)
    Acc.update_state(y_true, yhat)

In [125]:
print(f'Precision: {Pre.result().numpy()}, Recall:{Re.result().numpy()}, Accuracy:{Acc.result().numpy()}')

Precision: 0.8321626782417297, Recall:0.6549316048622131, Accuracy:0.47843530774116516


# Interface

In [126]:
!pip install gradio jinja2

Collecting gradio

You should consider upgrading via the 'c:\users\rahul raaghav a\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.



  Downloading gradio-3.0.19-py3-none-any.whl (5.1 MB)
     ---------------------------------------- 5.1/5.1 MB 3.1 MB/s eta 0:00:00
Collecting uvicorn
  Downloading uvicorn-0.17.6-py3-none-any.whl (53 kB)
     ---------------------------------------- 53.6/53.6 KB 1.4 MB/s eta 0:00:00
Collecting pycryptodome
  Downloading pycryptodome-3.14.1-cp35-abi3-win_amd64.whl (1.8 MB)
     ---------------------------------------- 1.8/1.8 MB 3.0 MB/s eta 0:00:00
Collecting analytics-python
  Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)
Collecting fsspec
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
     -------------------------------------- 140.6/140.6 KB 2.8 MB/s eta 0:00:00
Collecting markdown-it-py[linkify,plugins]
  Downloading markdown_it_py-2.1.0-py3-none-any.whl (84 kB)
     ---------------------------------------- 84.5/84.5 KB 4.6 MB/s eta 0:00:00
Collecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py): started
  Preparing me

In [128]:
import tensorflow as tf
import gradio as gr

In [129]:
model = tf.keras.models.load_model('Comment_toxicity.h5')

In [130]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [131]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')



In [132]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7860/
Running on public URL: https://45578.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<gradio.routes.App at 0x19e3fe99490>,
 'http://127.0.0.1:7860/',
 'https://45578.gradio.app')