<a href="https://colab.research.google.com/github/SumedhNakod/Projects/blob/master/Toxicity_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Dependencies and Datasets

In [None]:
!pip install tensorflow tensorflow-gpu==2.11.0 pandas matplotlib sklearn

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Location of the training CSV. The path lives under /content/drive/MyDrive,
# so presumably Google Drive must already be mounted (the mount is not shown here).
TRAIN_CSV_PATH = '/content/drive/MyDrive/Toxic Comment/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv'
df = pd.read_csv(filepath_or_buffer=TRAIN_CSV_PATH)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Pre-processing
1. Data cleaning.
2. Tokenization.

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Model input: the raw comment strings.
X = df['comment_text']

# Targets: every column from position 2 onward, taken as a plain array.
label_columns = df.columns[2:]
y = df[label_columns].values

In [None]:
# Vocabulary cap handed to the TextVectorization layer; also sizes the Embedding input.
MAX_FEATURES = 200_000

In [None]:
# Text -> integer token ids, with every output sequence fixed at 1800 positions.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [None]:
# Build the vocabulary from the full array of comment strings.
vectorizer.adapt(X.values)

In [None]:
# Peek at the start of the learned vocabulary. Only a slice is displayed:
# the full list can hold up to MAX_FEATURES entries and would flood the output.
vectorizer.get_vocabulary()[:20]

In [None]:
# Convert every comment to its integer token sequence in a single call.
vectorized_text = vectorizer(X.values)

In [None]:
# Input pipeline: slice (tokens, labels) pairs -> cache -> shuffle -> batch -> prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle ONCE and keep that order fixed. By default `shuffle` draws a new order
# on every iteration, so the take()/skip() splits carved out of this dataset
# would contain different examples each epoch and training examples would leak
# into the validation and test splits. The seed makes the split reproducible.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [None]:
# Partition by batch count: first 70% train, next 20% validation, final 10% test.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)
n_test = int(n_batches * .1)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
test = dataset.skip(int(n_batches * .9)).take(n_test)

# Create Sequential Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Dense, Embedding

In [None]:
# Layer stack, declared in one list and in the order data flows through it.
model = Sequential([
    # 32-dimensional learned vector per token id.
    Embedding(MAX_FEATURES + 1, 32),
    # Read the sequence in both directions; 32 LSTM units each way.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Six sigmoid outputs, each producing an independent value in (0, 1).
    Dense(6, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy pairs with the 6-unit sigmoid output layer: each output
# is scored as its own yes/no decision.
model.compile(loss = 'BinaryCrossentropy',optimizer = 'Adam')


# Training

In [None]:
# Train for 5 epochs on `train`, passing `val` as the validation data.
# The returned object is kept in `history`.
history = model.fit(train, epochs = 5, validation_data = val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

# Predictions

In [None]:
# Sample: score a single hand-written comment.
sample_comment = 'I hate you !, Im going to hurt you'
text_vectorized = vectorizer(sample_comment)

# Add a leading batch axis so the lone sequence is passed as a batch of one.
single_batch = np.expand_dims(text_vectorized, axis=0)
res = model.predict(single_batch)
res



array([[0.87065846, 0.00093864, 0.02316078, 0.01619224, 0.19605483,
        0.03126711]], dtype=float32)

In [None]:
# Predict on the first batch of the test split and threshold each score at 0.5.
test_batches = test.as_numpy_iterator()
batch_X, batch_y = next(test_batches)
batch_scores = model.predict(batch_X)
(batch_scores > 0.5).astype(int)



array([[1, 1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

# Evaluating

In [None]:
# CategoricalAccuracy is no longer used below; it stays in the import list only
# so the name remains available to any other cell that may reference it.
from tensorflow.keras.metrics import Precision, Recall, BinaryAccuracy, CategoricalAccuracy
pre = Precision()
re = Recall()
# Labels and predictions are flattened into one long vector of independent 0/1
# decisions, so accuracy must be computed per element (threshold 0.5).
# CategoricalAccuracy compares argmax positions and is only meaningful for
# one-hot, single-label targets.
acc = BinaryAccuracy()

# Accumulate the metrics batch by batch over the held-out test split.
for batch in test.as_numpy_iterator():
  X_true, y_true = batch
  # verbose=0 keeps the per-batch progress bar out of the cell output.
  yhat = model.predict(X_true, verbose=0)

  # Flatten both arrays to 1-D so every label of every example is one entry.
  y_true = y_true.flatten()
  yhat = yhat.flatten()

  pre.update_state(y_true, yhat)
  re.update_state(y_true, yhat)
  acc.update_state(y_true, yhat)

print(f'Precision: {pre.result().numpy()},Recall:{re.result().numpy()},Accuracy: {acc.result().numpy()}')

Precision: 0.8810794949531555,Recall:0.818682849407196,Accuracy: 0.49548646807670593


In [None]:
# Persist the trained network to an .h5 file at this path.
# Note: only `model` is written here; `vectorizer` is a separate object that is
# not one of the model's layers, so it is not part of this file.
model.save('/content/drive/MyDrive/Toxic Comment/jigsaw-toxic-comment-classification-challenge/toxicity.h5')

In [None]:
#Reloading the model
#model = tf.keras.models.load_model('/content/drive/MyDrive/Toxic Comment/jigsaw-toxic-comment-classification-challenge/toxicity.h5')


# Gradio App

In [None]:
# Gradio backs the web UI built in the cells below.
# NOTE(review): neither package is version-pinned, so the installed Gradio
# release depends on when this cell is run.
!pip install gradio jinja2

In [None]:
import gradio as gr
import tensorflow as tf

In [None]:
def score_comment(comment):
    """Classify one comment and report a True/False flag for every label.

    The comment is vectorized as a batch of one and run through the model;
    each output score is compared against 0.5.

    Args:
        comment: The raw comment text to score.

    Returns:
        A string with one ``"<label>: <True|False>"`` line per label column
        (``df.columns[2:]``), each terminated by a newline.
    """
    batch = vectorizer([comment])
    scores = model.predict(batch)[0]

    label_names = df.columns[2:]
    report_lines = [
        '{}: {}\n'.format(name, scores[position] > 0.5)
        for position, name in enumerate(label_names)
    ]
    return ''.join(report_lines)

In [None]:
# Wrap score_comment in a small web UI: a two-line text box in, plain text out.
# `gr.Textbox` is the top-level component class. The older `gr.inputs.Textbox`
# namespace is deprecated in Gradio 3.x and removed in later releases, which the
# unpinned `pip install gradio` in this notebook would pick up.
interface = gr.Interface(fn=score_comment,
                         inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
                         outputs='text')

# share=True requests a temporary public URL for the app.
interface.launch(share = True)

  super().__init__(
  super().__init__(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://8a0f2793a30af1b328.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


