# 0. Loading the Dependencies

In [1]:
!pip install tensorflow pandas matplotlib scikit-learn



In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
df = pd.read_csv(os.path.join('jigsaw','train.csv'))

In [4]:
df[df['toxic']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


In [5]:
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


In [6]:
df.iloc[50] ['comment_text']

'"\n\nBI, you said you wanted to talk\n\nAt the bottom of the lead section you have written:\n\n""Its promoter speculated in 1994 that the skyhook concept could be cost competitive with what is realistically thought to be achievable using a space elevator, but the skyhook is not competitive with other rotating tether concepts. In addition, the rotating skyhook is in fact deemed ""not engineeringly feasible using presently available materials"".""\n\nRegarding:  In addition, the rotating skyhook is in fact deemed ""not engineeringly feasible using presently available materials""\n\nThat statement appears to come from Ref [3] on page 10.  The full quote is\n\n""If the mass of the tether alone started to exceed 200 times the mass of the payload,\nthen that was an indication the particular scenario being considered was not engineeringly\nfeasible using presently available materials, although the application might become feasible in\nthe near future as better materials become available with

In [7]:
df[df.columns[2:]]


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


# 1. Process

In [8]:
from tensorflow.keras.layers import TextVectorization

In [9]:
X = df['comment_text']
Y = df[df.columns[2:]].values

In [10]:
max_words = 200000 # no of words in the vocab

In [11]:
vectorizer =  TextVectorization(max_tokens = max_words,
                            output_sequence_length=2000,
                            output_mode = 'int')

In [12]:
vectorizer.adapt(X.values)

In [13]:
vectorizer('HEllo world, life sucks')[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([286, 261, 305, 736,   0], dtype=int64)>

In [14]:
type(X.values)

numpy.ndarray

In [15]:
vectorized_text = vectorizer(X.values)

In [16]:
vectorized_text

<tf.Tensor: shape=(159571, 2000), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [17]:
#MCSHBAP --> map, cache, shuffle, batch, prefetch    2ways_to_do --> from_tensor_slices OR list_files
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, Y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) #helps prevent bottleneck

In [18]:
batch_X, batch_Y = dataset.as_numpy_iterator().next()

In [19]:
batch_X.shape

(16, 2000)

In [20]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [21]:
len(train)

6981

In [22]:
train_generator = train.as_numpy_iterator()

In [23]:
train_generator

<tensorflow.python.data.ops.dataset_ops._NumpyIterator at 0x19e115ac0d0>

# 2. Create Sequential Model

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM , Dropout, Bidirectional, Dense, Embedding

In [25]:
model = Sequential()
# create the embedding layer
model.add(Embedding(max_words+1, 32))
# Bidirectional LSTM layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# feature extractor full connected layers
model.add(Dense(128, activation = 'relu'))
model.add(Dense(256, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))

# final layer 
model.add(Dense(6, activation = 'sigmoid'))

In [26]:
model.compile(loss ='BinaryCrossentropy', optimizer ='adam')

In [27]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [28]:
model.fit(train, epochs = 5, validation_data = val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x19e113caf80>

# 3. Make Predections

In [29]:
input_text = vectorizer('You are a bitch with big fucking ass')

In [30]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [31]:
batch = test.as_numpy_iterator().next()

In [32]:
batch_X, batch_Y = test.as_numpy_iterator().next()

In [33]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [34]:
batch_Y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [35]:
model.predict(np.expand_dims(input_text,0))



array([[0.9998227 , 0.21636973, 0.9974322 , 0.00106816, 0.946379  ,
        0.01807267]], dtype=float32)

In [36]:
res = model.predict(np.expand_dims(input_text,0))



# 4. Evaluate the model

In [37]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [38]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [39]:
for batch in test.as_numpy_iterator():

    #unpack the batch
    X_true, Y_true = batch

    # make a prediction
    yhat = model.predict(X_true)


    #flatten the prediction

    Y_true = Y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(Y_true, yhat)
    re.update_state(Y_true, yhat)
    acc.update_state(Y_true, yhat)



In [40]:
print(f'precision:{pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy:{acc.result().numpy()}')

precision:0.8835427761077881, Recall: 0.8162513971328735, Accuracy:0.5035105347633362


# 5. Test the Gradio

In [41]:
!pip install gradio



In [42]:
model.save('toxicity_comment_detection.h5')

In [51]:
model = tf.keras.models.load_model('toxicity_comment_detection.h5')

In [52]:
model

<keras.engine.sequential.Sequential at 0x19f13a30850>

In [53]:
import tensorflow as tf
import gradio as gr

In [54]:
input_str = vectorizer('creep you are')

In [55]:
res = model.predict(np.expand_dims(input_str,0))



In [57]:
res

array([[6.0411084e-02, 2.2768620e-06, 6.9269743e-03, 3.8574240e-03,
        9.7250082e-03, 3.6264150e-03]], dtype=float32)

In [66]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [69]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')

In [74]:
interface.launch(share=True)

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
Running on public URL: https://03ae4fd3df146c33fb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


