<h1>Installing Dependecies and Importing Data</h1>

In [1]:
! pip install tensorflow  pandas matplotlib sklearn



In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
df = pd.read_csv(
    os.path.join('comment-data','train.csv')
)

In [4]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


<h1>Preprocess</h1>

In [5]:
from tensorflow.keras.layers import TextVectorization  
# used for mapping text features to integer sequences

In [6]:
X = df['comment_text']          # storing all the comments
y = df[df.columns[2:]].values   # numpy array storing all the toxicity values

In [7]:
MAX_FEATURES = 200000    # number of words in the vocab

In [8]:
# initialising text vectorisation layer
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                              output_sequence_length=1800,  #max length of the input sentence(trim to 1800)
                              output_mode='int')

In [9]:
# learning all the values from the inputs
vectorizer.adapt(X.values)

In [10]:
# converting all our sentence into arrays of integers
vectorized_text = vectorizer(X.values)

In [11]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [12]:
# data pipline
# MCSHBAP - map. cache, shuffle, batch, prefetch 
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [13]:
# partitioning the dataset - 70% for training, 20% for validation & 10% for testing
train = dataset.take(int(len(dataset)*0.7))
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

In [14]:
# iterates through all the batches
train_generator = train.as_numpy_iterator()

<h1>Creating Sequential Model</h1>

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding 

In [16]:
# Instantiating the model
model = Sequential()

# ----------------------------Adding Layers------------------------------------------------
# Embedding Layer
model.add(Embedding(MAX_FEATURES+1,32))
# Bidirectional LSTM Layer - needs tanh activation 
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer : gives a value between 0 & 1 due to sigmoid activation
model.add(Dense(6, activation='sigmoid'))

In [17]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          6400032   
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                16640     
_________________________________________________________________
dense (Dense)                (None, 128)               8320      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 774       
Total params: 6,491,686
Trainable params: 6,491,686
Non-trainable params: 0
______________________________________________

In [19]:
# Training the model
history = model.fit(train, epochs=1, validation_data=val)



<h1>Making Prdeictions</h1>

In [20]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [22]:
(model.predict(batch_X) > 0.5).astype(int)

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

<h1>Evaluating Model</h1>

In [24]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [25]:
# instantiating our evaluation metrics
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [26]:
for batch in test.as_numpy_iterator():
    # unpacking the batch
    X_true, y_true = batch
    # making a prediction
    yhat = model.predict(X_true)
    
    # Flattening the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [27]:
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

Precision: 0.756040096282959, Recall: 0.7553724050521851, Accuracy: 0.5075225830078125


<h1>Testing</h1>

In [30]:
model.save('toxicity.h5')

In [31]:
def score_comment(comment):
    vectorized_com = vectorizer([comment])
    results = model.predict(vectorized_com)
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}:{}\n'.format(col,results[0][idx]>0.5)
    return text