In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np


In [2]:
df = pd.read_csv('./jigsaw-toxic-comment-classification-challenge/train.csv/train.csv')


In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
df.iloc[0]['comment_text']

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

# Preprocess

In [5]:
from tensorflow.keras.layers import TextVectorization

In [6]:
# taking only the toxic severe toxic etc. from data
x=df['comment_text']
y=df[df.columns[2:]].values
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [7]:
MAX_WORDS = 200000

In [8]:
#initializing the vectorization
vectorizer = TextVectorization(max_tokens=MAX_WORDS,
                               output_sequence_length=1800,
                               output_mode='int') 

In [9]:
 # Represent x as a numpy arraya and converting text to be tokenised
vectorizer.adapt(x.values)
type(x.values)

numpy.ndarray

In [10]:
# eg. for vectorizer working
vectorizer('hello world')

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([288, 263,   0, ...,   0,   0,   0], dtype=int64)>

In [11]:
#converting the comment_text to numerical values vectorising (or tokenizing)
vectorized_text = vectorizer(x.values)
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000) # shuffling the data      
dataset = dataset.batch(16) # making the sample batch of 16
dataset = dataset.prefetch(8)

In [13]:
train = dataset.take(int(len(dataset)*.7)) # taking 70 percent of out dataset
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2)) # validation data
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

# loading the model

In [14]:
from keras.models import load_model

# Load the model
model = load_model("model1.h5")

# Now 'model' contains the loaded model


# Making Predictions

In [15]:
input_text = vectorizer("I am will freaking kill you !! you piss drinker nigga")

In [16]:
input_text

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([ 8, 74, 44, ...,  0,  0,  0], dtype=int64)>

In [17]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [18]:
type(model.predict(np.expand_dims(input_text,0)))



numpy.ndarray

In [19]:
model.predict(np.expand_dims(input_text,0))




array([[0.8209966 , 0.1389728 , 0.66925853, 0.28515217, 0.585315  ,
        0.8807923 ]], dtype=float32)

# Evaluation

In [20]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [21]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [22]:
batch = test.as_numpy_iterator().next()
batch

(array([[  7457,     46,    168, ...,      0,      0,      0],
        [    14,      9,     21, ...,      0,      0,      0],
        [121912,     49,    738, ...,      0,      0,      0],
        ...,
        [   181,    338,    160, ...,      0,      0,      0],
        [    94,     13,     42, ...,      0,      0,      0],
        [   191,    288, 156485, ...,      0,      0,      0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

In [23]:
for batch in test.as_numpy_iterator():
    # unpack the batch
    X_true, y_true = batch
    # make the prediction
    yhat = model.predict(X_true)

    
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true,yhat)
    re.update_state(y_true,yhat)
    acc.update_state(y_true,yhat)



In [24]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')


Precision: 0.9520158767700195, Recall:0.9682356119155884, Accuracy:0.5115346312522888


### With epochs 20

In [28]:
model1 = load_model("./HarmonyGuard/model3.h5")

In [29]:
for batch in test.as_numpy_iterator():
    # unpack the batch
    X_true, y_true = batch
    # make the prediction
    yhat = model1.predict(X_true)

    
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true,yhat)
    re.update_state(y_true,yhat)
    acc.update_state(y_true,yhat)

print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')


Precision: 0.9599884152412415, Recall:0.9714953899383545, Accuracy:0.523570716381073
