# Installing Dependencies

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Preprocessing

In [4]:
from tensorflow.keras.layers import TextVectorization

In [5]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [6]:
X = df['comment_text']  
y = df[df.columns[2:]].values   # y is a numpy array    

In [7]:
MAX_FEATURES = 200000    # max number of words in the vocabulary  

In [8]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, 
                               output_sequence_length=1800, 
                               output_mode='int') # output_sequence_length is the max length of the output sequences  

In [9]:
vectorizer.adapt(X.values)

In [10]:
vectorized_text = vectorizer(X.values)

In [11]:
# Basic pipeline for data preprocessing
# MCSBAP - Map, cache, shuffle, batch, prefetch from_tensor_slices, list_files 

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))  # create a dataset from the tensor slices
dataset = dataset.cache()   # cache the dataset
dataset = dataset.shuffle(160000)   # shuffle the dataset    
dataset = dataset.batch(16)  # batch size is 16
dataset = dataset.prefetch(8)   # prefetch 8 batches

In [12]:
batch_X, batch_y = dataset.as_numpy_iterator().next()  # get the next batch of data

In [13]:
batch_X.shape, batch_y.shape

((16, 1800), (16, 6))

In [14]:
train = dataset.take(int(len(dataset)*0.7)) # 70% of the data is used for training
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))   # 20% of the data is used for validation
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))  # 10% of the data is used for testing
  

In [15]:
len(train), len(val), len(test)

(6981, 1994, 997)

# Create Sequential Model

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding, Input

In [29]:
model = Sequential([
    Input(shape=(1800,)),   # input layer
    Embedding(input_dim=MAX_FEATURES, output_dim=32),    # embedding layer
    Bidirectional(LSTM(32, activation='tanh')),  # bidirectional LSTM layer
    Dense(128, activation='relu'),   # dense layer
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(6, activation='sigmoid'), # output layer
])

In [34]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [31]:
model.summary()

In [35]:
history = model.fit(train, epochs=1, validation_data=val)

[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3465s[0m 495ms/step - accuracy: 0.9723 - loss: 0.0851 - val_accuracy: 0.9946 - val_loss: 0.0483


# Make Predictions 

In [45]:
input_text = vectorizer('your fucking ugly')

In [37]:
batch = test.as_numpy_iterator().next()

In [38]:
input_text

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([  21,   30, 1025, ...,    0,    0,    0], dtype=int64)>

In [None]:
res = model.predict(np.expand_dims(input_text, axis=0)) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step


In [47]:
res

array([[0.9949022 , 0.2229113 , 0.9549315 , 0.0176293 , 0.7843742 ,
        0.09508211]], dtype=float32)

In [48]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

# Evaluate

In [50]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [51]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator():  # iterate over the test dataset 
    X_true, y_true = batch  
    y_pred = model.predict(X_true)
    
    y_true = y_true.flatten()   # flatten the true labels
    y_pred = y_pred.flatten()   # flatten the predicted labels  
    
    pre.update_state(y_true, y_pred)
    re.update_state(y_true, y_pred)
    acc.update_state(y_true, y_pred)
    

In [53]:
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}, Accuracy: {acc.result().numpy()}')      

Precision: 0.899954080581665, Recall: 0.5528615713119507, Accuracy: 0.476429283618927, Accuracy: 0.476429283618927


In [54]:
from tensorflow.keras.models import load_model

model.save('toxicity.keras')