In [97]:
#loading all libraies needed for the project

import pandas as pd
import numpy as np 
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Embedding, Dense, TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy


In [71]:
#importing Training dataset

df = pd.read_csv("train.csv.zip")

df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Preping the data for Training

In [73]:
#defining X and y

X = df["comment_text"]
y = df[df.columns[2:]].values


In [74]:
#preparing the vectorizer and defining the number of features

features = 200000
vectorizer = TextVectorization(max_tokens= features, output_sequence_length= 1800, output_mode= "int")
vectorizer.adapt(X.values)
vectorized_text = vectorizer(X.values)
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [75]:
#using tensorflow to batch the data

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) 

In [76]:
#batch_X, batch_y =  dataset.as_numpy_iterator().next()

In [77]:
#batch_y.shape

In [78]:
#slicing the training data into 3 sections for training

train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [79]:
train_generator = train.as_numpy_iterator()


In [80]:
#testing to see if the batching works

train_generator.next()

(array([[86091,     8,   134, ...,     0,     0,     0],
        [ 5252,  7467,   999, ...,     0,     0,     0],
        [ 9497,     7,  9497, ...,     0,     0,     0],
        ...,
        [  983,   655,   179, ...,     0,     0,     0],
        [   18,     8,    33, ...,     0,     0,     0],
        [    8,    19,   282, ...,     0,     0,     0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 1, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 1],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

In [81]:
#preparing the Sequential model for training using the Bidirectional method

model = Sequential()
model.add(Embedding(features+1, 32))
model.add(Bidirectional(LSTM(32, activation="tanh")))
model.add(Dense(128, activation = "relu"))
model.add(Dense(256, activation = "relu"))
model.add(Dense(128, activation = "relu"))
model.add(Dense(6, activation = "sigmoid"))

In [82]:
model.compile(optimizer= "Adam",
              loss=tf.keras.losses.BinaryCrossentropy())
              

In [83]:
#I dont know why this showed up like this, but the training worked afterwards, so its ok I guess.

model.summary()

## Training the data

In [84]:
#Command to start the training, would have used more epochs, but ran out of time, model could potentially have been more accurte with more epochs
#side note, tried running without verbose, and model wouldnt run.

history = model.fit(train, epochs=1, validation_data=val, verbose=1)

[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2658s[0m 380ms/step - loss: 0.0822 - val_loss: 0.0489


## Testing the results

In [None]:
#using some random input text to see how well the training worked
#results aren't showing because we deleted them, but decided to add them back last minute

input_text = vectorizer('You freaking suck! I am going to hit you.')

In [None]:
res = model.predict(input_text)

In [None]:
(res > 0.5).astype(int)

In [None]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [None]:
(model.predict(batch_X) > 0.5).astype(int)

In [None]:
res.shape

## Prepping for submission

In [108]:
test_df = pd.read_csv("test.csv")

In [109]:
input_text = vectorizer(test_df['comment_text'])
input_text = np.expand_dims(input_text, axis=0)
input_text = tf.reshape(input_text, shape=(-1, 1800))
pred = (model.predict(input_text) > 0.5 ).astype(int)

[1m4787/4787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m354s[0m 74ms/step


In [110]:
print(pred)
pred.shape

out_df = pd.concat([test_df, pd.DataFrame(pred)], axis=1)
out_df.head()

out_df = out_df.rename(columns={0: 'toxic', 1: 'severe_toxic', 2: 'obscene', 3: 'threat', 4: 'insult', 5: 'identity_hate'})
out_df = out_df.drop(columns=['comment_text'])
out_df.head()

out_df.to_csv('out.csv', index=False)

[[1 1 1 0 1 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 0 1 0 1 0]]


## This section calculates accuracy and some other stuff

In [98]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [99]:
for batch in test.as_numpy_iterator():
    X_true, y_true = batch
    yhat = model.predict(X_true)
    
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72

In [100]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.7218456864356995, Recall:0.7900142669677734, Accuracy:0.49949848651885986
