<a href="https://colab.research.google.com/github/Serurays/Comment_Toxicity_NLP/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install tensorflow pandas matplotlib scikit-learn



In [4]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [5]:
# Location of the labelled training CSV (the path matches the Colab runtime this notebook targets).
TRAIN_CSV = "/content/train.csv"
df = pd.read_csv(TRAIN_CSV)

In [6]:
# Peek at the first rows to confirm the schema: an id, the raw comment text,
# and six 0/1 label columns.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
# Same preview restricted to rows flagged `toxic`, to see what positive examples look like.
df[df["toxic"] == 1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


# **Preprocessing**

In [8]:
from tensorflow.keras.layers import TextVectorization

# Model input: the raw comment strings.
X = df["comment_text"]

# Targets: every column after `id` and `comment_text`, i.e. the six binary labels
# (toxic, severe_toxic, obscene, threat, insult, identity_hate), as a 2-D array.
label_columns = df.columns[2:]
y = df[label_columns].values

In [9]:
# Vocabulary cap: passed to TextVectorization as `max_tokens` and used to size the Embedding input.
MAX_FEATURES = 200_000

In [10]:
# Text -> integer token ids. Every comment comes out as exactly 1800 ids
# (the layer pads or truncates to `output_sequence_length`).
vectorizer = TextVectorization(
    output_mode="int",
    max_tokens=MAX_FEATURES,
    output_sequence_length=1800,
)

In [11]:
# Fit the vectorizer on the full comment corpus so it can build its vocabulary.
vectorizer.adapt(X.values)

In [12]:
# Smoke test: vectorise one sentence and show its first five token ids.
vectorizer("Hello world, life is great!")[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([288, 263, 306,   9, 275])>

In [13]:
# Vectorise the whole corpus eagerly in one call. The result is a dense int64 tensor of
# shape (159571, 1800) - roughly 2.3 GB - so this cell needs a correspondingly large
# amount of memory.
vectorized_text = vectorizer(X.values)

In [14]:
# Show the (truncated) id matrix; the trailing zeros are padding up to the fixed length.
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [15]:
# Input pipeline: (token ids, labels) pairs -> cache -> shuffle -> batch -> prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# The buffer (160000) exceeds the 159571 rows, so this is a full shuffle.
# `reshuffle_each_iteration=False` pins the order. The train/val/test splits are carved
# out of this dataset with take()/skip(); with the default (a new permutation on every
# iteration) examples would move between the splits from one pass to the next, leaking
# validation/test data into training. Trade-off: batches arrive in the same order every epoch.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # prepare up to 8 batches ahead of the consumer

In [16]:
# Pull a single batch as NumPy arrays to inspect what the pipeline emits.
batch_X, batch_y = next(dataset.as_numpy_iterator())

In [17]:
# Expect 16 examples per batch: 1800 token ids each, with 6 labels.
batch_X.shape, batch_y.shape

((16, 1800), (16, 6))

In [18]:
# Split by batch count: the first 70% of batches train, the next 20% validate,
# and the final 10% (starting at the 90% mark) are held out for testing.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)
n_test = int(n_batches * .1)
test_offset = int(n_batches * .9)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
test = dataset.skip(test_offset).take(n_test)

In [19]:
# Number of batches in each split.
len(train), len(val), len(test)

(6981, 1994, 997)

# **Create Sequential Model**

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Bidirectional, Embedding

In [21]:
# Embedding -> LSTM encoder -> three dense layers -> six sigmoid outputs (one per label).
model = Sequential([
    # Input dimension is the vocabulary cap plus one; each token maps to a 32-d vector.
    Embedding(MAX_FEATURES+1, 32),
    LSTM(32, activation="tanh"),
    Dense(128, activation="relu"),
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    # Sigmoid (not softmax): the labels are independent and can co-occur.
    Dense(6, activation="sigmoid"),
])

In [22]:
# Binary cross-entropy scores each of the six sigmoid outputs as its own yes/no problem.
model.compile(
    optimizer="Adam",
    loss="BinaryCrossentropy",
)

In [23]:
# Print the layer-by-layer architecture.
model.summary()

In [None]:
# One pass over the training split, with the validation split passed in for scoring.
# `history` keeps the recorded losses for the plot below.
history = model.fit(
    train,
    validation_data=val,
    epochs=1,
)

[1m2264/6981[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:27:07[0m 1s/step - loss: 0.1667

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Let pandas create the figure: DataFrame.plot() opens its own axes, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind and its size is ignored.
# Markers keep the curves visible when there is a single epoch (one point per series).
ax = pd.DataFrame(history.history).plot(figsize=(8, 5), marker="o")
ax.set(title="Training history", xlabel="Epoch", ylabel="Loss")
plt.show()

# **Prediction**

In [None]:
# Vectorise a hand-written comment with the same layer used for the training text.
input_text = vectorizer("You freaking suck!")
input_text

In [None]:
# The model expects a batch dimension, so wrap the single sequence as a batch of one.
single_comment_batch = np.expand_dims(input_text, axis=0)
res = model.predict(single_comment_batch)

In [None]:
# Grab one batch from the held-out test split for a qualitative check.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Threshold the sigmoid outputs at 0.5 to get a 0/1 prediction per label for that batch.
(model.predict(batch_X) > 0.5).astype(int)

In [None]:
# Ground-truth labels of the same batch, for side-by-side comparison.
batch_y

# **Evaluation**

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics, accumulated batch by batch over the test split.
pre = Precision()
re = Recall()
# Each of the six outputs is an independent yes/no decision (multi-label), so score them
# with BinaryAccuracy. CategoricalAccuracy compares argmax positions, which assumes
# exactly one true class per sample and is not meaningful for these targets.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Accumulate precision / recall / accuracy over every batch of the test split.
for batch in test.as_numpy_iterator():
  # Each batch is a (token ids, multi-hot labels) pair.
  X_true, y_true = batch

  # Predict from the inputs; passing the labels here would score nothing meaningful.
  # verbose=0 suppresses the per-call progress bar, which would otherwise print once per batch.
  y_hat = model.predict(X_true, verbose=0)

  # Flatten so every (sample, label) cell counts as one binary decision.
  y_true, y_hat = y_true.flatten(), y_hat.flatten()

  pre.update_state(y_true, y_hat)
  re.update_state(y_true, y_hat)
  acc.update_state(y_true, y_hat)

In [None]:
# Report the precision and recall accumulated over the test split.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
print(f"Precision: {precision_value}, Recall: {recall_value}")