# Comment toxicity using LSTM

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf 

In [2]:
# Turn on memory growth for each GPU that TensorFlow detects, so that GPU
# memory is allocated as needed instead of being reserved up front.
gpus=tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device,enable=True)

In [3]:
# Directory that contains train.csv. Set the TOXIC_COMMENTS_DIR environment
# variable to point somewhere else; the original hard-coded Windows path is
# kept as the fallback so existing setups keep working unchanged.
DATA_DIR = os.environ.get("TOXIC_COMMENTS_DIR", r"D:\toxic_comments")
file_path = os.path.join(DATA_DIR, 'train.csv')
# Load the labelled comments into a DataFrame.
df = pd.read_csv(file_path)

In [4]:
# Display the loaded DataFrame (the notebook renders a truncated preview).
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [5]:
# Look at the raw text of one example comment (row label 4).
df['comment_text'][4]

"You, sir, are my hero. Any chance you remember what page that's on?"

In [6]:
# Labels of that same example: every column from position 2 onward is a
# 0/1 toxicity flag.
df.iloc[:,2:].iloc[4]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 4, dtype: int64

# Text Vectorization

In [7]:
from tensorflow.keras.layers import TextVectorization

In [8]:
# Split the frame into model inputs and targets.
# Columns from position 2 onward hold the six toxicity flags.
label_columns=df.columns[2:]
X=df['comment_text']           # raw comment strings
y=df[label_columns].values     # (n_comments, n_labels) integer array

In [9]:
# Inspect the input Series of raw comment strings.
X

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [10]:
# Inspect the label matrix (one row per comment, one column per label).
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [11]:
MAX_FEATURES=200000 # Vocabulary size cap: passed as max_tokens to TextVectorization and used to size the Embedding layer

In [12]:
# Text -> integer-id layer. The vocabulary is capped at MAX_FEATURES tokens and
# every comment is emitted as a fixed-length sequence of 1800 ids.
vectorizer=TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [13]:
# Fit the vectorizer's vocabulary on the raw comment strings.
vectorizer.adapt(X.values)

In [14]:
# Convert every comment to its integer id sequence in a single call; the result
# is one in-memory tensor with a row per comment.
vectorized_text=vectorizer(X.values)

In [15]:
# Inspect the vectorized comments (trailing zeros are padding).
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  642,    75,     2, ...,     0,     0,     0],
       [    1,    54,  2497, ...,     0,     0,     0],
       [  424,   437,    70, ...,     0,     0,     0],
       ...,
       [31970,  7300,   382, ...,     0,     0,     0],
       [    5,    12,   532, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [16]:
# Input pipeline: (token ids, labels) pairs -> cache -> shuffle -> batch -> prefetch.
SHUFFLE_SEED=42
dataset=tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset=dataset.cache()
# Shuffle ONCE, in a fixed and reproducible order. The train/val/test subsets
# are carved out of this dataset with take()/skip(); with the default
# reshuffle_each_iteration=True each subset would iterate over a differently
# shuffled stream, so validation/test examples would leak into training.
# The buffer spans every example so the shuffle is uniform over the whole set.
# NOTE: because the order is now fixed, re-shuffle the training subset
# separately if you train for more than one epoch.
dataset=dataset.shuffle(buffer_size=len(y),
                        seed=SHUFFLE_SEED,
                        reshuffle_each_iteration=False)
dataset=dataset.batch(16)
# Prepare up to 8 batches ahead so the consumer is not left waiting on input.
dataset=dataset.prefetch(8)

In [17]:
# Pull a single (inputs, labels) batch as NumPy arrays for inspection.
batch_X,batch_y=next(dataset.as_numpy_iterator())

In [18]:
# Expect (batch size, sequence length).
batch_X.shape

(16, 1800)

In [19]:
# Expect (batch size, number of labels).
batch_y.shape

(16, 6)

In [20]:
# Partition the batched dataset by batch count: the first 70% of batches for
# training, the next 20% for validation, and 10% starting at the 90% mark for
# testing.
# NOTE(review): these slices are only disjoint if `dataset` yields its batches
# in the same order every time it is iterated - confirm how it was shuffled.
n_batches=len(dataset)
n_train=int(n_batches*.7)
n_val=int(n_batches*.2)
n_test=int(n_batches*.1)
test_offset=int(n_batches*.9)

train=dataset.take(n_train)
val=dataset.skip(n_train).take(n_val)
test=dataset.skip(test_offset).take(n_test)

In [21]:
# Number of batches in the test subset.
len(test)

997

In [22]:
# Iterator over the training batches that yields NumPy arrays.
train_generator=train.as_numpy_iterator()

In [23]:
# Fetch the next (inputs, labels) training batch to eyeball it.
next(train_generator)

(array([[    7,   495,   672, ...,     0,     0,     0],
        [   77,    82,     2, ...,     0,     0,     0],
        [15032,  2201,    14, ...,     0,     0,     0],
        ...,
        [    8,    69,    61, ...,     0,     0,     0],
        [ 6851,     7,    19, ...,     0,     0,     0],
        [   45,    65,    23, ...,     0,     0,     0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

# Model Building

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dropout,Bidirectional,Dense,Embedding

In [25]:
# Multi-label classifier: token ids -> embeddings -> bidirectional LSTM ->
# dense stack -> six independent sigmoid outputs (one per label column).
model=Sequential([
    # One 32-dimensional vector per token id; +1 leaves headroom over MAX_FEATURES.
    Embedding(MAX_FEATURES+1,32),
    # Reads the sequence in both directions and returns a single vector per comment.
    Bidirectional(LSTM(32,activation='tanh')),
    # Fully connected feature extractor.
    Dense(128,activation='relu'),
    Dense(256,activation='relu'),
    Dense(128,activation='relu'),
    # Sigmoid rather than softmax: several labels can be active for one comment.
    Dense(6,activation='sigmoid'),
])

In [26]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [27]:
# Binary cross-entropy pairs with the per-label sigmoid outputs; Adam is used
# with its default settings.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

For practice purposes, the model is trained for only one epoch, because training takes a long time.

In [28]:
# Train for a single pass over the training batches, evaluating on the
# validation batches at the end of the epoch; `history` keeps the returned
# training record.
history=model.fit(train,epochs=1,validation_data=val)



In [37]:
# Vectorize one hand-written example with the same vectorizer used for training.
input_text=vectorizer("I am going to kill you")

In [38]:
# Grab one (inputs, labels) batch from the test subset as NumPy arrays.
batch=next(test.as_numpy_iterator())

In [39]:
# The model expects a batch, so add a leading batch axis to the single
# vectorized comment before predicting.
res=model.predict(np.expand_dims(input_text,axis=0))

In [40]:
# One sigmoid score per label, in label-column order:
# toxic, severe_toxic, obscene, threat, insult, identity_hate.
res

array([[0.73834956, 0.01037964, 0.27422133, 0.01787125, 0.31130478,
        0.04518363]], dtype=float32)