In [18]:
import os 
import pandas as pd
import numpy as np
import tensorflow as tf

In [19]:
mkdir = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv'

In [20]:
df = pd.read_csv(mkdir)

In [21]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [22]:
df['comment_text'][6]

'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'

In [23]:
df[df.columns[2:]].iloc[6]

toxic            1
severe_toxic     1
obscene          1
threat           0
insult           1
identity_hate    0
Name: 6, dtype: int64

# PreProcess

In [24]:
from tensorflow.keras.layers import TextVectorization

In [25]:
X = df['comment_text']
Y = df[df.columns[2:]].values

In [27]:
Max_features = 20000 #No of words in vocabulary
vectorizer = TextVectorization(max_tokens = Max_features , output_sequence_length = 1800 , output_mode = 'int')
# Text Vectorization set the text to lower case and remove punctuation

In [28]:
vectorizer.adapt(X.values)

In [29]:
vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'to', 'of', 'and', 'a', 'you', 'i', 'is']

In [30]:
Vectorized_text = vectorizer(X.values)

In [31]:
dataset = tf.data.Dataset.from_tensor_slices((Vectorized_text,Y))
dataset = dataset.cache()
dataset = dataset.shuffle(16000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(6)

In [32]:
batch_x , batch_y = dataset.as_numpy_iterator().next()

In [34]:
batch_x.shape

(16, 1800)

In [36]:
len(dataset)

9974

In [35]:
train =dataset.take(int(len(dataset)*0.7))
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

In [37]:
train_generator = train.as_numpy_iterator()

# Create Sequential Model

In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [57]:
model = Sequential()
model.add(Embedding(Max_features+1, 32))
model.add(Bidirectional(LSTM(32 , activation = 'tanh')))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(256, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
model.add(Dense(6 , activation = 'sigmoid'))

In [58]:
model.compile(loss = 'BinaryCrossentropy' , optimizer = 'adam')

In [59]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          640032    
                                                                 
 bidirectional_2 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 128)               8320      
                                                                 
 dense_9 (Dense)             (None, 256)               33024     
                                                                 
 dense_10 (Dense)            (None, 128)               32896     
                                                                 
 dense_11 (Dense)            (None, 6)                 774       
                                                      

In [60]:
history  = model.fit(train , epochs= 1 ,validation_data = val)



In [64]:
history.history

{'loss': [0.06175604462623596], 'val_loss': [0.04889845848083496]}

In [65]:
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize = (8,5))
pd.DataFrame(history.history).plot()
plt.show()

# Make Prediction

In [67]:
input_text = vectorizer('I hate you!')

In [68]:
batch = test.as_numpy_iterator().next()

In [86]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

### Here we can compare this probability with index/columns of dataframe

In [69]:
model.predict(np.array([input_text]))



array([[0.7403466 , 0.0234674 , 0.36900547, 0.02503971, 0.33090192,
        0.06069288]], dtype=float32)

In [70]:
res = model.predict(np.expand_dims(input_text,0))



# Evaluate Model

In [71]:
from tensorflow.keras.metrics import Precision,Recall, CategoricalAccuracy

In [72]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [73]:
for batch in test.as_numpy_iterator():
    X_true , Y_true = batch
    yhat= model.predict(X_true)
    
    Y_true = Y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(Y_true , yhat)
    re.update_state(Y_true , yhat)
    acc.update_state(Y_true , yhat)



In [75]:
print(f'Precision : {pre.result().numpy()},Recall : {re.result().numpy()},Accuracy = {acc.result().numpy()}')

Precision : 0.8143520355224609,Recall : 0.6387566328048706,Accuracy = 0.45336008071899414


In [88]:
model.save('toxicity.h5')

  saving_api.save_model(
