<a href="https://colab.research.google.com/github/Saketkr06/NLP/blob/main/Deep_Learning_For_NLP_Zero_To_Transformers_%26_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install kaggle



In [2]:
# Install the Kaggle API token: kaggle.json is copied from the working directory into
# ~/.kaggle and restricted to owner read/write (chmod 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# Download the Jigsaw multilingual toxic-comment competition archive with the Kaggle CLI
!kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification

Downloading jigsaw-multilingual-toxic-comment-classification.zip to /content
100% 1.08G/1.08G [00:14<00:00, 61.6MB/s]
100% 1.08G/1.08G [00:14<00:00, 81.8MB/s]


In [22]:
# Download the GloVe 840B 300d vectors (Kaggle dataset takuok/glove840b300dtxt)
!kaggle datasets download -d takuok/glove840b300dtxt

Downloading glove840b300dtxt.zip to /content
 99% 2.06G/2.08G [00:32<00:00, 96.4MB/s]
100% 2.08G/2.08G [00:32<00:00, 68.6MB/s]


In [23]:
# Extract the compressed GloVe archive into the current working directory
from zipfile import ZipFile
dataset = '/content/glove840b300dtxt.zip'

# Bind the archive to `archive` rather than `zip`, so the builtin zip() is not
# shadowed for every later cell in the notebook.
with ZipFile(dataset,'r') as archive:
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [4]:
# Extract the compressed competition archive into the current working directory
from zipfile import ZipFile
dataset = '/content/jigsaw-multilingual-toxic-comment-classification.zip'

# Bind the archive to `archive` rather than `zip`, so the builtin zip() is not
# shadowed for every later cell in the notebook.
with ZipFile(dataset,'r') as archive:
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM,GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
# Distribution strategy: the models below are built inside strategy.scope(), and the
# training batch size is scaled by strategy.num_replicas_in_sync.
strategy = tf.distribute.MirroredStrategy()

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff



In [6]:
# Load the competition CSVs that were extracted into /content
train = pd.read_csv('/content/jigsaw-toxic-comment-train.csv')
validation = pd.read_csv('/content/validation.csv')
test = pd.read_csv('/content/test.csv')

In [7]:
# Drop the fine-grained label columns so only the binary `toxic` target is left.
# Reassigning (instead of inplace=True) and ignoring already-missing columns keeps the
# cell idempotent: re-running it no longer raises KeyError.
train = train.drop(
    columns=['severe_toxic','obscene','threat','insult','identity_hate'],
    errors='ignore',
)

In [8]:
# Work on a small subset. `.loc` slicing is label-based and inclusive of the end label,
# so this keeps labels 0..12000 (12,001 rows, as the shape below shows).
train = train.loc[:12000]
train.shape

(12001, 3)

In [9]:
# Longest comment measured in whitespace-separated tokens (used to pick max_len later)
train['comment_text'].map(lambda comment: len(str(comment).split())).max()

1403

In [10]:
def roc_auc(predictions,target):
  """Return the area under the ROC curve of `predictions` scored against `target`.

  `target` holds the true labels and `predictions` the model scores; both are passed
  straight to `metrics.roc_curve`, and the resulting curve is integrated with
  `metrics.auc`.
  """
  false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
  return metrics.auc(false_pos_rate, true_pos_rate)

In [11]:
# Check which columns remain after dropping the extra label columns
train.columns

Index(['id', 'comment_text', 'toxic'], dtype='object')

In [12]:
# Shuffled 80/20 split, stratified on the `toxic` label, with a fixed seed for repeatability
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.comment_text.values,
    train.toxic.values,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=train.toxic.values,
)

In [13]:
from keras.utils import pad_sequences

In [14]:
max_len = 1500
token = text.Tokenizer(num_words=None)

# Build the vocabulary from the training and validation comments together
token.fit_on_texts(list(xtrain) + list(xvalid))
word_index = token.word_index

# Encode each split as integer sequences, then zero-pad every sequence to max_len
xtrain_seq = token.texts_to_sequences(xtrain)
xtrain_pad = pad_sequences(xtrain_seq, maxlen=max_len)

xvalid_seq = token.texts_to_sequences(xvalid)
xvalid_pad = pad_sequences(xvalid_seq, maxlen=max_len)

In [15]:
%%time
with strategy.scope():
    # A simpleRNN without any pretrained embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     input_length=max_len))
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         13049100  
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________
CPU times: user 433 ms, sys: 168 ms, total: 601 ms
Wall time: 555 ms


In [16]:
# Train the SimpleRNN for 5 epochs; the batch size is 64 samples per replica
model.fit(
    xtrain_pad,
    ytrain,
    epochs=5,
    batch_size=64 * strategy.num_replicas_in_sync,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ac5dfc30e50>

In [19]:
# Score the held-out split and report its ROC AUC
scores = model.predict(xvalid_pad)
print("AUC :", roc_auc(scores, yvalid))

AUC : 0.8758880481785135


In [20]:
# Start the per-model results list with the SimpleRNN score
scores_model = [
    {'Model': 'SimpleRNN', 'AUC_Score': roc_auc(scores, yvalid)},
]

In [21]:
# Peek at the first encoded training comment (a list of integer word indices)
xtrain_seq[:1]

[[664,
  65,
  7,
  19,
  2262,
  14102,
  5,
  2262,
  20439,
  6071,
  4,
  71,
  32,
  20440,
  6620,
  39,
  6,
  664,
  65,
  11,
  8,
  20441,
  1502,
  38,
  6072]]

In [25]:
# Parse the GloVe text file into a {token: coefficient vector} dictionary.
embeddings_index={}
# The context manager closes the file even if a malformed line makes float() raise,
# which the previous manual open()/close() pair did not guarantee.
with open('/content/glove.840B.300d.txt','r',encoding='utf-8') as f:
  for line in tqdm(f):
    # Fields are separated by a single space: the first is the token,
    # the remaining ones are its coefficients.
    values=line.split(' ')
    word=values[0]
    coefs=np.asarray([float(val) for val in values[1:]])
    embeddings_index[word]=coefs

print('Found %s word vectors.' % len(embeddings_index))

2196018it [04:43, 7740.07it/s]

Found 2196017 word vectors.





## LSTM with pre-trained GloVe embeddings

In [26]:
# One 300-wide row per vocabulary index (plus row 0); words without a GloVe vector stay all-zero
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for vocab_word, row in tqdm(word_index.items()):
  glove_vector = embeddings_index.get(vocab_word)
  if glove_vector is None:
    continue
  embedding_matrix[row] = glove_vector

100%|██████████| 43496/43496 [00:00<00:00, 80285.38it/s]


In [27]:
%%time
with strategy.scope():
  model=Sequential()
  model.add(Embedding(len(word_index)+1,
                      300,
                      weights=[embedding_matrix],
                      input_length=max_len,
                      trainable=False))

  model.add(LSTM(100,dropout=0.3,recurrent_dropout=0.3))
  model.add(Dense(1,activation='sigmoid'))
  model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1500, 300)         13049100  
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 13,209,601
Trainable params: 160,501
Non-trainable params: 13,049,100
_________________________________________________________________
CPU times: user 811 ms, sys: 161 ms, total: 972 ms
Wall time: 1.84 s


In [29]:
# Train the LSTM for 5 epochs; the batch size is 64 samples per replica
model.fit(
    xtrain_pad,
    ytrain,
    epochs=5,
    batch_size=64 * strategy.num_replicas_in_sync,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ac58ff89c00>

In [30]:
# Score the held-out split and report its ROC AUC.
# AUC is a 0-1 score, so it is printed as-is (no percent sign), in the same format
# as the SimpleRNN evaluation cell.
scores = model.predict(xvalid_pad)
print("AUC :", roc_auc(scores, yvalid))

Auc: 0.97%


In [31]:
# Record the LSTM's validation AUC for the final comparison table
scores_model.append({
    'Model': 'LSTM',
    'AUC_Score': roc_auc(scores, yvalid),
})

In [34]:
# Frozen GloVe embeddings -> SpatialDropout1D(0.3) -> GRU(300) -> sigmoid unit
with strategy.scope():
  model = Sequential([
      Embedding(
          len(word_index) + 1,
          300,
          weights=[embedding_matrix],
          input_length=max_len,
          trainable=False,
      ),
      SpatialDropout1D(0.3),
      GRU(300),
      Dense(1, activation='sigmoid'),
  ])
  model.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['accuracy'],
  )

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1500, 300)         13049100  
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 1500, 300)        0         
 lDropout1D)                                                     
                                                                 
 gru_1 (GRU)                 (None, 300)               541800    
                                                                 
 dense_3 (Dense)             (None, 1)                 301       
                                                                 
Total params: 13,591,201
Trainable params: 542,101
Non-trainable params: 13,049,100
_________________________________________________________________


In [33]:
# Train the GRU for 5 epochs; the batch size is 64 samples per replica.
# NOTE(review): the saved execution counts show this cell ran as In [33], before the GRU
# model cell was last executed as In [34]. The recorded GRU AUC therefore probably comes
# from a re-initialised, untrained model. Restart the kernel and run all cells in order
# to obtain a valid score.
model.fit(
    xtrain_pad,
    ytrain,
    epochs=5,
    batch_size=64 * strategy.num_replicas_in_sync,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ac5d768d4b0>

In [36]:
# Score the held-out split and report its ROC AUC.
# AUC is a 0-1 score, so it is printed as-is (no percent sign), in the same format
# as the SimpleRNN evaluation cell.
scores = model.predict(xvalid_pad)
print("AUC :", roc_auc(scores, yvalid))

Auc: 0.38%


In [37]:
# Record the GRU's validation AUC for the final comparison table
scores_model.append({
    'Model': 'GRU',
    'AUC_Score': roc_auc(scores, yvalid),
})

In [38]:
#Visualization of Results obtained from various Deep learning models
results = pd.DataFrame(scores_model).sort_values(by='AUC_Score',ascending=False)
results.style.background_gradient(cmap='Blues')

Unnamed: 0,Model,AUC_Score
1,LSTM,0.971554
0,SimpleRNN,0.875888
2,GRU,0.380294
