##Implementation of simple RNN and LSTM for text classification (sentiment analysis)
- In this python file, we will add multiple layers, to reduce the overfit
  - We will use dropouts
  - Batch normalization
  - Layer Normalization
  - Stacking
  - Bidirectional LSTM


In [1]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, BatchNormalization, Dropout, Embedding, LayerNormalization, Bidirectional
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer

## text preprocessing modules
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec
import re
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
### Load the data
review_data=pd.read_csv("/content/Imdb_preprocessed (1).csv.gz")

In [3]:
review_data.columns

Index(['Unnamed: 0', 'review', 'sentiment', 'cleaned_reveiws_w/o_SW',
       'cleaned_reviews_with_SW', 'cleaned_reviews_with_SW2', 'review_length',
       'sentiment_target'],
      dtype='object')

In [5]:
### Take the reviews text as a list of elements and then perform the basic preprocessing 
text= list(review_data['cleaned_reviews_with_SW'])
text[0:2]

['reviewer mentioned watching episode hooked right exactly happened thing struck brutality unflinching scene violence set right word trust faint hearted timid pull punch regard drug sex violence hardcore classic use word called nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda city home aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement far away say main appeal fact go show dare forget pretty picture painted mainstream audience forget charm forget romance mess episode saw struck nasty surreal say ready watched developed taste got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order away mannered middle class inmate turned prison bitch lack street skill prison experience watching comfortable uncomfortable viewing touch darker ',
 'wonderful little production filming technique una

In [6]:
### Now the reviews are faily clean, we will tokenize the reviews and get word embeddings for these
tokenizer=Tokenizer(num_words=5000,lower=True,oov_token='UNK')


In [8]:
## Before applying the tokenizer, lets split the data into train test
train_text=text[:len(text)-500]
test_text=text[-500:]
print(len(text),len(train_text),len(test_text))

50000 49500 500


In [9]:
### Now lets tokenize the reviews
tokenizer.fit_on_texts(train_text)

In [10]:
review_data['sentiment'].replace(['positive','negative'],[1,0],inplace=True)
review_data['sentiment'].value_counts()

Y=list(review_data['sentiment'])
train_y=Y[:len(Y)-500]
test_y=Y[-500:]
print(len(train_y),len(test_y))

49500 500


In [11]:
train_indices=tokenizer.texts_to_sequences(train_text)
train_indices=np.asarray(train_indices)
train_y=np.asarray(train_y)

  train_indices=np.asarray(train_indices)


In [12]:
## As mentioned we are limiting the number of words say 64
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_length=64
train_indices=pad_sequences(train_indices,maxlen=max_length,padding='post')

In [13]:
test_indices=tokenizer.texts_to_sequences(test_text)
test_indices=pad_sequences(test_indices,maxlen=max_length,padding='post')
test_indices=np.asarray(test_indices)
test_y=np.asarray(test_y)

In [14]:
### Creating the model architecture Using dropouts
vocab= len(tokenizer.word_index)+1
model= Sequential()
model.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length))
model.add(Dropout(0.3))
model.add(SimpleRNN(32))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 100)           13522400  
                                                                 
 dropout (Dropout)           (None, 64, 100)           0         
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                4256      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 13,526,689
Trainable params: 13,526,689
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
model.fit(train_indices,train_y,batch_size=32,epochs=1)
# Final evaluation of the model
scores = model.evaluate(test_indices, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 85.00%


In [16]:
## Creating the model architecture-- Dropouts with LSTM
vocab= len(tokenizer.word_index)+1
model= Sequential()
model.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length))
model.add(Dropout(0.3))
model.add(LSTM(32))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 64, 100)           13522400  
                                                                 
 dropout_2 (Dropout)         (None, 64, 100)           0         
                                                                 
 lstm (LSTM)                 (None, 32)                17024     
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 13,539,457
Trainable params: 13,539,457
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
model.fit(train_indices,train_y,batch_size=32)
# Final evaluation of the model
scores = model.evaluate(test_indices, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 84.80%


In [18]:
#### Lets Look at having a batch normalization alng with dropout 
vocab= len(tokenizer.word_index)+1
model= Sequential()
model.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length))
model.add(Dropout(0.3))
model.add(SimpleRNN(32))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 64, 100)           13522400  
                                                                 
 dropout_4 (Dropout)         (None, 64, 100)           0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                4256      
                                                                 
 batch_normalization (BatchN  (None, 32)               128       
 ormalization)                                                   
                                                                 
 dropout_5 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                      

In [19]:
model.fit(train_indices,train_y,batch_size=32)
# Final evaluation of the model
scores = model.evaluate(test_indices, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 82.60%


In [20]:
#### Lets Look at having a layer normalization alng with dropout 
vocab= len(tokenizer.word_index)+1
model= Sequential()
model.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length))
model.add(Dropout(0.3))
model.add(SimpleRNN(32))
model.add(LayerNormalization())
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 64, 100)           13522400  
                                                                 
 dropout_6 (Dropout)         (None, 64, 100)           0         
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                4256      
                                                                 
 layer_normalization (LayerN  (None, 32)               64        
 ormalization)                                                   
                                                                 
 dropout_7 (Dropout)         (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                      

In [21]:
model.fit(train_indices,train_y,batch_size=32)
# Final evaluation of the model
scores = model.evaluate(test_indices, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 81.40%


In [22]:
### Strengthening using bidirectional LSTM
vocab= len(tokenizer.word_index)+1
model= Sequential()
model.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length))
model.add(Dropout(0.3))
model.add(Bidirectional(SimpleRNN(32)))
model.add(LayerNormalization())
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 64, 100)           13522400  
                                                                 
 dropout_8 (Dropout)         (None, 64, 100)           0         
                                                                 
 bidirectional (Bidirectiona  (None, 64)               8512      
 l)                                                              
                                                                 
 layer_normalization_1 (Laye  (None, 64)               128       
 rNormalization)                                                 
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                

In [23]:
model.fit(train_indices,train_y,batch_size=32)
# Final evaluation of the model
scores = model.evaluate(test_indices, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 82.40%


In [24]:
### Stacked LSTM
vocab= len(tokenizer.word_index)+1
model= Sequential()
model.add(Embedding(input_dim=vocab,output_dim=100,input_length=max_length))
model.add(Dropout(0.3))
model.add(LSTM(32,return_sequences=True))
model.add(LayerNormalization())
model.add(LSTM(32))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 64, 100)           13522400  
                                                                 
 dropout_10 (Dropout)        (None, 64, 100)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64, 32)            17024     
                                                                 
 layer_normalization_2 (Laye  (None, 64, 32)           64        
 rNormalization)                                                 
                                                                 
 lstm_2 (LSTM)               (None, 32)                8320      
                                                                 
 dropout_11 (Dropout)        (None, 32)                0         
                                                      

In [25]:
model.fit(train_indices,train_y,batch_size=32)
# Final evaluation of the model
scores = model.evaluate(test_indices, test_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 83.40%
