# Sentiment Analysis on IMDB dataset

In [1]:
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
# from keras.utils import to_categorical
from keras import models
from keras import layers

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D

AttributeError: module 'tensorflow.compat.v2' has no attribute '__internal__'

## Import and Analyse the imdb data

In [2]:
from keras.datasets import imdb
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=10000)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

In [3]:
len(data)   # Checking the size of data

50000

In [4]:
len(training_data), len(training_targets)

(25000, 25000)

In [5]:
len(testing_data), len(testing_targets)

(25000, 25000)

In [6]:
training_data[1]

[1,
 194,
 1153,
 194,
 8255,
 78,
 228,
 5,
 6,
 1463,
 4369,
 5012,
 134,
 26,
 4,
 715,
 8,
 118,
 1634,
 14,
 394,
 20,
 13,
 119,
 954,
 189,
 102,
 5,
 207,
 110,
 3103,
 21,
 14,
 69,
 188,
 8,
 30,
 23,
 7,
 4,
 249,
 126,
 93,
 4,
 114,
 9,
 2300,
 1523,
 5,
 647,
 4,
 116,
 9,
 35,
 8163,
 4,
 229,
 9,
 340,
 1322,
 4,
 118,
 9,
 4,
 130,
 4901,
 19,
 4,
 1002,
 5,
 89,
 29,
 952,
 46,
 37,
 4,
 455,
 9,
 45,
 43,
 38,
 1543,
 1905,
 398,
 4,
 1649,
 26,
 6853,
 5,
 163,
 11,
 3215,
 2,
 4,
 1153,
 9,
 194,
 775,
 7,
 8255,
 2,
 349,
 2637,
 148,
 605,
 2,
 8003,
 15,
 123,
 125,
 68,
 2,
 6853,
 15,
 349,
 165,
 4362,
 98,
 5,
 4,
 228,
 9,
 43,
 2,
 1157,
 15,
 299,
 120,
 5,
 120,
 174,
 11,
 220,
 175,
 136,
 50,
 9,
 4373,
 228,
 8255,
 5,
 2,
 656,
 245,
 2350,
 5,
 4,
 9837,
 131,
 152,
 491,
 18,
 2,
 32,
 7464,
 1212,
 14,
 9,
 6,
 371,
 78,
 22,
 625,
 64,
 1382,
 9,
 8,
 168,
 145,
 23,
 4,
 1690,
 15,
 16,
 4,
 1355,
 5,
 28,
 6,
 52,
 154,
 462,
 33,
 89,
 78,
 2

In [9]:
print("Categories:", np.unique(targets))
print("Number of unique words or Vocab Size:", len(np.unique(np.hstack(data))))

Categories: [0 1]
Number of unique words or Vocab Size: 9998


In [10]:
# Calculating the average and standard deviation of lenghths all reviews 
length = [len(i) for i in data]
print("Average Review length:", np.mean(length))
print("Standard Deviation:", round(np.std(length)))

Average Review length: 234.75892
Standard Deviation: 173


In [11]:
# print the specific review and its label
print("Review:", data[2])
print("Label:", targets[2])

Review: [1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5974, 54, 61, 369, 13, 71, 149, 14, 22, 112, 4, 2401, 311, 12, 16, 3711, 33, 75, 43, 1829, 296, 4, 86, 320, 35, 534, 19, 263, 4821, 1301, 4, 1873, 33, 89, 78, 12, 66, 16, 4, 360, 7, 4, 58, 316, 334, 11, 4, 1716, 43, 645, 662, 8, 257, 85, 1200, 42, 1228, 2578, 83, 68, 3912, 15, 36, 165, 1539, 278, 36, 69, 2, 780, 8, 106, 14, 6905, 1338, 18, 6, 22, 12, 215, 28, 610, 40, 6, 87, 326, 23, 2300, 21, 23, 22, 12, 272, 40, 57, 31, 11, 4, 22, 47, 6, 2307, 51, 9, 170, 23, 595, 116, 595, 1352, 13, 191, 79, 638, 89, 2, 14, 9, 8, 106, 607, 624, 35, 534, 6, 227, 7, 129, 113]
Label: 0


In [12]:
# Dictionary mapping word indices back into the original words so that we can read them. 
# It replaces every unknown word with a “#”. 
# It does this by using the get_word_index() function.

index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()]) 
decoded = " ".join( [reverse_index.get(i - 3, "#") for i in data[0]] )
print(decoded)

# this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for # and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also # to the two little boy's that played the # of norman and paul they were just brilliant children are often left out of the # list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you thi

In [13]:
reverse_index

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

### Split Data into Training and Test Data
Data is already splitted into training and Testing Data

### Build the Tokenizer

In [21]:
desired_vocab_size = 10000 #Vocablury size

### Pad Sequences

In [22]:
#Define maximum number of words to consider in each review
max_review_length = 300

In [23]:
#Pad training and test reviews
training_data = tf.keras.preprocessing.sequence.pad_sequences(training_data.tolist(),
                                                        maxlen=max_review_length,
                                                        padding='pre', truncating='post')
testing_data = tf.keras.preprocessing.sequence.pad_sequences(testing_data.tolist(), 
                                                       maxlen=max_review_length, 
                                                       padding='pre', truncating='post')

In [24]:
len(training_data[5]), len(testing_data[5])    # checking the lenght of review after padding

(300, 300)

In [25]:
training_data[5]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [26]:
training_data.shape, testing_data.shape 

((25000, 300), (25000, 300))

### Build the Graph using RNN_LSTM

In [27]:
#Initialize model
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

Add Embedding layer
 - Embedding Layer Input = Batch_Size * Length of each review

In [28]:
model.add(tf.keras.layers.Embedding(desired_vocab_size + 1, #Vocablury size
                                    50, #Embedding size
                                    input_length=max_review_length) #Number of words in each review
          )

In [29]:
model.output

<tf.Tensor 'embedding/embedding_lookup/Identity_1:0' shape=(None, 300, 50) dtype=float32>

Embedding Layer Output - 
[Batch_Size , Review Length , Embedding_Size]

Add LSTM Layer with 256 as RNN state size

In [30]:
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.LSTM(256)) #RNN State - size of cell state and hidden state
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))

In [31]:
#Compile the model
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [32]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 50)           500050    
_________________________________________________________________
dropout (Dropout)            (None, 300, 50)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 256)               314368    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 814,675
Trainable params: 814,675
Non-trainable params: 0
_________________________________________________________________


In [33]:
result = model.fit(training_data,training_targets,
          epochs=5,
          batch_size=32,          
          validation_data=(testing_data, testing_targets))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Simple LSTM Model Evaluation

In [34]:
# Training Data
score, acc = model.evaluate(training_data, training_targets, batch_size=32)
print("Training loss is:", score)
print('Training accuracy is:', acc)

# Testing data
score, acc = model.evaluate(testing_data, testing_targets, batch_size=32)
print("Test loss is:", score)
print('Test accuracy is:', acc)

Training loss is: 0.10468510538339615
Training accuracy is: 0.9657999873161316
Test loss is: 0.3895992934703827
Test accuracy is: 0.856719970703125


### Generate predictions from Simple LSTM model

In [38]:
print("Generate predictions for 5 samples")
probability_predictions = model.predict(testing_data[:5])
print("Actual Sentiment:", testing_targets[:5])

predictions = []
for i in probability_predictions:
    if i>=0.5:      # taking threshold as middle value
        predictions.append(1)
    else:
        predictions.append(0)
        
print("Predicted Sentiment:", predictions)     

Generate predictions for 5 samples
Actual Sentiment: [0 1 1 0 1]
Predicted Sentiment: [0, 1, 1, 1, 1]


We can use transfer learning using Glove and Word2Vec Embeddings

## Word2Vec Embeddings

### Download Google Word2Vec model

In [39]:
!pip install googledrivedownloader



In [40]:
from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='0B7XkCwpI5KDYNlNUTTlSS21pQmM',
                                    dest_path='./GoogleNews-vectors-negative300.bin.gz',
                                    unzip=True)

In [41]:
import gzip
import shutil

In [42]:
with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:
    with open('GoogleNews-vectors-negative300.bin', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

##### Get Pre-trained Embeddings

In [43]:
import gensim

In [44]:
from gensim.models import Word2Vec, KeyedVectors

# Load pretrained model
model_word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [51]:
t = tf.keras.preprocessing.text.Tokenizer(num_words=desired_vocab_size, oov_token=32) # num_words -> Vocablury size

In [52]:
embedding_vector_length = 300

In [53]:
#Initialize embedding matrix
embedding_matrix = np.zeros((desired_vocab_size + 1, embedding_vector_length))

In [54]:
#Load word vectors for each word from Google Word2Vec model
for word, i in sorted(t.word_index.items(),key=lambda x:x[1]):
    if i > (desired_vocab_size+1):
        break
    try:
        embedding_vector = model_word2vec[word] #Reading word's embedding from Google Word2Vec
        embedding_matrix[i] = embedding_vector
    except:
        pass

In [55]:
#Initialize model
tf.keras.backend.clear_session()
model_word2vec = tf.keras.Sequential()

In [56]:
model_word2vec.add(tf.keras.layers.Embedding(desired_vocab_size + 1, #Vocablury size
                                    300, #Embedding size
                                    weights=[embedding_matrix],
                                    trainable=False,
                                    input_length=max_review_length) #Number of words in each review
          )

In [57]:
model_word2vec.add(tf.keras.layers.Dropout(0.2))
model_word2vec.add(tf.keras.layers.LSTM(256)) #RNN State - size of cell state and hidden state
model_word2vec.add(tf.keras.layers.Dropout(0.2))
model_word2vec.add(tf.keras.layers.Dense(1,activation='sigmoid'))

In [58]:
model_word2vec.output

<tf.Tensor 'dense/Sigmoid:0' shape=(None, 1) dtype=float32>

In [59]:
#Compile the model
model_word2vec.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [60]:
model_word2vec.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          3000300   
_________________________________________________________________
dropout (Dropout)            (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 256)               570368    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 3,570,925
Trainable params: 570,625
Non-trainable params: 3,000,300
_________________________________________________________________


In [62]:
result_word2vec = model_word2vec.fit(training_data,training_targets,
          epochs=5,
          batch_size=32,          
          validation_data=(testing_data, testing_targets))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Word2Vec Embeddings Model Evaluation

In [63]:
# Training Data
score, acc = model_word2vec.evaluate(training_data, training_targets, batch_size=32)
print("Training loss is:", score)
print('Training accuracy is:', acc)

# Testing data
score, acc = model_word2vec.evaluate(testing_data, testing_targets, batch_size=32)
print("Test loss is:", score)
print('Test accuracy is:', acc)

Training loss is: 0.693271279335022
Training accuracy is: 0.5
Test loss is: 0.6932714581489563
Test accuracy is: 0.5


### Generate predictions from Word2Vec model

In [65]:
print("Generate predictions for 5 samples")
probability_predictions = model_word2vec.predict(testing_data[:5])
print("Actual Sentiment:", testing_targets[:5])

predictions = []
for i in probability_predictions:
    if i>=0.5:      # taking threshold as middle value
        predictions.append(1)
    else:
        predictions.append(0)
        
print("Predicted Sentiment:", predictions)     

Generate predictions for 5 samples
Actual Sentiment: [0 1 1 0 1]
Predicted Sentiment: [0, 0, 0, 0, 0]


## Glove Embeddings Model

In [66]:
from gensim.scripts.glove2word2vec import glove2word2vec

In [67]:
#Glove file - we are using model with 50 embedding size
glove_input_file = 'glove.6B.300d.txt'

#Name for word2vec file
word2vec_output_file = 'glove.6B.300d.txt.word2vec'

#Convert Glove embeddings to Word2Vec embeddings
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 300)

### Get Embeddings (from Pre-trained model)

Pre-trained Glove model has 400,000 unique words (Vocabulary size). We do not need all the words. Moreover, we have to arrange word embeddings according to word index created by our tokenizers above. So we will extract word embeddings for only the words that we are interested in.

In [68]:
from gensim.scripts.glove2word2vec import glove2word2vec

In [69]:
#Glove file - we are using model with 50 embedding size
glove_input_file = 'glove.6B.300d.txt'

#Name for word2vec file
word2vec_output_file = 'glove.6B.300d.txt.word2vec'

#Convert Glove embeddings to Word2Vec embeddings
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 300)

In [70]:
from gensim.models import Word2Vec, KeyedVectors

In [71]:
# Load pretrained Glove model (in word2vec form)
model_glove = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [72]:
#Embedding length based on selected model - we are using 300d here.
embedding_vector_length = 300

Initialize a embedding matrix which we will populate for our vocabulary words.

In [73]:
#Initialize embedding matrix
embedding_matrix = np.zeros((desired_vocab_size + 1, embedding_vector_length))

In [74]:
embedding_matrix.shape

(10001, 300)

Load word vectors for each word in our vocabulary from from Glove pre-trained model

In [75]:
for word, i in sorted(t.word_index.items(),key=lambda x:x[1]):
    if i > (desired_vocab_size+1):
        break
    try:
        embedding_vector = model_glove[word] #Reading word's embedding from Glove model for a given word
        embedding_matrix[i] = embedding_vector
    except:
        pass

In [76]:
#Word the - index 1
embedding_matrix[3]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

### Build Model

In [77]:
#Initialize model
tf.keras.backend.clear_session()
model_glove = tf.keras.Sequential()

Add Embedding layer
 - Embedding Layer Input = Batch_Size * Length of each review

In [78]:
model_glove.add(tf.keras.layers.Embedding(desired_vocab_size + 1, #Vocablury size
                                    embedding_vector_length, #Embedding size
                                    weights=[embedding_matrix], #Embeddings taken from pre-trained model
                                    trainable=False, #As embeddings are already available, we will not train this layer. It will act as lookup layer.
                                    input_length=max_review_length) #Number of words in each review
          )

In [79]:
model_glove.output

<tf.Tensor 'embedding/embedding_lookup/Identity_1:0' shape=(None, 300, 300) dtype=float32>

Embedding Layer Output - 
[Batch_Size , Review Length , Embedding_Size]

Add LSTM Layer with 256 as RNN state size

In [80]:
model_glove.add(tf.keras.layers.Dropout(0.2))
model_glove.add(tf.keras.layers.LSTM(256)) #RNN State - size of cell state and hidden state
model_glove.add(tf.keras.layers.Dropout(0.2))

In [81]:
model_glove.output

<tf.Tensor 'dropout_1/cond/Identity:0' shape=(None, 256) dtype=float32>

Use Dense layer for output layer

In [82]:
model_glove.add(tf.keras.layers.Dense(1,activation='sigmoid'))

In [83]:
#Compile the model
model_glove.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [84]:
model_glove.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          3000300   
_________________________________________________________________
dropout (Dropout)            (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 256)               570368    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 3,570,925
Trainable params: 570,625
Non-trainable params: 3,000,300
_________________________________________________________________


### Train Glove Model

In [86]:
result_glove = model_glove.fit(training_data,training_targets,
          epochs=5,
          batch_size=32,
          validation_data=(testing_data, testing_targets))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Glove Embeddings Model Evaluation

In [87]:
# Training Data
score, acc = model_glove.evaluate(training_data, training_targets, batch_size=32)
print("Training loss is:", score)
print('Training accuracy is:', acc)

# Testing data
score, acc = model_glove.evaluate(testing_data, testing_targets, batch_size=32)
print("Test loss is:", score)
print('Test accuracy is:', acc)

Training loss is: 0.6931529641151428
Training accuracy is: 0.5
Test loss is: 0.6931532025337219
Test accuracy is: 0.5


## Generate predictions from Glove model

In [88]:
print("Generate predictions for 5 samples")
probability_predictions = model_glove.predict(testing_data[:5])
print("Actual Sentiment:", testing_targets[:5])

predictions = []
for i in probability_predictions:
    if i>=0.5:      # taking threshold as middle value
        predictions.append(1)
    else:
        predictions.append(0)
        
print("Predicted Sentiment:", predictions)     

Generate predictions for 5 samples
Actual Sentiment: [0 1 1 0 1]
Predicted Sentiment: [0, 0, 0, 0, 0]


## Bidirectional LSTM

In [174]:
#Initialize model
tf.keras.backend.clear_session()
model_bidirectional = tf.keras.Sequential()

In [175]:
model_bidirectional.add(tf.keras.layers.Embedding(desired_vocab_size + 1, #Vocablury size
                                    embedding_vector_length, #Embedding size
                                    weights=[embedding_matrix], #Embeddings taken from pre-trained model
                                    trainable=False, #As embeddings are already available, we will not train this layer. It will act as lookup layer.
                                    input_length=max_review_length))

In [176]:
model_bidirectional.add(Bidirectional(LSTM(128, return_sequences = True)))
model_bidirectional.add(Dense(64, activation="relu"))
model_bidirectional.add(Dropout(0.5))
model_bidirectional.add(Dense(16, activation="relu"))
model_bidirectional.add(Dropout(0.5))
# model_bidirectional.add(Dense(32, activation="relu"))
# model_bidirectional.add(Dropout(0.4))
model_bidirectional.add(Dense(1, activation="sigmoid"))
model_bidirectional.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [177]:
model_bidirectional.output

<tf.Tensor 'dense_2/Sigmoid:0' shape=(None, 300, 1) dtype=float32>

In [178]:
model_bidirectional.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 50)           500050    
_________________________________________________________________
bidirectional (Bidirectional (None, 300, 256)          183296    
_________________________________________________________________
dense (Dense)                (None, 300, 64)           16448     
_________________________________________________________________
dropout (Dropout)            (None, 300, 64)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 300, 16)           1040      
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 16)           0         
_________________________________________________________________
dense_2 (Dense)              (None, 300, 1)            1

### Train Bidirectional Model

In [179]:
result_bidirectional = model_bidirectional.fit(training_data,training_targets,
          epochs=10,
          batch_size=32,          
          validation_data=(testing_data, testing_targets))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Bidirectional Model Evaluation

In [180]:
# Training Data
score, acc = model_bidirectional.evaluate(training_data, training_targets, batch_size=32)
print("Training loss is:", score)
print('Training accuracy is:', acc)

# Testing data
score, acc = model_bidirectional.evaluate(testing_data, testing_targets, batch_size=32)
print("Test loss is:", score)
print('Test accuracy is:', acc)

Training loss is: 0.3224371373653412
Training accuracy is: 0.8612843751907349
Test loss is: 0.4780578911304474
Test accuracy is: 0.7893074750900269


## Generate predictions from Bidirectional model

In [181]:
print("Generate predictions for 5 samples")
probability_predictions = model_bidirectional.predict(testing_data[:5])
print("Actual Sentiment:", testing_targets[:5])
print(probability_predictions)
# predictions = []
# for i in probability_predictions:
#     if i>=0.5:      # taking threshold as middle value
#         predictions.append(1)
#     else:
#         predictions.append(0)
        
# print("Predicted Sentiment:", predictions)     

Generate predictions for 5 samples
Actual Sentiment: [0 1 1 0 1]
[[[0.42315376]
  [0.43027514]
  [0.43443483]
  ...
  [0.12200108]
  [0.1592671 ]
  [0.12225032]]

 [[0.96915287]
  [0.97026396]
  [0.9711535 ]
  ...
  [0.97331214]
  [0.9867103 ]
  [0.9903425 ]]

 [[0.41279334]
  [0.40157527]
  [0.3991126 ]
  ...
  [0.95756257]
  [0.97160673]
  [0.9712478 ]]

 [[0.34416068]
  [0.35147148]
  [0.3558355 ]
  ...
  [0.9658786 ]
  [0.9733903 ]
  [0.9894284 ]]

 [[0.9994641 ]
  [0.99948215]
  [0.9994974 ]
  ...
  [0.99908376]
  [0.99474204]
  [0.9834581 ]]]


## Conlcusion: We could see out of all Simple LSTM and Bidirectional is providing the best prediction results. 
There could be further improvements like changing dropuouts and adding more hidden layers. However, due to time
constraints, we could consider this as the best result.