# Sentiment Analysis Using IMDB Movie Data Set

### Technologies Used:
- Keras
- LSTM RNN Model
- Dataset: IMDb Movie Review Dataset

### Loading the dataset and importing the libraries

In [84]:
#importing necessary libraries

import numpy
from numpy import array
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.models import load_model
import re
import numpy as np
from nltk.tokenize import word_tokenize
import nltk

# fixing a random seed
numpy.random.seed(7)

In [71]:
# downloading the IMdB dataset but limiting the vocabulary size to the first 5000 words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [79]:
print('Reviews --> ') #Reviews are stored as a sequence of integers. Word IDs are preassigned to individual words
print(X_train[6])
print('Labels --> ') #Label is an integer 0 --> negative, 1 --> positive
print(y_train[6])

Reviews --> 
[1, 2, 365, 1234, 5, 1156, 354, 11, 14, 2, 2, 7, 1016, 2, 2, 356, 44, 4, 1349, 500, 746, 5, 200, 4, 4132, 11, 2, 2, 1117, 1831, 2, 5, 4831, 26, 6, 2, 4183, 17, 369, 37, 215, 1345, 143, 2, 5, 1838, 8, 1974, 15, 36, 119, 257, 85, 52, 486, 9, 6, 2, 2, 63, 271, 6, 196, 96, 949, 4121, 4, 2, 7, 4, 2212, 2436, 819, 63, 47, 77, 2, 180, 6, 227, 11, 94, 2494, 2, 13, 423, 4, 168, 7, 4, 22, 5, 89, 665, 71, 270, 56, 5, 13, 197, 12, 161, 2, 99, 76, 23, 2, 7, 419, 665, 40, 91, 85, 108, 7, 4, 2084, 5, 4773, 81, 55, 52, 1901]
Labels --> 
1


In [80]:
word2id = imdb.get_word_index() #Using the dictionary to map the WordIDs to original words
id2word = {i: word for word, i in word2id.items()}
print('Review (Words) --> ')
print([id2word.get(i, ' ') for i in X_train[6]])
print('Label --> ')
print(y_train[6])

Review (Words) --> 
['the', 'and', 'full', 'involving', 'to', 'impressive', 'boring', 'this', 'as', 'and', 'and', 'br', 'villain', 'and', 'and', 'need', 'has', 'of', 'costumes', 'b', 'message', 'to', 'may', 'of', 'props', 'this', 'and', 'and', 'concept', 'issue', 'and', 'to', "god's", 'he', 'is', 'and', 'unfolds', 'movie', 'women', 'like', "isn't", 'surely', "i'm", 'and', 'to', 'toward', 'in', "here's", 'for', 'from', 'did', 'having', 'because', 'very', 'quality', 'it', 'is', 'and', 'and', 'really', 'book', 'is', 'both', 'too', 'worked', 'carl', 'of', 'and', 'br', 'of', 'reviewer', 'closer', 'figure', 'really', 'there', 'will', 'and', 'things', 'is', 'far', 'this', 'make', 'mistakes', 'and', 'was', "couldn't", 'of', 'few', 'br', 'of', 'you', 'to', "don't", 'female', 'than', 'place', 'she', 'to', 'was', 'between', 'that', 'nothing', 'and', 'movies', 'get', 'are', 'and', 'br', 'yes', 'female', 'just', 'its', 'because', 'many', 'br', 'of', 'overly', 'to', 'descent', 'people', 'time', 'ver

In [82]:
#Now we can view the word to ID correlation using word2id
word2ID = list(word2id.items())[:20] #extracting the first 20 word pairs
top_20_wordIDs = word2ID[:20] 
for word, id in top_20_wordIDs:
    print(f"Word: {word}, ID: {id}")

Word: fawn, ID: 34701
Word: tsukino, ID: 52006
Word: nunnery, ID: 52007
Word: sonja, ID: 16816
Word: vani, ID: 63951
Word: woods, ID: 1408
Word: spiders, ID: 16115
Word: hanging, ID: 2345
Word: woody, ID: 2289
Word: trawling, ID: 52008
Word: hold's, ID: 52009
Word: comically, ID: 11307
Word: localized, ID: 40830
Word: disobeying, ID: 30568
Word: 'royale, ID: 52010
Word: harpo's, ID: 40831
Word: canet, ID: 52011
Word: aileen, ID: 19313
Word: acurately, ID: 52012
Word: diplomat's, ID: 52013


In [83]:
#getting the maximum and minimum length for the review
print('Maximum length: {}'.format(
len(max((X_train + X_test), key=len))))

print('Minimum length: {}'.format(
len(min((X_train + X_test), key=len))))


Maximum length: 2697
Minimum length: 70


In [85]:
#Limiting the length of maximum review to 500 for feeding into our RNN padding the shorter ones using pad_sequences()

max_length = 600
X_train = sequence.pad_sequences(X_train, maxlen=max_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)

In [88]:
#Model Creation

embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 600, 32)           320000    
                                                                 
 dropout_2 (Dropout)         (None, 600, 32)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               53200     
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 373301 (1.42 MB)
Trainable params: 373301 (1.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/50
Ep

<keras.src.callbacks.History at 0x1eb5a956d50>

In [92]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: ", (scores[1]*100))

Accuracy:  86.22400164604187


In [93]:
model.save("sentiment_analysis.h5")

  saving_api.save_model(


In [None]:
model = load_model("sentiment_analysis.h5")