<a href="https://colab.research.google.com/github/Sivasurya-J/DataScienceAcademicProjects/blob/main/IMDB_Movie_reviews_Sentiment_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from tensorflow.keras.datasets import imdb

In [2]:
# Vocabulary cap handed to the loader; the same value is reused later as the
# Embedding layer's input_dim.
number_of_words = 10000

# load_data returns a (train, test) pair, each of which is an (inputs, labels) tuple.
train_split, test_split = imdb.load_data(num_words=number_of_words)
X_train, y_train = train_split
X_test, y_test = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


**Data Exploration**

In [3]:
# Sanity check: each split should have exactly one label per review.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [4]:
# Switch off IPython pretty printing so long lists are shown on one line.
# NOTE: %pprint is a toggle - re-running this cell flips the setting back on.
%pprint

Pretty printing has been turned OFF


In [5]:
# Peek at a single encoded review: a list of integer word indices.
X_train[123]

[1, 307, 5, 1301, 20, 1026, 2511, 87, 2775, 52, 116, 5, 31, 7, 4, 91, 1220, 102, 13, 28, 110, 11, 6, 137, 13, 115, 219, 141, 35, 221, 956, 54, 13, 16, 11, 2714, 61, 322, 423, 12, 38, 76, 59, 1803, 72, 8, 2, 23, 5, 967, 12, 38, 85, 62, 358, 99]

**Decoding a Movie Review**

In [6]:
# Dictionary mapping each word (str) to its integer index in the IMDB vocabulary.
# Downloaded on first use.
word_to_index = imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [7]:
# Spot-check the mapping by looking up the index of one word.
word_to_index['great']

84

In [8]:
# Invert the vocabulary so integer indices can be translated back into words.
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))

In [9]:
# Words stored under indices 1 through 50 of the inverted vocabulary.
[index_to_word[word_index] for word_index in range(1, 51)]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i', 'this', 'that', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'on', 'not', 'you', 'are', 'his', 'have', 'he', 'be', 'one', 'all', 'at', 'by', 'an', 'they', 'who', 'so', 'from', 'like', 'her', 'or', 'just', 'about', "it's", 'out', 'has', 'if', 'some', 'there', 'what', 'good', 'more']

In [10]:
# Encoded reviews are offset by 3 relative to the word index (the lowest indices
# appear to be reserved markers - see the Keras IMDB docs), so subtract 3 before
# the lookup and show '?' for anything without a word.
decoded_words = (index_to_word.get(token - 3, '?') for token in X_train[123])
' '.join(decoded_words)

'? beautiful and touching movie rich colors great settings good acting and one of the most charming movies i have seen in a while i never saw such an interesting setting when i was in china my wife liked it so much she asked me to ? on and rate it so other would enjoy too'

In [11]:
# Label of the review decoded above (1 here; presumably 1 = positive sentiment).
y_train[123]

1

**Data Preparation**

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed review length fed to the network; also used as the Embedding input_length.
words_per_review = 200

# Pad or truncate every training review so all rows have words_per_review entries.
X_train = pad_sequences(sequences=X_train, maxlen=words_per_review)

In [13]:
# After padding, the training data is a 2-D array: (reviews, words_per_review).
X_train.shape

(25000, 200)

In [14]:
# Give the test reviews the same fixed length as the training reviews.
X_test = pad_sequences(sequences=X_test, maxlen=words_per_review)
X_test.shape

(25000, 200)

In [15]:
from sklearn.model_selection import train_test_split

# Carve 20% of the padded test set off as a validation set (fixed seed for
# reproducibility).
# NOTE: this overwrites X_test/y_test, so re-running the cell splits the
# already-reduced test set again - restart and run all to get the intended sizes.
X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    test_size=0.20,
    random_state=11,
)

In [16]:
# Sizes of the remaining test set and the new validation set.
X_test.shape, X_val.shape

((20000, 200), (5000, 200))

**Creating the Neural Network**

In [17]:
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential

# Three-layer sentiment classifier:
#   1. Embedding - turns each word index into a 128-dimensional dense vector.
#   2. LSTM      - reads the embedded sequence and emits one 128-unit summary.
#   3. Dense     - a single sigmoid unit producing a value in (0, 1) for the
#                  binary label.
# NOTE(review): per the Keras LSTM docs, a non-zero recurrent_dropout rules out
# the fused cuDNN kernel, so GPU training may be noticeably slower - confirm
# this trade-off is intended.
rnn = Sequential([
    Embedding(
        input_dim=number_of_words,
        output_dim=128,
        input_length=words_per_review,
    ),
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=1, activation='sigmoid'),
])

In [18]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked alongside the loss.
rnn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [19]:
# Print the layer-by-layer output shapes and parameter counts.
rnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________


**Training and Evaluating the Model**

In [20]:
# Train for 10 epochs, monitoring progress on the validation split that was
# carved out of the test data earlier (X_val / y_val).
# The test set (X_test / y_test) is deliberately NOT passed here: it is scored
# afterwards with rnn.evaluate, and using it for validation as well would mean
# the reported test metrics come from data already consulted during training.
# The returned History object is kept so the per-epoch loss/accuracy curves can
# be inspected later (history.history).
history = rnn.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History object at 0x7f8c22e1bfd0>

In [21]:
# Score the trained model on the X_test / y_test split. With the compile
# settings used in this notebook the result should be [loss, accuracy].
results = rnn.evaluate(x=X_test, y=y_test)

