<a href="https://colab.research.google.com/github/Saifullah785/deep-learning-ai-journey/blob/main/Lecture_29_RNN_Code_Example_in_Keras/Lecture_29_RNN_Code_Example_in_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **RNN Sentiment Analysis**

In [1]:
# In this cell, we are creating a list of documents (or sentences) that we will use for our sentiment analysis task.
# Each string in the list represents a separate document.
import numpy as np

docs = ['go pakistan',
        'pakistan go',
        'hip hip hurray',
        'jeetega bhai jeetega pakistan jeetega',
        'pakistan zindabad',
        'afridi afridi',
        'qahid ki qahid'
        'hum koi ghulam hai',
        'pti jeetega',
        'imran khan zindabad',
        'oh sab de mah a gai he',

]

In [2]:
# Here, we are importing the Tokenizer class from the tensorflow.keras.preprocessing.text module.
# The Tokenizer is used to vectorize a text corpus, by turning each text into either a sequence of integers or into a vector.
# We are initializing a Tokenizer object and specifying an out-of-vocabulary (oov) token.
# The oov_token is used to represent words that are not in the vocabulary of the tokenizer.
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token='<nothings>')

In [3]:
# In this cell, we are training the tokenizer on our list of documents.
# The fit_on_texts method updates the internal vocabulary based on the list of texts provided.
tokenizer.fit_on_texts(docs)

In [15]:
# This cell displays the word_index attribute of the tokenizer.
# The word_index is a dictionary that maps words to their integer representations.
# The keys are the words and the values are the corresponding integers.
tokenizer.word_index

{'<nothings>': 1,
 'pakistan': 2,
 'jeetega': 3,
 'go': 4,
 'hip': 5,
 'zindabad': 6,
 'afridi': 7,
 'hurray': 8,
 'bhai': 9,
 'qahid': 10,
 'ki': 11,
 'qahidhum': 12,
 'koi': 13,
 'ghulam': 14,
 'hai': 15,
 'pti': 16,
 'imran': 17,
 'khan': 18,
 'oh': 19,
 'sab': 20,
 'de': 21,
 'mah': 22,
 'a': 23,
 'gai': 24,
 'he': 25}

In [16]:
# This cell displays the word_counts attribute of the tokenizer.
# The word_counts is an ordered dictionary that contains the counts of each word in the documents.
# The keys are the words and the values are their frequencies.
tokenizer.word_counts

OrderedDict([('go', 2),
             ('pakistan', 4),
             ('hip', 2),
             ('hurray', 1),
             ('jeetega', 4),
             ('bhai', 1),
             ('zindabad', 2),
             ('afridi', 2),
             ('qahid', 1),
             ('ki', 1),
             ('qahidhum', 1),
             ('koi', 1),
             ('ghulam', 1),
             ('hai', 1),
             ('pti', 1),
             ('imran', 1),
             ('khan', 1),
             ('oh', 1),
             ('sab', 1),
             ('de', 1),
             ('mah', 1),
             ('a', 1),
             ('gai', 1),
             ('he', 1)])

In [17]:
# This cell displays the document_count attribute of the tokenizer.
# The document_count is the number of documents that the tokenizer was trained on.
tokenizer.document_count

10

In [18]:
# In this cell, we are converting our list of documents into a list of sequences of integers.
# The texts_to_sequences method of the tokenizer is used for this purpose.
# Each document is converted into a sequence of integers based on the word_index.
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[4, 2],
 [2, 4],
 [5, 5, 8],
 [3, 9, 3, 2, 3],
 [2, 6],
 [7, 7],
 [10, 11, 12, 13, 14, 15],
 [16, 3],
 [17, 18, 6],
 [19, 20, 21, 22, 23, 24, 25]]

In [4]:
# This cell calculates the size of the vocabulary.
# The vocabulary size is the number of unique words in the documents, plus the oov_token if specified.
len(tokenizer.word_index)

25

In [6]:
# Here, we are importing the pad_sequences function from the keras.utils module.
# The pad_sequences function is used to ensure that all sequences in a list have the same length.
from keras.utils import pad_sequences

In [7]:
# In this cell, we are padding our sequences to ensure they all have the same length.
# We are using 'post' padding, which means that zeros will be added to the end of the sequences.
sequences = pad_sequences(sequences,padding='post')
sequences

array([[ 4,  2,  0,  0,  0,  0,  0],
       [ 2,  4,  0,  0,  0,  0,  0],
       [ 5,  5,  8,  0,  0,  0,  0],
       [ 3,  9,  3,  2,  3,  0,  0],
       [ 2,  6,  0,  0,  0,  0,  0],
       [ 7,  7,  0,  0,  0,  0,  0],
       [10, 11, 12, 13, 14, 15,  0],
       [16,  3,  0,  0,  0,  0,  0],
       [17, 18,  6,  0,  0,  0,  0],
       [19, 20, 21, 22, 23, 24, 25]], dtype=int32)

In [19]:
# Here we are importing the necessary modules for building our RNN model.
# We import the imdb dataset, the Sequential model, and the Dense, SimpleRNN, and Embedding layers.
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [20]:
# In this cell, we are loading the IMDB dataset.
# The IMDB dataset is a large collection of movie reviews, pre-processed and ready to be used for sentiment analysis.
# The data is already split into training and testing sets.
(X_train,y_train),(X_test,y_test) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [21]:
# This cell displays the first training example from the IMDB dataset.
# The training examples are sequences of integers, where each integer represents a word.
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [22]:
# This cell calculates the length of the third training example in the IMDB dataset.
len(X_train[2])

141

In [23]:
# In this cell, we are padding both the training and testing sequences to a maximum length of 50.
# We are using 'post' padding, which means that zeros will be added to the end of the sequences.
# We are also truncating sequences that are longer than 50.
X_train = pad_sequences(X_train,padding='post',maxlen=50)
X_test = pad_sequences(X_test,padding='post',maxlen=50)

In [24]:
# This cell displays the first training example after padding.
# You can see that the sequence now has a length of 50.
X_train[0]

array([2071,   56,   26,  141,    6,  194, 7486,   18,    4,  226,   22,
         21,  134,  476,   26,  480,    5,  144,   30, 5535,   18,   51,
         36,   28,  224,   92,   25,  104,    4,  226,   65,   16,   38,
       1334,   88,   12,   16,  283,    5,   16, 4472,  113,  103,   32,
         15,   16, 5345,   19,  178,   32], dtype=int32)

In [25]:
# In this cell, we are defining our SimpleRNN model.
# We are using a Sequential model, which is a linear stack of layers.
# The first layer is a SimpleRNN layer with 32 units. The input_shape is (50, 1) because our sequences have a length of 50 and we are feeding one feature at a time.
# The second layer is a Dense layer with a single output unit and a 'sigmoid' activation function, which is suitable for binary classification.
model = Sequential()

model.add(SimpleRNN(32,input_shape=(50,1),return_sequences=False))
model.add(Dense(1,activation='sigmoid'))

model.summary()

  super().__init__(**kwargs)


In [26]:
# In this cell, we are compiling our model.
# We are using 'binary_crossentropy' as the loss function, which is suitable for binary classification problems.
# We are using the 'adam' optimizer, which is a popular and effective optimization algorithm.
# We are also specifying that we want to track the 'accuracy' metric during training.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [27]:
# In this cell, we are training our model.
# We are using the fit method of the model to train it on the training data (X_train, y_train).
# We are training for 5 epochs, which means that the model will see the entire training dataset 5 times.
# We are also providing validation data (X_test, y_test) to evaluate the model's performance on unseen data after each epoch.
model.fit(X_train,y_train,epochs=5,validation_data=(X_test,y_test))

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 16ms/step - accuracy: 0.4983 - loss: 0.7297 - val_accuracy: 0.5040 - val_loss: 0.6934
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 13ms/step - accuracy: 0.5067 - loss: 0.6935 - val_accuracy: 0.5070 - val_loss: 0.6944
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.5149 - loss: 0.6919 - val_accuracy: 0.5028 - val_loss: 0.6937
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.5079 - loss: 0.6926 - val_accuracy: 0.5008 - val_loss: 0.6940
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 13ms/step - accuracy: 0.5091 - loss: 0.6923 - val_accuracy: 0.5028 - val_loss: 0.6939


<keras.src.callbacks.history.History at 0x7e49fa641450>