# **Part 1: Recurrent Neural Network**

### **Importing packages**

In [21]:
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM,Dropout
from keras.datasets import imdb

from tensorflow.keras.utils import to_categorical

import warnings
# Silences every Python warning for the rest of the session, including
# deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')
import ssl
# NOTE(review): this swaps the default HTTPS context factory for the unverified
# one, which turns off certificate verification for code that relies on the
# default context. Presumably a workaround for the dataset downloads below —
# confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

### **Preparing Dataset**

In [3]:
# Settings shared by the IMDB experiments in this notebook.
max_features = 1000  # vocabulary cap: only the most common words keep their own id
maxlen = 80          # every review is cut or padded to this many tokens
batch_size = 32      # mini-batch size for training and evaluation

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
# Bring both splits to a fixed width of `maxlen` so they form 2-D arrays.
x_train, x_test = (
    pad_sequences(reviews, maxlen=maxlen) for reviews in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)


In [4]:
# Show the first training review as its padded sequence of integer word ids.
x_train[0]

array([ 15, 256,   4,   2,   7,   2,   5, 723,  36,  71,  43, 530, 476,
        26, 400, 317,  46,   7,   4,   2,   2,  13, 104,  88,   4, 381,
        15, 297,  98,  32,   2,  56,  26, 141,   6, 194,   2,  18,   4,
       226,  22,  21, 134, 476,  26, 480,   5, 144,  30,   2,  18,  51,
        36,  28, 224,  92,  25, 104,   4, 226,  65,  16,  38,   2,  88,
        12,  16, 283,   5,  16,   2, 113, 103,  32,  15,  16,   2,  19,
       178,  32], dtype=int32)

### **Visualize the data**

In [5]:
INDEX_FROM = 3  # offset added to every id of the raw word index

# Shift the raw word index by the offset, then register the three special
# tokens on ids 0-2.
word_to_id = {
    word: rank + INDEX_FROM for word, rank in imdb.get_word_index().items()
}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Invert the mapping and turn one padded training review back into words.
id_to_word = {token_id: word for word, token_id in word_to_id.items()}
decoded_review = [id_to_word[token_id] for token_id in x_train[10]]
print(' '.join(decoded_review))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
a lot of <UNK> <UNK> the <UNK> plot the characters are all very interesting in their own way and the fact that the book itself almost takes on its own character is very well done anyone <UNK> that the film won't <UNK> by the end won't be disappointed either as the ending both makes sense and <UNK> to be quite <UNK> overall <UNK> is a truly great horror film and one of the best of the <UNK> highly <UNK> viewing


### **Building a Model**

In [6]:
print('Build model...')
# Baseline classifier: 8-dim embedding -> 16-unit LSTM (20% input and
# recurrent dropout) -> single sigmoid output.
model = Sequential([
    Embedding(max_features, 8),
    LSTM(16, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.summary()

Build model...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 8)           8000      
                                                                 
 lstm (LSTM)                 (None, 16)                1600      
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 9617 (37.57 KB)
Trainable params: 9617 (37.57 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### **Model Training**

In [7]:
# Binary cross-entropy with Adam; other optimizers or optimizer settings can
# be swapped in here.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the padded training reviews; the test split is passed as validation
# data so its metrics are reported after every epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=30,
    validation_data=(x_test, y_test),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7e6e52497e20>

### **Testing**

In [8]:
# Loss ("score") and accuracy of the trained model on the test split.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.3981856107711792
Test accuracy: 0.8264399766921997


### **Prediction**

In [9]:
# Score one test review (row 22220) and print its true label next to it.
# A one-element slice keeps the leading batch dimension.
sample = slice(22220, 22221)
prediction = model.predict(x_test[sample])
print('Prediction value:', prediction[0])
print('Test Label:', y_test[sample])

Prediction value: [0.9421185]
Test Label: [1]


### **Other RNN Layers**
keras.layers.RNN(cell, return_sequences=False);
keras.layers.SimpleRNN(units, activation='tanh');
keras.layers.GRU(units, activation='tanh', recurrent_activation='hard_sigmoid');
keras.layers.ConvLSTM2D(filters, kernel_size, strides=(1, 1), padding='valid', );
keras.layers.SimpleRNNCell(units, activation='tanh');
keras.layers.GRUCell(units, activation='tanh', recurrent_activation='hard_sigmoid');
keras.layers.LSTMCell(units, activation='tanh', recurrent_activation='hard_sigmoid');
keras.layers.CuDNNGRU(units, kernel_initializer='glorot_uniform');
keras.layers.CuDNNLSTM(units, kernel_initializer='glorot_uniform');

# **Part 2: Recurrent Neural Network with Custom Dataset**

In [10]:
# NOTE(review): the recorded output of this cell shows the URL answering with
# redirects to a visualstudio.microsoft.com page rather than serving the CSV —
# confirm the link still works. The cells below read Custom_data.csv from
# Google Drive and do not use a file named Senti.csv.
!wget https://notebooks.azure.com/vipulmishra/projects/labgail/raw/Senti.csv

--2024-05-13 09:23:58--  https://notebooks.azure.com/vipulmishra/projects/labgail/raw/Senti.csv
Resolving notebooks.azure.com (notebooks.azure.com)... 13.107.246.40, 13.107.213.40, 2620:1ec:bdf::40, ...
Connecting to notebooks.azure.com (notebooks.azure.com)|13.107.246.40|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://visualstudio.microsoft.com/vs/features/notebooks-at-microsoft [following]
--2024-05-13 09:23:58--  https://visualstudio.microsoft.com/vs/features/notebooks-at-microsoft
Resolving visualstudio.microsoft.com (visualstudio.microsoft.com)... 23.5.154.136
Connecting to visualstudio.microsoft.com (visualstudio.microsoft.com)|23.5.154.136|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://visualstudio.microsoft.com/vs/features/notebooks-at-microsoft/ [following]
--2024-05-13 09:23:59--  https://visualstudio.microsoft.com/vs/features/notebooks-at-microsoft/
Reusing existing connect

### **Load data**

In [12]:
# Mount Google Drive at /content/drive so files stored there can be read by
# path. The google.colab import means this cell needs the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
import pandas as pd

# Read the custom sentiment CSV from the mounted Drive folder and keep only
# the two columns the rest of Part 2 uses.
data = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/Custom_data.csv'
)[['text', 'sentiment']]

### **Visualize data**

In [16]:
# Preview the first ten rows of the loaded text/sentiment table.
data.head(10)

Unnamed: 0,text,sentiment
0,I love this car,Positive
1,This view is amazing,Positive
2,I feel great this morning,Positive
3,I am so excited about the concert,Positive
4,He is my best friend,Positive
5,I do not like this car,Negative
6,This view is horrible,Negative
7,I feel tired this morning,Negative
8,I am not looking forward to the concert,Negative
9,He is my enemy,Negative


### **Format data**

In [17]:
# Drop neutral rows. The explicit .copy() makes `data` an independent frame,
# so the column assignments below do not write into a filtered slice.
data = data[data.sentiment != "Neutral"].copy()

# Normalise the text: lower-case it, then keep only letters, digits and
# whitespace. The class is spelled A-Z (not A-z) because the A-z range also
# spans the punctuation characters that sit between 'Z' and 'a' in ASCII;
# the raw string keeps `\s` from being read as a string escape.
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# Remove the stand-alone token "rt". This replaces a loop that assigned to
# the rows yielded by DataFrame.iterrows(); pandas documents that those rows
# may be copies, so such writes are not guaranteed to reach the frame. The
# word boundaries keep words that merely contain "rt" (e.g. "concert") intact.
data['text'] = data['text'].apply(lambda x: re.sub(r'\brt\b', ' ', x))

# Vocabulary cap for the custom dataset. The misspelled name is kept because
# the model-building cell reads `max_fatures`; the tokenizer now uses this
# value (via `num_words`) instead of the IMDB-specific `max_features`.
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)

# Integer-encode every sentence and pad all of them to a common length.
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

### **Training set**

In [18]:
# One indicator column per sentiment class.
Y = pd.get_dummies(data['sentiment']).to_numpy()

# Hold out 33% of the samples for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)
print('Shape of training samples:', X_train.shape, Y_train.shape)
print('Shape of testing samples:', X_test.shape, Y_test.shape)

Shape of training samples: (6, 8) (6, 2)
Shape of testing samples: (4, 8) (4, 2)


### **Design a model**

In [45]:
# Two-class sentiment classifier for the custom dataset:
# embedding -> dropout -> LSTM -> softmax over the two one-hot label columns.
model = Sequential()
model.add(Embedding(max_fatures, 128, input_length=X.shape[1]))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 8, 128)            256000    
                                                                 
 dropout_7 (Dropout)         (None, 8, 128)            0         
                                                                 
 lstm_8 (LSTM)               (None, 128)               131584    
                                                                 
 dense_8 (Dense)             (None, 2)                 258       
                                                                 
Total params: 387842 (1.48 MB)
Trainable params: 387842 (1.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


### **Training**

In [46]:
batch_size = 32
# Five passes over the training split; verbose=2 logs one line per epoch.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=5,
    verbose=2,
)

Epoch 1/5
1/1 - 3s - loss: 0.6948 - accuracy: 0.5000 - 3s/epoch - 3s/step
Epoch 2/5
1/1 - 0s - loss: 0.6888 - accuracy: 0.5000 - 22ms/epoch - 22ms/step
Epoch 3/5
1/1 - 0s - loss: 0.6821 - accuracy: 0.6667 - 21ms/epoch - 21ms/step
Epoch 4/5
1/1 - 0s - loss: 0.6744 - accuracy: 0.6667 - 21ms/epoch - 21ms/step
Epoch 5/5
1/1 - 0s - loss: 0.6671 - accuracy: 0.6667 - 20ms/epoch - 20ms/step


<keras.src.callbacks.History at 0x7e6e4dfde860>

### **Validation**

In [47]:
# Loss ("score") and accuracy on the held-out part of the custom dataset.
score, acc = model.evaluate(
    x=X_test,
    y=Y_test,
    batch_size=batch_size,
    verbose=1,
)
print("Score: %.2f" % score)
print("Accuracy: %.2f" % acc)

Score: 0.72
Accuracy: 0.25


### **Formatting Test Example**

In [48]:
# Sentence to classify, wrapped in a one-row frame so it can go through the
# same lower-casing and character filtering steps as the training text.
text = 'We are going to Delhi'
tester = pd.DataFrame({'text': [text]})

tester['text'] = tester['text'].apply(lambda x: x.lower())
# A-Z (not A-z): the A-z range would also let the ASCII punctuation between
# 'Z' and 'a' through. The raw string keeps `\s` a regex escape.
tester['text'] = tester['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# Encode with the tokenizer fitted on the training text, then pad/truncate to
# the training width in one step. Passing maxlen always yields a
# (1, X.shape[1]) array; the previous manual np.pad was skipped for sentences
# with at least X.shape[1] tokens, after which the extra np.array([...])
# wrapper produced a 3-D input and over-long sentences were never truncated.
test = tokenizer.texts_to_sequences(tester['text'].values)
test = pad_sequences(test, maxlen=X.shape[1])

prediction = model.predict(test)
print('Prediction value:', prediction[0])

Prediction value: [0.5413757  0.45862424]


# **Part 3: RNN Design Choices**

### **Influence of number of nodes**

### **LSTM with 8 nodes**

In [49]:
# Variant with a narrower recurrent layer: 8 LSTM units, dropout disabled.
model = Sequential([
    Embedding(max_features, 8),
    LSTM(8, dropout=0.0, recurrent_dropout=0.0),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# One training epoch, with the test split passed as validation data.
model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 8)           8000      
                                                                 
 lstm_9 (LSTM)               (None, 8)                 544       
                                                                 
 dense_9 (Dense)             (None, 1)                 9         
                                                                 
Total params: 8553 (33.41 KB)
Trainable params: 8553 (33.41 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________



Test score: 0.40820351243019104
Test accuracy: 0.8145599961280823


### **Influence of Embedding**

In [50]:
# Variant with a smaller embedding: 4 dimensions per token, 16 LSTM units,
# dropout disabled.
model = Sequential([
    Embedding(max_features, 4),
    LSTM(16, dropout=0.0, recurrent_dropout=0.0),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# One training epoch, with the test split passed as validation data.
model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, None, 4)           4000      
                                                                 
 lstm_10 (LSTM)              (None, 16)                1344      
                                                                 
 dense_10 (Dense)            (None, 1)                 17        
                                                                 
Total params: 5361 (20.94 KB)
Trainable params: 5361 (20.94 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Test score: 0.4122101068496704
Test accuracy: 0.8126000165939331


### **Influence of Dropout**
### **Dropout with probability 0.5**

In [52]:
# Variant with heavy regularisation: 50% dropout on both the LSTM inputs and
# its recurrent connections.
model = Sequential([
    Embedding(max_features, 4),
    LSTM(16, dropout=0.5, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# One training epoch, with the test split passed as validation data.
model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 4)           4000      
                                                                 
 lstm_12 (LSTM)              (None, 16)                1344      
                                                                 
 dense_12 (Dense)            (None, 1)                 17        
                                                                 
Total params: 5361 (20.94 KB)
Trainable params: 5361 (20.94 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Test score: 0.4273240268230438
Test accuracy: 0.8037199974060059


### **Multilayered RNNs**
### **RNN with 3 layer LSTM**

In [53]:
# Stacked variant: three 8-unit LSTM layers. The first two return the full
# sequence so the next LSTM receives one vector per time step; the last one
# returns only its final output for the classifier.
model = Sequential([
    Embedding(max_features, 8),
    LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True),
    LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True),
    LSTM(8, dropout=0.0, recurrent_dropout=0.0),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# One training epoch, with the test split passed as validation data.
model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, None, 8)           8000      
                                                                 
 lstm_13 (LSTM)              (None, None, 8)           544       
                                                                 
 lstm_14 (LSTM)              (None, None, 8)           544       
                                                                 
 lstm_15 (LSTM)              (None, 8)                 544       
                                                                 
 dense_13 (Dense)            (None, 1)                 9         
                                                                 
Total params: 9641 (37.66 KB)
Trainable params: 9641 (37.66 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Test score: 0.42159