# Author: Dr Ayan Debnath
## Date: 13th October 2023
## NLP Live Class - Data Science Master 1 Revision class

### Topics: RNN

In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer

In [2]:
docs = ['recurrent neural network',
		'neural network',
		'artificial neural',
		'connections between nodes',
		'can create a cycle',
		'allowing output',
		'some nodes to affect subsequent',
		'exhibit temporal',
		'dynamic behavior',
		'type of Neural Network',
    'affect subsequent']

In [4]:
tokenizer = Tokenizer(oov_token = '<nothing>')
tokenizer.fit_on_texts(docs)

In [5]:
tokenizer.word_index


{'<nothing>': 1,
 'neural': 2,
 'network': 3,
 'nodes': 4,
 'affect': 5,
 'subsequent': 6,
 'recurrent': 7,
 'artificial': 8,
 'connections': 9,
 'between': 10,
 'can': 11,
 'create': 12,
 'a': 13,
 'cycle': 14,
 'allowing': 15,
 'output': 16,
 'some': 17,
 'to': 18,
 'exhibit': 19,
 'temporal': 20,
 'dynamic': 21,
 'behavior': 22,
 'type': 23,
 'of': 24}

In [6]:
tokenizer.word_counts

OrderedDict([('recurrent', 1),
             ('neural', 4),
             ('network', 3),
             ('artificial', 1),
             ('connections', 1),
             ('between', 1),
             ('nodes', 2),
             ('can', 1),
             ('create', 1),
             ('a', 1),
             ('cycle', 1),
             ('allowing', 1),
             ('output', 1),
             ('some', 1),
             ('to', 1),
             ('affect', 2),
             ('subsequent', 2),
             ('exhibit', 1),
             ('temporal', 1),
             ('dynamic', 1),
             ('behavior', 1),
             ('type', 1),
             ('of', 1)])

In [7]:
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[7, 2, 3],
 [2, 3],
 [8, 2],
 [9, 10, 4],
 [11, 12, 13, 14],
 [15, 16],
 [17, 4, 18, 5, 6],
 [19, 20],
 [21, 22],
 [23, 24, 2, 3],
 [5, 6]]

In [8]:
len(sequences[0])

3

In [9]:
len(sequences[2])

2

In [11]:
len(sequences[0]) == len(sequences[2])

False

In [2]:
from keras.utils import pad_sequences

In [13]:
sequences = pad_sequences(sequences,padding='post')

In [14]:
sequences

array([[ 7,  2,  3,  0,  0],
       [ 2,  3,  0,  0,  0],
       [ 8,  2,  0,  0,  0],
       [ 9, 10,  4,  0,  0],
       [11, 12, 13, 14,  0],
       [15, 16,  0,  0,  0],
       [17,  4, 18,  5,  6],
       [19, 20,  0,  0,  0],
       [21, 22,  0,  0,  0],
       [23, 24,  2,  3,  0],
       [ 5,  6,  0,  0,  0]], dtype=int32)

In [15]:
len(sequences[0]) == len(sequences[2])

True

# Sentiment Analysis

In [3]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [4]:
(X_train,y_train),(X_test,y_test) = imdb.load_data()

In [5]:
X_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [20]:
X_test

array([list([1, 591, 202, 14, 31, 6, 717, 10, 10, 18142, 10698, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 7944, 100, 28, 1668, 14, 31, 23, 27, 7479, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 15387, 38, 32, 25, 7944, 451, 202, 14, 6, 717]),
       list([1, 14, 22, 3443, 6, 176, 7, 5063, 88, 12, 2679, 23, 1310, 5, 109, 943, 4, 114, 9, 55, 606, 5, 111, 7, 4, 139, 193, 273, 23, 4, 172, 270, 11, 7216, 10626, 4, 8463, 2801, 109, 1603, 21, 4, 22, 3861, 8, 6, 1193, 1330, 10, 10, 4, 105, 987, 35, 841, 16873, 19, 861, 1074, 5, 1987, 17975, 45, 55, 221, 15, 670, 5304, 526, 14, 1069, 4, 405, 5, 2438, 7, 27, 85, 108, 131, 4, 5045, 5304, 3884, 405, 9, 3523, 133, 5, 50, 13, 104, 51, 66, 166, 14, 22, 157, 9, 4, 530, 239, 34, 8463, 2801, 45, 407, 31, 7, 41, 3778, 105, 21, 59, 299, 12, 38, 950, 5, 4521, 15, 45, 629, 488, 2733, 127, 6, 52, 292, 17, 4, 6936, 185, 132, 1988, 5304, 1799, 488, 2693, 47, 6, 392, 173, 4, 21686, 4

In [21]:
y_train

array([1, 0, 0, ..., 0, 1, 0])

In [22]:
print(len(X_train[2]))
print(len(X_train[3]))

141
550


In [6]:
X_train = pad_sequences(X_train,padding='post', maxlen=50)
X_test = pad_sequences(X_test,padding='post', maxlen=50)



In [7]:
print(len(X_train[2]))
print(len(X_train[3]))

50
50


In [25]:
# RNN
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

model = Sequential()
model.add(SimpleRNN(32,input_shape=(50,1),return_sequences=False))
model.add(Dense(1,activation='sigmoid'))

model.summary


<bound method Model.summary of <keras.src.engine.sequential.Sequential object at 0x7ebe48d7dc00>>

In [26]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])


In [28]:
model.fit(X_train, y_train, epochs=5, validation_data = (X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ebe49d96380>

In [10]:
# Embeddings

model = Sequential()
model.add(Embedding(10000,output_dim=1,input_length=50))
model.add(SimpleRNN(32,input_shape=(50,1),return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 1)             10000     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                1088      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 11121 (43.44 KB)
Trainable params: 11121 (43.44 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train,epochs=1,validation_data=(X_test,y_test))

InvalidArgumentError: ignored

In [12]:
X_train.shape

(25000, 50)

In [33]:
pred = model.predict(X_test)
print(pred)

InvalidArgumentError: ignored