# RNN Integer Encoding Approach

In [1]:
import numpy as np

docs= ['Hello Everyone', 'Good Evening', 'How are you', 'Hope you are doing well',
      'Enjoy your day', 'RNN CNN ANN', 'RNN NLP LSTM', 'ANN CNN NLP', 'Python Tableau PowerBI',
      'RNN CNN BERT NLP ANN']

In [2]:
len(docs)

10

In [3]:
from keras.preprocessing.text import Tokenizer
tokenizer= Tokenizer(oov_token='Amit')
# oov_token= out-of-vocab token

In [4]:
tokenizer.fit_on_texts(docs)

In [5]:
tokenizer.word_index

{'Amit': 1,
 'rnn': 2,
 'cnn': 3,
 'ann': 4,
 'nlp': 5,
 'are': 6,
 'you': 7,
 'hello': 8,
 'everyone': 9,
 'good': 10,
 'evening': 11,
 'how': 12,
 'hope': 13,
 'doing': 14,
 'well': 15,
 'enjoy': 16,
 'your': 17,
 'day': 18,
 'lstm': 19,
 'python': 20,
 'tableau': 21,
 'powerbi': 22,
 'bert': 23}

In [6]:
tokenizer.word_counts

OrderedDict([('hello', 1),
             ('everyone', 1),
             ('good', 1),
             ('evening', 1),
             ('how', 1),
             ('are', 2),
             ('you', 2),
             ('hope', 1),
             ('doing', 1),
             ('well', 1),
             ('enjoy', 1),
             ('your', 1),
             ('day', 1),
             ('rnn', 3),
             ('cnn', 3),
             ('ann', 3),
             ('nlp', 3),
             ('lstm', 1),
             ('python', 1),
             ('tableau', 1),
             ('powerbi', 1),
             ('bert', 1)])

In [7]:
tokenizer.document_count

10

In [8]:
sequences= tokenizer.texts_to_sequences(docs)

In [9]:
sequences

[[8, 9],
 [10, 11],
 [12, 6, 7],
 [13, 7, 6, 14, 15],
 [16, 17, 18],
 [2, 3, 4],
 [2, 5, 19],
 [4, 3, 5],
 [20, 21, 22],
 [2, 3, 23, 5, 4]]

Padding

In [10]:
from keras.utils import pad_sequences

In [11]:
seq= pad_sequences(sequences, padding='pre')
seq1= pad_sequences(sequences, padding='post')

In [12]:
seq

array([[ 0,  0,  0,  8,  9],
       [ 0,  0,  0, 10, 11],
       [ 0,  0, 12,  6,  7],
       [13,  7,  6, 14, 15],
       [ 0,  0, 16, 17, 18],
       [ 0,  0,  2,  3,  4],
       [ 0,  0,  2,  5, 19],
       [ 0,  0,  4,  3,  5],
       [ 0,  0, 20, 21, 22],
       [ 2,  3, 23,  5,  4]], dtype=int32)

In [13]:
seq1

array([[ 8,  9,  0,  0,  0],
       [10, 11,  0,  0,  0],
       [12,  6,  7,  0,  0],
       [13,  7,  6, 14, 15],
       [16, 17, 18,  0,  0],
       [ 2,  3,  4,  0,  0],
       [ 2,  5, 19,  0,  0],
       [ 4,  3,  5,  0,  0],
       [20, 21, 22,  0,  0],
       [ 2,  3, 23,  5,  4]], dtype=int32)

# IMDB movie review sentiment classification dataset

In [14]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, Flatten

In [15]:
(x_train, y_train), (x_test, y_test)= imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [16]:
x_train.shape

(25000,)

In [17]:
y_train.shape

(25000,)

In [18]:
x_test.shape

(25000,)

In [19]:
y_test.shape

(25000,)

In [20]:
x_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [21]:
len(x_train[0])

218

In [22]:
len(x_train[1])

189

Padding required, since we have different length in all reviews.

In [23]:
from keras.utils import pad_sequences

x_train= pad_sequences(x_train, padding='post', maxlen=100)
# Don't use maxlen, and then try
x_test= pad_sequences(x_test, padding='post', maxlen=100)

In [24]:
len(x_train[0])

100

In [25]:
len(x_train[1])

100

In [26]:
x_train[0]

array([ 1415,    33,     6,    22,    12,   215,    28,    77,    52,
           5,    14,   407,    16,    82, 10311,     8,     4,   107,
         117,  5952,    15,   256,     4, 31050,     7,  3766,     5,
         723,    36,    71,    43,   530,   476,    26,   400,   317,
          46,     7,     4, 12118,  1029,    13,   104,    88,     4,
         381,    15,   297,    98,    32,  2071,    56,    26,   141,
           6,   194,  7486,    18,     4,   226,    22,    21,   134,
         476,    26,   480,     5,   144,    30,  5535,    18,    51,
          36,    28,   224,    92,    25,   104,     4,   226,    65,
          16,    38,  1334,    88,    12,    16,   283,     5,    16,
        4472,   113,   103,    32,    15,    16,  5345,    19,   178,
          32], dtype=int32)

In [27]:
model= Sequential()
model.add(SimpleRNN(32, input_shape=(100,1), return_sequences=False))# Use return_sequences=True in case of Name entity recognition, NLP  
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 32)                1088      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1,121
Trainable params: 1,121
Non-trainable params: 0
_________________________________________________________________


![image.png](attachment:image.png)

In [28]:
#model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=32, epochs= 20)

# RNN - Embeding approach

In [29]:
docs= ['Hello Everyone', 'Good Evening', 'How are you', 'Hope you are doing well',
      'Enjoy your day', 'RNN CNN ANN', 'RNN NLP LSTM', 'ANN CNN NLP', 'Python Tableau PowerBI',
      'RNN CNN BERT NLP ANN']

In [30]:
from keras.preprocessing.text import Tokenizer
tokenizer= Tokenizer()

In [31]:
tokenizer.fit_on_texts(docs)

In [32]:
len(tokenizer.word_index)

22

In [33]:
tokenizer.word_index

{'rnn': 1,
 'cnn': 2,
 'ann': 3,
 'nlp': 4,
 'are': 5,
 'you': 6,
 'hello': 7,
 'everyone': 8,
 'good': 9,
 'evening': 10,
 'how': 11,
 'hope': 12,
 'doing': 13,
 'well': 14,
 'enjoy': 15,
 'your': 16,
 'day': 17,
 'lstm': 18,
 'python': 19,
 'tableau': 20,
 'powerbi': 21,
 'bert': 22}

In [34]:
sequences= tokenizer.texts_to_sequences(docs)

In [35]:
sequences

[[7, 8],
 [9, 10],
 [11, 5, 6],
 [12, 6, 5, 13, 14],
 [15, 16, 17],
 [1, 2, 3],
 [1, 4, 18],
 [3, 2, 4],
 [19, 20, 21],
 [1, 2, 22, 4, 3]]

In [36]:
from keras.utils import pad_sequences
seq3= pad_sequences(sequences, padding='post')
seq3

array([[ 7,  8,  0,  0,  0],
       [ 9, 10,  0,  0,  0],
       [11,  5,  6,  0,  0],
       [12,  6,  5, 13, 14],
       [15, 16, 17,  0,  0],
       [ 1,  2,  3,  0,  0],
       [ 1,  4, 18,  0,  0],
       [ 3,  2,  4,  0,  0],
       [19, 20, 21,  0,  0],
       [ 1,  2, 22,  4,  3]], dtype=int32)

In [37]:
model= Sequential()
model.add(Embedding(22, output_dim=2, input_length=5))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 5, 2)              44        
                                                                 
Total params: 44
Trainable params: 44
Non-trainable params: 0
_________________________________________________________________


In [38]:
model.compile('adam', 'accuracy')

In [49]:
pred= model.predict(seq3)
print(pred)

ValueError: ignored

In [40]:
# Using Embeding method with IMDB Dataset

In [41]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, Flatten

In [42]:
'''
Integer encoding
model= Sequential()
model.add(SimpleRNN(32, input_shape=(100,1), return_sequences=False))# Use return_sequences=True in case of Name entity recognition, NLP  
model.add(Dense(1, activation='sigmoid'))
model.summary()
'''

"\nInteger encoding\nmodel= Sequential()\nmodel.add(SimpleRNN(32, input_shape=(100,1), return_sequences=False))# Use return_sequences=True in case of Name entity recognition, NLP  \nmodel.add(Dense(1, activation='sigmoid'))\nmodel.summary()\n"

In [43]:
(x_train, y_train), (x_test, y_test)= imdb.load_data()

In [44]:
from keras.utils import pad_sequences

x_train= pad_sequences(x_train, padding='post', maxlen=100)
# Don't use maxlen, and then try
x_test= pad_sequences(x_test, padding='post', maxlen=100)

In [45]:
model= Sequential()
model.add(Embedding(10000, output_dim=2, input_length=100))# Use return_sequences=True in case of Name entity recognition, NLP  
model.add(SimpleRNN(32, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 2)            20000     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                1120      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 21,153
Trainable params: 21,153
Non-trainable params: 0
_________________________________________________________________


In [46]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=32, epochs= 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fbcf39060b0>