## How integer encode using keras

In [1]:
import numpy as np

In [1]:
docs = ['recurrent neural network',
		'neural network',
		'artificial neural',
		'connections between nodes',
		'can create a cycle',
		'allowing output',
		'some nodes to affect subsequent',
		'exhibit temporal',
		'dynamic behavior',
		'type of Neural Network',
    'affect subsequent']

In [2]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(oov_token='<nothing>')  # oov: out of vocabulary means unknown words in test data will be replaced by <nothing>

In [3]:
tokenizer.fit_on_texts(docs)

In [4]:
tokenizer.word_index # word_index is a dictionary of words and their uniquely assigned integers

{'<nothing>': 1,
 'neural': 2,
 'network': 3,
 'nodes': 4,
 'affect': 5,
 'subsequent': 6,
 'recurrent': 7,
 'artificial': 8,
 'connections': 9,
 'between': 10,
 'can': 11,
 'create': 12,
 'a': 13,
 'cycle': 14,
 'allowing': 15,
 'output': 16,
 'some': 17,
 'to': 18,
 'exhibit': 19,
 'temporal': 20,
 'dynamic': 21,
 'behavior': 22,
 'type': 23,
 'of': 24}

In [5]:
tokenizer.word_counts # word_counts is a dictionary of words and their counts

OrderedDict([('recurrent', 1),
             ('neural', 4),
             ('network', 3),
             ('artificial', 1),
             ('connections', 1),
             ('between', 1),
             ('nodes', 2),
             ('can', 1),
             ('create', 1),
             ('a', 1),
             ('cycle', 1),
             ('allowing', 1),
             ('output', 1),
             ('some', 1),
             ('to', 1),
             ('affect', 2),
             ('subsequent', 2),
             ('exhibit', 1),
             ('temporal', 1),
             ('dynamic', 1),
             ('behavior', 1),
             ('type', 1),
             ('of', 1)])

In [6]:
tokenizer.document_count # document_count is the number of documents that were used to fit the Tokenizer

11

In [7]:
sequences = tokenizer.texts_to_sequences(docs) # texts_to_sequences() converts a list of texts to a list of sequences of integers
sequences

[[7, 2, 3],
 [2, 3],
 [8, 2],
 [9, 10, 4],
 [11, 12, 13, 14],
 [15, 16],
 [17, 4, 18, 5, 6],
 [19, 20],
 [21, 22],
 [23, 24, 2, 3],
 [5, 6]]

In [9]:
from keras.utils import pad_sequences
padded_sequences = pad_sequences(sequences, padding='post') # pad_sequences() transforms a list of sequences into a 2D Numpy array of shape (num_samples, maxlen)

In [10]:
padded_sequences

array([[ 7,  2,  3,  0,  0],
       [ 2,  3,  0,  0,  0],
       [ 8,  2,  0,  0,  0],
       [ 9, 10,  4,  0,  0],
       [11, 12, 13, 14,  0],
       [15, 16,  0,  0,  0],
       [17,  4, 18,  5,  6],
       [19, 20,  0,  0,  0],
       [21, 22,  0,  0,  0],
       [23, 24,  2,  3,  0],
       [ 5,  6,  0,  0,  0]])

In [None]:
# this integer approch is not a good idea because it does not take into account the semantic meaning of words 
# as we have to add padding to make all the sequences of same length which make the computation expensive


# Sentiment Analysis

In [1]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense, Embedding, Flatten , Dropout,SimpleRNN

In [2]:
(x_train, y_train), (x_test, y_test) = imdb.load_data()

In [3]:
x_train[0] # each review is a list of integers where each integer represents a word

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

In [4]:
# problem : length of each review is different
print(len(x_train[0]))
print(len(x_train[1]))

218
189


In [5]:
from keras.utils import pad_sequences
# To solve this problem we will use padding
x_train = pad_sequences(x_train, maxlen=250) # maxlen is the maximum length of each review
x_test = pad_sequences(x_test, maxlen=250) # maxlen is the maximum length of each review

In [6]:
print(len(x_train[0]))
print(len(x_train[1]))
# now all the reviews are of same length

250
250


In [7]:
# model building
model = Sequential()
model.add(SimpleRNN(32, input_shape=(250,1),return_sequences=False))  # here 250 times unrolled
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 32)                1088      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1,121
Trainable params: 1,121
Non-trainable params: 0
_________________________________________________________________


 the meaning of `return_sequences=False`. In an RNN layer, the `return_sequences` parameter determines whether the layer should return the output sequence for each input time step or just the final output.

When `return_sequences=False`, the RNN layer only returns the final output for the last time step of the input sequence. This is suitable for many-to-one architectures because we are only interested in the final prediction or output for the entire sequence. In your code, the `SimpleRNN` layer has `return_sequences=False`, indicating that it will only return the final output.

If you were to set `return_sequences=True`, the RNN layer would return the output for each time step in the input sequence, resulting in a sequence of outputs. This is useful for many-to-many architectures, where the output of each time step is important, such as in sequence generation tasks.

Let's break down the `SimpleRNN` layer declaration you mentioned: `SimpleRNN(32, input_shape=(250,1))`.

- `32`: This specifies the number of units or cells in the SimpleRNN layer. These units represent the internal computational elements of the layer that process the input sequence.
- `input_shape=(250,1)`: This defines the shape of the input data that will be fed into the SimpleRNN layer. It indicates that the input sequences will have a length of 250 time steps, and each time step will have a single feature.
In the case of the `SimpleRNN` layer, the number of units (32 in this case) determines the complexity and representational capacity of the layer. It influences the layer's ability to learn and model patterns in the input sequence.

On the other hand, the input shape (250, 1) defines the shape of the input data that will be processed by the `SimpleRNN` layer. It specifies that the input sequences will have a length of 250 time steps, and each time step will have a single feature.

In [8]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# how to do encodings using keras embedding 

In [9]:
docs = ['recurrent neural network',
		'neural network',
		'artificial neural',
		'connections between nodes',
		'can create a cycle',
		'allowing output',
		'some nodes to affect subsequent',
		'exhibit temporal',
		'dynamic behavior',
		'type of Neural Network',
    'affect subsequent']

In [10]:
from keras.preprocessing.text import Tokenizer
tokenizer=Tokenizer()

In [11]:
tokenizer.fit_on_texts(docs)

In [12]:
len(tokenizer.word_index)

23

In [13]:
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[6, 1, 2],
 [1, 2],
 [7, 1],
 [8, 9, 3],
 [10, 11, 12, 13],
 [14, 15],
 [16, 3, 17, 4, 5],
 [18, 19],
 [20, 21],
 [22, 23, 1, 2],
 [4, 5]]

In [14]:
from keras.utils import pad_sequences
sequences = pad_sequences(sequences,padding='post')
sequences

array([[ 6,  1,  2,  0,  0],
       [ 1,  2,  0,  0,  0],
       [ 7,  1,  0,  0,  0],
       [ 8,  9,  3,  0,  0],
       [10, 11, 12, 13,  0],
       [14, 15,  0,  0,  0],
       [16,  3, 17,  4,  5],
       [18, 19,  0,  0,  0],
       [20, 21,  0,  0,  0],
       [22, 23,  1,  2,  0],
       [ 4,  5,  0,  0,  0]])

In [15]:
model = Sequential()
model.add(Embedding(23,output_dim=2,input_length=5)) #Total vocab len, ouput dim(per word would be represend by 2 vector), input len per row

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 5, 2)              46        
                                                                 
Total params: 46
Trainable params: 46
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile("adam","accuracy")

In [17]:
pred = model.predict(sequences)
pred



array([[[-0.02693019, -0.02901098],
        [-0.00583706,  0.03699764],
        [ 0.02434461, -0.02477236],
        [-0.02572801,  0.01593998],
        [-0.02572801,  0.01593998]],

       [[-0.00583706,  0.03699764],
        [ 0.02434461, -0.02477236],
        [-0.02572801,  0.01593998],
        [-0.02572801,  0.01593998],
        [-0.02572801,  0.01593998]],

       [[-0.0209319 ,  0.00192506],
        [-0.00583706,  0.03699764],
        [-0.02572801,  0.01593998],
        [-0.02572801,  0.01593998],
        [-0.02572801,  0.01593998]],

       [[-0.044108  ,  0.03198164],
        [ 0.04077939,  0.02992911],
        [ 0.036444  , -0.01542908],
        [-0.02572801,  0.01593998],
        [-0.02572801,  0.01593998]],

       [[ 0.00090504,  0.01098108],
        [ 0.00397978,  0.03963007],
        [-0.00771817,  0.0484598 ],
        [-0.04630901, -0.00688462],
        [-0.02572801,  0.01593998]],

       [[ 0.00682104, -0.02072327],
        [-0.01657227,  0.03277289],
        [-0.025728

### using imdb dataset 

In [18]:
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [19]:
(X_train,y_train),(X_test,y_test) = imdb.load_data()

In [20]:
X_train = pad_sequences(X_train,padding='post',maxlen=150)
X_test = pad_sequences(X_test,padding='post',maxlen=150)

In [23]:
model = Sequential()
model.add(Embedding(15000,output_dim=2,input_length=150))
model.add(SimpleRNN(32,return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 150, 2)            30000     
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 32)                1120      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 31,153
Trainable params: 31,153
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train,epochs=10,validation_data=(X_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
x_test[0][0:150].reshape(1,-1).shape


(1, 150)

In [26]:
test_data=x_test[0][0:150].reshape(1,-1)


In [27]:
model.predict(test_data)



array([[0.646185]], dtype=float32)