### Converting words into vectors

1. Integer Encoding
  - Giving a unique number to each word

2. Embeddings

In [1]:
import numpy as np

# Toy corpus of ten short phrases used to demonstrate tokenization and padding.
docs = [
    'go india',
    'india india',
    'hip hip hurray',
    'jeetega bhai jeetega india jeetega',
    'bharat mata ki jai',
    'kohli kohli',
    'sachin sachin',
    'dhoni dhoni',
    'modi ji ki jai',
    'inquilab zindabad',
]

In [4]:
!pip install tensorflow --quiet

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
# oov_token: any word that was not seen while fitting the tokenizer is replaced by this
# placeholder token when texts are later converted to sequences.
tokenizer = Tokenizer(oov_token='<nothing>')

In [7]:
# Build the vocabulary (word counts and word -> integer index) from the example sentences.
tokenizer.fit_on_texts(docs)

In [10]:
# How many times each word occurred in the fitted texts, in first-seen order.
tokenizer.word_counts

OrderedDict([('go', 1),
             ('india', 4),
             ('hip', 2),
             ('hurray', 1),
             ('jeetega', 3),
             ('bhai', 1),
             ('bharat', 1),
             ('mata', 1),
             ('ki', 2),
             ('jai', 2),
             ('kohli', 2),
             ('sachin', 2),
             ('dhoni', 2),
             ('modi', 1),
             ('ji', 1),
             ('inquilab', 1),
             ('zindabad', 1)])

In [11]:
tokenizer.document_count # Number of documents (sentences) the tokenizer was fitted on

10

In [12]:
# Integer encoding: replace every word by its index in the fitted vocabulary.
# The rows have different lengths because the sentences have different word counts.
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[10, 2],
 [2, 2],
 [4, 4, 11],
 [3, 12, 3, 2, 3],
 [13, 14, 5, 6],
 [7, 7],
 [8, 8],
 [9, 9],
 [15, 16, 5, 6],
 [17, 18]]

In [13]:
from keras.utils import pad_sequences

In [14]:
# padding='post' appends zeros at the end of shorter rows so that every row has the
# length of the longest sentence, giving a rectangular integer array.
sequences = pad_sequences(sequences, padding='post')
sequences

array([[10,  2,  0,  0,  0],
       [ 2,  2,  0,  0,  0],
       [ 4,  4, 11,  0,  0],
       [ 3, 12,  3,  2,  3],
       [13, 14,  5,  6,  0],
       [ 7,  7,  0,  0,  0],
       [ 8,  8,  0,  0,  0],
       [ 9,  9,  0,  0,  0],
       [15, 16,  5,  6,  0],
       [17, 18,  0,  0,  0]], dtype=int32)

## Sentiment Analysis

### Imports

In [15]:
from keras import Input, Sequential
from keras.datasets import imdb
from keras.layers import Dense, SimpleRNN, Embedding, Flatten

In [16]:
(X_train, y_train), (X_test, y_test) = imdb.load_data() # Data is already tokenized (integer-encoded)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [17]:
# Each element is one review, stored as a Python list of integer word indices.
X_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [19]:
# 1-D array with one entry per review; the reviews are not yet padded to a common length.
X_train.shape

(25000,)

In [18]:
# Binary labels (0 or 1), one per review.
y_train

array([1, 0, 0, ..., 0, 1, 0])

### Preprocessing

In [20]:
# Bring every review to exactly 50 indices: shorter reviews are right-padded with
# zeros (padding='post'), longer ones are cut down to 50.
# NOTE(review): pad_sequences truncates from the front by default, so the *last* 50
# tokens of a long review are what remain - confirm that is the intended window.
X_train, X_test = (
    pad_sequences(reviews, padding='post', maxlen=50)
    for reviews in (X_train, X_test)
)

In [21]:
# After padding, every review is a fixed-length row of 50 indices.
X_train.shape

(25000, 50)

### Building Model

In [22]:
# Baseline model: the raw integer word indices are fed straight into the RNN,
# one scalar per timestep. The indices are arbitrary IDs rather than meaningful
# quantities, so this model is expected to learn very little from them.
model = Sequential()

# Declare the input explicitly instead of passing `input_shape` to the first layer,
# which recent Keras versions warn about.
model.add(Input(shape=(50, 1)))  # 50 timesteps, 1 feature per timestep (the word index)

# return_sequences=False -> the layer returns only the hidden state after the last
# timestep (one 32-number vector per review). With True it would return the hidden
# state at every timestep instead, which is what a stacked RNN layer would need.
model.add(SimpleRNN(32, return_sequences=False))

# Single sigmoid unit: a value between 0 and 1 for the binary label.
model.add(Dense(1, activation='sigmoid'))

model.summary()

  super().__init__(**kwargs)


In [24]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is reported as 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# Train for five epochs and evaluate on the test split after each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
)

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - acc: 0.5016 - loss: 0.6947 - val_acc: 0.5058 - val_loss: 0.6940
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - acc: 0.5095 - loss: 0.6932 - val_acc: 0.5020 - val_loss: 0.6942
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - acc: 0.5065 - loss: 0.6929 - val_acc: 0.5038 - val_loss: 0.6950
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - acc: 0.5070 - loss: 0.6932 - val_acc: 0.5057 - val_loss: 0.6941
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - acc: 0.5138 - loss: 0.6929 - val_acc: 0.5070 - val_loss: 0.6938


<keras.src.callbacks.history.History at 0x7a3f5029da10>

### Using Embeddings

In NLP, a word embedding is a representation of a word for text analysis, typically a real-valued vector that encodes the meaning of the word, such that words that are closer together in the vector space are expected to be similar in meaning.

Embeddings are better than Integer encoding because
1. Embeddings are dense: each word is a short vector of real numbers in which most values are non-zero, unlike sparse encodings built from the integer indices (such as one-hot vectors), which are almost entirely zeros
2. Embeddings capture the semantic meaning behind the words

### Embedding Layer

We apply the embedding layer before the RNN layer so that the embedding layer learns the word representations and passes them on to the RNN; as a result, the RNN receives a dense vector for every word. You just have to make sure that the sentences are integer encoded before they are passed to the embedding layer.

In [44]:
import numpy as np

# Toy corpus of ten short phrases, redefined here for the embedding demonstration.
docs = [
    'go india',
    'india india',
    'hip hip hurray',
    'jeetega bhai jeetega india jeetega',
    'bharat mata ki jai',
    'kohli kohli',
    'sachin sachin',
    'dhoni dhoni',
    'modi ji ki jai',
    'inquilab zindabad',
]

In [51]:
from tensorflow.keras.preprocessing.text import Tokenizer
# num_words caps the word indices that texts_to_sequences will emit; words outside the
# cap are replaced by the oov_token.
# NOTE(review): Keras keeps only indices below num_words, and the OOV token itself takes
# index 1, so with 17 distinct words in `docs` the two rarest ones appear to be encoded as
# '<nothing>' - confirm this is intended before relying on the embeddings of those words.
tokenizer = Tokenizer(num_words=17, oov_token='<nothing>')

In [52]:
# Learn the vocabulary, integer-encode every sentence, and right-pad with zeros
# so that all rows share the length of the longest sentence.
tokenizer.fit_on_texts(docs)
sequences = pad_sequences(
    tokenizer.texts_to_sequences(docs),
    padding='post',
)

In [53]:
model = Sequential()

# Each padded sentence is a row of 5 integer word indices. Declaring the input here
# replaces the `input_length` argument of Embedding, which is deprecated in recent Keras.
model.add(Input(shape=(5,), dtype='int32'))

# 17 -> size of the lookup table, i.e. the valid indices are 0..16
#       (0 is the padding value; the remaining indices come from Tokenizer(num_words=17)).
# output_dim=2 -> every integer index is looked up and replaced by a trainable vector of
#       2 numbers, so a sentence of 5 indices becomes a (5, 2) matrix. These vectors are
#       what would be passed on to an RNN layer.
model.add(Embedding(17, output_dim=2))

model.summary()

In [54]:
# The second positional argument of compile() is `loss`, so 'accuracy' has to be passed
# through `metrics` instead. No loss is configured because this model is only used
# with predict() to inspect the (untrained) embedding vectors.
model.compile(optimizer='adam', metrics=['accuracy'])

In [55]:
# Run the padded sentences through the embedding layer; the result holds one (5, 2) matrix per sentence.
pred = model.predict(sequences)
print(pred)  # Each word index is converted into 2 numbers, so each padded 5-index sentence becomes a (5, 2) matrix -> the representation of one sentence

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[[[ 0.00304759 -0.04317248]
  [-0.02047485  0.00535973]
  [ 0.01953462 -0.0317094 ]
  [ 0.01953462 -0.0317094 ]
  [ 0.01953462 -0.0317094 ]]

 [[-0.02047485  0.00535973]
  [-0.02047485  0.00535973]
  [ 0.01953462 -0.0317094 ]
  [ 0.01953462 -0.0317094 ]
  [ 0.01953462 -0.0317094 ]]

 [[-0.01162474 -0.03721507]
  [-0.01162474 -0.03721507]
  [ 0.03937416  0.03862753]
  [ 0.01953462 -0.0317094 ]
  [ 0.01953462 -0.0317094 ]]

 [[-0.03398334 -0.02391263]
  [ 0.02173148 -0.01585231]
  [-0.03398334 -0.02391263]
  [-0.02047485  0.00535973]
  [-0.03398334 -0.02391263]]

 [[ 0.02897108 -0.01327736]
  [-0.003663   -0.01684888]
  [-0.04460591 -0.02964513]
  [-0.00476208 -0.02371309]
  [ 0.01953462 -0.0317094 ]]

 [[ 0.02591472 -0.03781732]
  [ 0.02591472 -0.03781732]
  [ 0.01953462 -0.0317094 ]
  [ 0.01953462 -0.0317094 ]
  [ 0.01953462 -0.0317094 ]]

 [[ 0.04421696  0.03813026]
  [ 0.04421696  0.03813026]
  [ 0.01953462 -0.0

## Sentiment Analysis using Embeddings

In [58]:
from keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [64]:
# Load IMDB with the vocabulary limited to 10000 word indices.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

# Bring every review to exactly 50 indices: right-pad short reviews with zeros
# (padding='post') and cut longer ones down to 50.
X_train, X_test = (
    pad_sequences(reviews, padding='post', maxlen=50)
    for reviews in (X_train, X_test)
)

In [65]:
# Every review is now a fixed-length row of 50 indices.
X_train.shape

(25000, 50)

In [66]:
model = Sequential()

# Each review is a row of 50 integer word indices. Declaring the input here replaces
# the `input_length` argument of Embedding, which is deprecated in recent Keras.
model.add(Input(shape=(50,), dtype='int32'))

# 10000 rows in the lookup table, matching load_data(num_words=10000);
# every word index is mapped to a trainable vector of 2 numbers.
model.add(Embedding(10000, 2))

# return_sequences=False -> only the hidden state after the last timestep is returned.
model.add(SimpleRNN(32, return_sequences=False))

# Single sigmoid unit: a value between 0 and 1 for the binary label.
model.add(Dense(1, activation='sigmoid'))

model.summary()



In [67]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is reported as 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# Train for five epochs and evaluate on the test split after each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
)

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 18ms/step - acc: 0.5277 - loss: 0.6910 - val_acc: 0.7242 - val_loss: 0.5676
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - acc: 0.7840 - loss: 0.4635 - val_acc: 0.8032 - val_loss: 0.4313
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - acc: 0.8647 - loss: 0.3257 - val_acc: 0.8058 - val_loss: 0.4337
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 17ms/step - acc: 0.8963 - loss: 0.2611 - val_acc: 0.8043 - val_loss: 0.4725
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 17ms/step - acc: 0.9174 - loss: 0.2140 - val_acc: 0.7948 - val_loss: 0.4958


<keras.src.callbacks.history.History at 0x7a3f4a7f68d0>