In [9]:
# Importing necessary libraries
import numpy as np
from keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Input, Dense, SimpleRNN, Embedding

### `Integer Encoding`

In [10]:
# Toy corpus: ten short slogans used to demonstrate tokenization,
# integer encoding and padding. Sentence lengths vary from 2 to 5 words.
docs = [
    "go india",
    "india india",
    "hip hip hurray",
    "jeetega bhai jeetega india jeetega",
    "bharat mata ki jai",
    "kohli kohli",
    "sachin sachin",
    "dhoni dhoni",
    "modi ji ki jai",
    "inquilab zindabad",
]

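- Tokenization: It splits sentences or documents into individual words (tokens). By default, the characters listed in `filters` (mostly punctuation) are removed first, and the remaining text is split on spaces (the `split` argument).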

- Lowercasing (optional): It can convert all words to lowercase (controlled by lower=True, which is the default).

- Punctuation Removal/Filtering (optional): It removes most punctuation by default, but you can customize which characters to keep or remove using the filters argument.

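- HTML Tag Removal: Note that the Keras `Tokenizer` has no dedicated option for stripping HTML. The default `filters` only remove characters such as `<`, `>` and `/`, so the tag names themselves would remain as tokens; strip HTML from the text yourself before calling `fit_on_texts`.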

- Counting Word Frequencies: It iterates through all the provided texts and counts how many times each unique word appears.
- Creating Word-to-Index Mapping: Based on the word frequencies, it builds a dictionary where each unique word is mapped to an integer index. The most frequent words get the lowest indices. Indexing starts from 1 because 0 is reserved for padding, and when an `oov_token` is given it always takes index 1. This mapping is stored in `tokenizer.word_index`.
- Limiting Vocabulary Size (`num_words`): You can specify `num_words` during initialization. If set, only the `num_words - 1` lowest indices (the most frequent words, counting the OOV token if one is used) are kept when texts are converted to sequences. `tokenizer.word_index` itself still lists every word seen during fitting.

In [11]:
# Build the tokenizer and learn the vocabulary from the toy corpus.
# `oov_token` is the placeholder used for any word that was not seen while
# fitting, e.g. a new word that shows up at prediction time.
tokenizer = Tokenizer(oov_token="<nothing>")
# Author's note: fit_on_texts cannot handle null values by default,
# so every entry in `docs` must be a string.
tokenizer.fit_on_texts(docs)

In [12]:
# Word -> integer index mapping learned by the tokenizer.
# Index 1 is taken by the OOV token '<nothing>'; the remaining words are
# numbered from 2 upwards, most frequent first.
tokenizer.word_index

{'<nothing>': 1,
 'india': 2,
 'jeetega': 3,
 'hip': 4,
 'ki': 5,
 'jai': 6,
 'kohli': 7,
 'sachin': 8,
 'dhoni': 9,
 'go': 10,
 'hurray': 11,
 'bhai': 12,
 'bharat': 13,
 'mata': 14,
 'modi': 15,
 'ji': 16,
 'inquilab': 17,
 'zindabad': 18}

In [13]:
# Number of occurrences of each word across all documents
# (listed in order of first appearance in the corpus)
tokenizer.word_counts

OrderedDict([('go', 1),
             ('india', 4),
             ('hip', 2),
             ('hurray', 1),
             ('jeetega', 3),
             ('bhai', 1),
             ('bharat', 1),
             ('mata', 1),
             ('ki', 2),
             ('jai', 2),
             ('kohli', 2),
             ('sachin', 2),
             ('dhoni', 2),
             ('modi', 1),
             ('ji', 1),
             ('inquilab', 1),
             ('zindabad', 1)])

In [14]:
# Number of documents (sentences) the tokenizer was fitted on
tokenizer.document_count

10

In [15]:
# Integer-encode the documents: each word is replaced by its index from
# tokenizer.word_index, giving one variable-length list of ints per document
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[10, 2],
 [2, 2],
 [4, 4, 11],
 [3, 12, 3, 2, 3],
 [13, 14, 5, 6],
 [7, 7],
 [8, 8],
 [9, 9],
 [15, 16, 5, 6],
 [17, 18]]

In [16]:
# Apply post-padding: zeros are appended to the END of each sequence so that
# all documents have the same length. No maxlen is given, so the result is as
# wide as the longest document (5 tokens here).
padded_sequences = pad_sequences(sequences, padding = 'post')
padded_sequences

array([[10,  2,  0,  0,  0],
       [ 2,  2,  0,  0,  0],
       [ 4,  4, 11,  0,  0],
       [ 3, 12,  3,  2,  3],
       [13, 14,  5,  6,  0],
       [ 7,  7,  0,  0,  0],
       [ 8,  8,  0,  0,  0],
       [ 9,  9,  0,  0,  0],
       [15, 16,  5,  6,  0],
       [17, 18,  0,  0,  0]], dtype=int32)

The documents are now `integer encoded`: each word is represented by a single integer, and every document has the same length.

---

### `Embeddings`

In [18]:
# Build a minimal model that contains only an Embedding layer, so the dense
# vector assigned to each integer-encoded word can be inspected directly.

# The embedding table needs one row per possible index. Indices run from 0
# (padding) up to the largest value in word_index, hence the +1. A hard-coded
# smaller value would leave the highest word indices out of range.
vocab_size = len(tokenizer.word_index) + 1

# The input length must match the width of the padded sequences fed to the model.
sequence_length = padded_sequences.shape[1]

model = Sequential()

model.add(Input(shape=(sequence_length,), dtype='int32'))
model.add(
    Embedding(
        input_dim = vocab_size,             # number of rows in the lookup table
        output_dim = 2,                     # size of each word vector
        embeddings_initializer = "uniform",
        embeddings_regularizer = None,
        embeddings_constraint = None,
        mask_zero = False,                  # padding index 0 gets an ordinary (unmasked) vector
        weights = None,
        lora_rank = None,
    )
)

model.summary()

In [29]:
# Compile the model, then pass the padded sequences through the embedding layer
model.compile(optimizer="adam", metrics=["accuracy"])
pred = model.predict(padded_sequences)
# Show the embedding vectors of the first document only, keeping the batch axis
print(pred[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[[[ 0.01088379 -0.00352911]
  [ 0.04362936 -0.00202895]
  [-0.01231851 -0.03417271]
  [-0.01231851 -0.03417271]
  [-0.01231851 -0.03417271]]]
