In [20]:
# Importing necessary libraries
import numpy as np
from keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Input, Dense, SimpleRNN, Embedding

In [21]:
# Toy corpus: ten short slogans/cheers, one document per string
docs = [
    "go india",
    "india india",
    "hip hip hurray",
    "jeetega bhai jeetega india jeetega",
    "bharat mata ki jai",
    "kohli kohli",
    "sachin sachin",
    "dhoni dhoni",
    "modi ji ki jai",
    "inquilab zindabad",
]

*   **`num_words=None`**:
    *   This parameter controls the maximum number of words to keep, based on their frequency.
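    *   If set to an integer (e.g., `num_words=10000`), only the `num_words - 1` most frequent words (here 9,999) are used when texts are converted to sequences. Less frequent words still appear in `word_index`, but they are dropped from the encoded output (or mapped to `oov_token` if one is set).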
    *   If set to `None` (as in your example), it means **all words** found in the corpus will be kept and indexed. This can lead to a very large vocabulary if your text data is extensive.

*   **``filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'``**:
    *   This is a string containing characters that will be **filtered out** from your text before tokenization.
    *   By default, it includes most punctuation marks, tabs (`\t`), and newlines (`\n`).
    *   The tokenizer will remove these characters from your input text, effectively cleaning it up before splitting it into words. For example, "Hello, world!" would become "Hello world" after filtering.

*   **`lower=True`**:
    *   This boolean parameter determines whether to convert text to **lowercase** before tokenization.
    *   If `True` (as here), all text will be converted to lowercase. This is a common practice in NLP to treat "The" and "the" as the same word, reducing vocabulary size and improving consistency.
    *   If `False`, the case of the words will be preserved.

*   **`split=' '`**:
    *   This parameter specifies the **delimiter** used to split text into individual words (tokens).
    *   By default, it's a single space (`' '`), meaning words will be separated by spaces.
    *   You could change this if your text uses a different primary delimiter, but for most natural language, a space is appropriate.

*   **`char_level=False`**:
    *   This boolean parameter controls whether tokenization should happen at the **character level** or **word level**.
    *   If `False` (as here), the tokenizer will operate at the word level, treating sequences of characters separated by `split` characters (and after `filters` are applied) as individual tokens.
    *   If `True`, each character in the input text would be treated as a separate token. This is less common for general text processing but can be useful for tasks like character-level language modeling or certain types of sequence-to-sequence problems.

*   **`oov_token=None`**:
    *   `oov` stands for "Out-Of-Vocabulary". This parameter allows you to specify a token that will represent words not found in the tokenizer's vocabulary (i.e., words that were not among the `num_words` most frequent words or were not seen during `fit_on_texts`).
    *   If set to `None` (the default), out-of-vocabulary words will simply be **ignored** (removed) when converting text to sequences of integers.
    *   If you provide a string (as here, with `oov_token="<none>"`), any word not in the vocabulary will be replaced by this token, which is given index 1 in `word_index`. This keeps unseen words from silently disappearing, so every word in the input still occupies a position in the encoded sequence and your model can handle unseen words gracefully.

*   **`analyzer=None`**:
    *   This parameter is typically used for more advanced text processing, allowing you to specify a custom function to preprocess each document before tokenization.
    *   If `None` (as here), the default Keras text processing (applying `filters`, `lower`, and `split`) will be used.
    *   You could provide a callable (a function) here that takes a string as input and returns a list of tokens. This gives you fine-grained control over the tokenization process, for example, if you wanted to implement stemming or lemmatization before tokenizing.
        

In [22]:
# Word-level tokenizer; every constructor option is spelled out for reference
tokenizer = Tokenizer(
    num_words=None,      # no cap on the vocabulary size
    oov_token="<none>",  # stand-in for words that are not in the vocabulary
    lower=True,          # lowercase the text before splitting
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',  # characters filtered out of the text
    split=' ',           # delimiter between tokens
    char_level=False,    # tokens are words, not single characters
    analyzer=None,       # keep the built-in filters/lower/split processing
)

In [23]:
# Fitting on documents: builds the vocabulary that the following cells inspect
# (word_index, word_counts, document_count)
tokenizer.fit_on_texts(docs) # Cannot handle null values by default

In [24]:
# Word -> integer index mapping learned from the documents.
# Index 1 belongs to the OOV token; the remaining words follow in order of
# decreasing frequency. Index 0 is never assigned to a word.
tokenizer.word_index

{'<none>': 1,
 'india': 2,
 'jeetega': 3,
 'hip': 4,
 'ki': 5,
 'jai': 6,
 'kohli': 7,
 'sachin': 8,
 'dhoni': 9,
 'go': 10,
 'hurray': 11,
 'bhai': 12,
 'bharat': 13,
 'mata': 14,
 'modi': 15,
 'ji': 16,
 'inquilab': 17,
 'zindabad': 18}

In [25]:
# Number of occurrences of each word across all documents
# (listed in order of first appearance)
tokenizer.word_counts

OrderedDict([('go', 1),
             ('india', 4),
             ('hip', 2),
             ('hurray', 1),
             ('jeetega', 3),
             ('bhai', 1),
             ('bharat', 1),
             ('mata', 1),
             ('ki', 2),
             ('jai', 2),
             ('kohli', 2),
             ('sachin', 2),
             ('dhoni', 2),
             ('modi', 1),
             ('ji', 1),
             ('inquilab', 1),
             ('zindabad', 1)])

In [26]:
# Number of documents (sentences) the tokenizer was fitted on
tokenizer.document_count

10

In [27]:
# Encoding the documents: each word is replaced by its index from word_index,
# giving one variable-length list of integers per document
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[10, 2],
 [2, 2],
 [4, 4, 11],
 [3, 12, 3, 2, 3],
 [13, 14, 5, 6],
 [7, 7],
 [8, 8],
 [9, 9],
 [15, 16, 5, 6],
 [17, 18]]

In [28]:
# Right-pad every sequence with zeros up to the length of the longest document
padded_sequences = pad_sequences(sequences, padding='post')
padded_sequences

array([[10,  2,  0,  0,  0],
       [ 2,  2,  0,  0,  0],
       [ 4,  4, 11,  0,  0],
       [ 3, 12,  3,  2,  3],
       [13, 14,  5,  6,  0],
       [ 7,  7,  0,  0,  0],
       [ 8,  8,  0,  0,  0],
       [ 9,  9,  0,  0,  0],
       [15, 16,  5,  6,  0],
       [17, 18,  0,  0,  0]], dtype=int32)

## Embeddings:
1.  **`Input`:** It takes **integer-encoded words** (e.g., "cat" is 5, "dog" is 12), not one-hot encodings. This is a key difference.
2.  **`Lookup Table`:** It acts like a **lookup table**. For each integer input, it retrieves a corresponding dense vector (embedding).
3.  **`Learned Vectors`:** These dense vectors are **learnable parameters** of the model. During training, the model adjusts these vectors so that words with similar meanings or contexts end up with similar embedding vectors.
4.  **`Output`:** It outputs a sequence of these dense vectors, one for each word in the input sequence.   

### Key Benefits of Embeddings:
1. Captures the semantic meaning of words (multiple dimensions (aspects) per word)
2. Trained through backpropagation - learns from actual data
3. Flexible dimensionality - can adjust embedding size as needed

`Note` - Choose your text processing method based on your task:
- Use Embeddings for: text generation, summarization, and question answering
- Use BOW or TF-IDF for: recommender systems and spam detection

In [29]:
# Embeddings from keras
# input_dim has to be larger than the biggest index the layer will receive.
# word_index covers indices 1..len(word_index) (OOV token included) and
# padding adds index 0, hence the +1 (19 for this corpus). Deriving it from
# the tokenizer keeps the layer in sync if the documents change.
vocab_size = len(tokenizer.word_index) + 1

embedding = Embedding(
    input_dim = vocab_size,
    output_dim = 2,                     # each word index maps to a 2-dimensional vector
    embeddings_initializer = "normal",  # random start, so the printed values differ between runs
    embeddings_regularizer = None,
    embeddings_constraint = None,
    mask_zero = False,                  # padding index 0 gets an ordinary embedding row of its own
    weights = None,
    lora_rank = None,
)

# Embeddings for first document
embedding(padded_sequences[0])

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[-0.08723129,  0.05023347],
       [ 0.0454735 ,  0.08788645],
       [ 0.03264703,  0.0687113 ],
       [ 0.03264703,  0.0687113 ],
       [ 0.03264703,  0.0687113 ]], dtype=float32)>