In [5]:
# Import libraries: TensorFlow, Keras, and the Keras text Tokenizer

import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer

In [9]:
# Training corpus: three short weather-related sentences.

train_sentences = [
    "It is a sunny day",
    "It is a cloudy day",
    "Will it rain today?",
]

In [10]:
### Train the Tokenizer
# Create the tokenizer (num_words=100) and fit it on the training sentences
# so that it learns a word -> integer-index mapping.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(train_sentences)

# Keep a handle on the learned vocabulary mapping for inspection.
word_index = tokenizer.word_index

In [11]:
# Show the word -> index mapping stored from the fitted tokenizer.
print(word_index)

{'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9}


In [12]:
### Create Sequences
# Encode each training sentence as a list of integer word indices,
# using the vocabulary held by the fitted tokenizer.

sequences = tokenizer.texts_to_sequences(train_sentences)

In [14]:
# Display the vocabulary and the encoded training sentences, one per line.
print(
    f"Word index --> {word_index}",
    f"Sequences of Words -->{sequences}",
    sep="\n",
)

Word index --> {'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9}
Sequences of Words -->[[1, 2, 3, 5, 4], [1, 2, 3, 6, 4], [7, 1, 8, 9]]


In [15]:
### Tokenizing new data using the same tokenizer
# Unseen sentences: "be", "raining" and "pleasant" do not occur in train_sentences.
new_sentences = [
    "Will it be raining today?",
    "It is a pleasant day.",
]

In [16]:
# Encode the unseen sentences with the already-fitted tokenizer, then show
# the raw text followed by its integer encoding.
new_sequences = tokenizer.texts_to_sequences(new_sentences)
print(new_sentences, new_sequences, sep="\n")

['Will it be raining today?', 'It is a pleasant day.']
[[7, 1, 9], [1, 2, 3, 4]]


In [17]:
### Replacing newly encountered words with special values
# Rebuild the tokenizer, this time with an out-of-vocabulary placeholder
# token, and fit it on the same training sentences.
tokenizer = Tokenizer(num_words=100, oov_token="<oov>")
tokenizer.fit_on_texts(train_sentences)

# Refresh the stored vocabulary mapping from the re-fitted tokenizer.
word_index = tokenizer.word_index

In [18]:
# Re-encode the new sentences with the OOV-aware tokenizer, then show the
# vocabulary followed by the encoded sequences.
new_sequences = tokenizer.texts_to_sequences(new_sentences)

for shown in (word_index, new_sequences):
    print(shown)

{'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9, '<oov>': 10}
[[7, 1, 10, 10, 9], [1, 2, 3, 10, 4]]
