In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

Define Training Sentences

In [None]:
# Three short example sentences used to fit the tokenizer.
# Note the mixed casing ("Sunny") and the trailing "?" in the last one.
train_sentences = [
    "It is a Sunny day",
    "It is a cloudy day",
    "Will it rain today?",
]

Set up the Tokenizer

In [None]:
# Instantiate the tokenizer. num_words=100 is well above the 9 distinct words
# in this small corpus.
# NOTE(review): tf.keras.preprocessing.text.Tokenizer is marked deprecated in
# recent TensorFlow releases in favour of TextVectorization — confirm against
# the installed version.
tokenizer = Tokenizer(num_words=100)

# Train the tokenizer on the training sentences
tokenizer.fit_on_texts(train_sentences)

# Store the word -> integer index mapping for the words in the sentences
word_index = tokenizer.word_index

In [None]:
# Display the learned vocabulary: words are lower-cased, punctuation is
# dropped, and indices start at 1.
word_index

{'it': 1,
 'is': 2,
 'a': 3,
 'day': 4,
 'sunny': 5,
 'cloudy': 6,
 'will': 7,
 'rain': 8,
 'today': 9}

Create Sequences

In [None]:
# Convert each training sentence into a list of integer word indices,
# using the vocabulary learned by the tokenizer.
sequence = tokenizer.texts_to_sequences(train_sentences)

In [None]:
# Show the vocabulary and the encoded sentences on consecutive lines so each
# index can be matched back to its word.
print(word_index, sequence, sep="\n")

{'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9}
[[1, 2, 3, 5, 4], [1, 2, 3, 6, 4], [7, 1, 8, 9]]


Tokenizing new data using the same tokenizer

In [None]:
# Sentences the tokenizer has not been fitted on; they contain words
# ("be", "raining", "Hello", "World") that are absent from the training corpus.
new_sentences = [
    "Will it be raining today?",
    "Hello World",
]

In [None]:
# Encode the unseen sentences with the already-fitted tokenizer
# (it is not refitted on them).
sequence2 = tokenizer.texts_to_sequences(new_sentences)

In [None]:
# Words missing from the fitted vocabulary are silently dropped, so
# 'Hello World' encodes to an empty list.
sequence2

[[7, 1, 9], []]

Replacing newly encountered (out-of-vocabulary) words with a special OOV token

In [None]:
# Rebuild the tokenizer, this time reserving "<oov>" as the placeholder for
# words that were never seen during fitting.
tokenizer = Tokenizer(oov_token="<oov>", num_words=100)
tokenizer.fit_on_texts(train_sentences)

# With an OOV token configured, "<oov>" takes index 1 and every real word
# shifts up by one compared with the earlier vocabulary.
word_index2 = tokenizer.word_index
word_index2

{'<oov>': 1,
 'it': 2,
 'is': 3,
 'a': 4,
 'day': 5,
 'sunny': 6,
 'cloudy': 7,
 'will': 8,
 'rain': 9,
 'today': 10}

In [None]:
# Re-encode the unseen sentences: out-of-vocabulary words now map to the
# "<oov>" index (1) instead of being dropped.
sequence3 = tokenizer.texts_to_sequences(new_sentences)
sequence3

[[8, 2, 1, 1, 10], [1, 1]]

Padding the Sequences -> To make all inputs to the neural network the same length

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# New training corpus with sentences of different lengths (3 to 7 words),
# which is what makes padding necessary. This rebinds `train_sentences`.
train_sentences = [
    "It will rain",
    "The weather is cloudy",
    "Will it be raining today?",
    "It is a super hot day! day !",
]

In [None]:
# Fit a fresh OOV-aware tokenizer on the new training corpus.
tokenizer = Tokenizer(oov_token="<oov>", num_words=100)
tokenizer.fit_on_texts(train_sentences)

# Vocabulary for the new corpus; this rebinds `word_index2`.
word_index2 = tokenizer.word_index
word_index2

{'<oov>': 1,
 'it': 2,
 'will': 3,
 'is': 4,
 'day': 5,
 'rain': 6,
 'the': 7,
 'weather': 8,
 'cloudy': 9,
 'be': 10,
 'raining': 11,
 'today': 12,
 'a': 13,
 'super': 14,
 'hot': 15}

In [None]:
# Encode the training sentences; the resulting sequences have different
# lengths (3 to 7 tokens), so they cannot yet form a rectangular batch.
sequence4 = tokenizer.texts_to_sequences(train_sentences)
sequence4

[[2, 3, 6], [7, 8, 4, 9], [3, 2, 10, 11, 12], [2, 4, 13, 14, 15, 5, 5]]

In [None]:
# Default padding: zeros are added at the front of each row so that every
# row is as long as the longest sequence (7 tokens here).
padded_seqs = pad_sequences(sequence4)

# Print each stage in order: vocabulary, raw text, integer sequences,
# padded matrix.
print(word_index2, train_sentences, sequence4, padded_seqs, sep="\n")

{'<oov>': 1, 'it': 2, 'will': 3, 'is': 4, 'day': 5, 'rain': 6, 'the': 7, 'weather': 8, 'cloudy': 9, 'be': 10, 'raining': 11, 'today': 12, 'a': 13, 'super': 14, 'hot': 15}
['It will rain', 'The weather is cloudy', 'Will it be raining today?', 'It is a super hot day! day !']
[[2, 3, 6], [7, 8, 4, 9], [3, 2, 10, 11, 12], [2, 4, 13, 14, 15, 5, 5]]
[[ 0  0  0  0  2  3  6]
 [ 0  0  0  7  8  4  9]
 [ 0  0  3  2 10 11 12]
 [ 2  4 13 14 15  5  5]]


Customize padded sequence

In [None]:
# Force a fixed length of 5: shorter sequences get zeros appended at the end,
# and longer ones lose their trailing tokens.
padded_seqs = pad_sequences(
    sequence4,
    maxlen=5,
    padding="post",
    truncating="post",
)

In [None]:
# Every row now has exactly 5 entries: short rows end in zeros, and the
# 7-token row is cut down to its first 5 tokens.
print(padded_seqs)

[[ 2  3  6  0  0]
 [ 7  8  4  9  0]
 [ 3  2 10 11 12]
 [ 2  4 13 14 15]]
