In [109]:
## import the tensorflow APIs

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Define training sentences

In [130]:
## training corpus: the sentences the tokenizer will be fitted on
train_sentences = [
    "It is a sunny day",
    "It is a raining day",
    "It is a cloudy day! hi",
]

Set up the tokenizer

In [131]:
## instantiate the tokenizer
## (num_words=20 is presumably the vocabulary-size limit -- confirm against the Keras Tokenizer docs)
tokenizer = Tokenizer(num_words=20)
## train the tokenizer on training sentences so it builds its vocabulary from them
tokenizer.fit_on_texts(train_sentences)

## store the word -> integer index mapping of the fitted tokenizer for inspection in later cells
word_index = tokenizer.word_index

WORD INDEX

In [132]:
## display the word -> index mapping obtained by fitting on train_sentences
print(word_index)

{'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'raining': 6, 'cloudy': 7, 'hi': 8}


Create sequences

In [113]:
### create sequences using tokenizer: each training sentence becomes a list of
### integer word indices taken from the fitted vocabulary
sequences = tokenizer.texts_to_sequences(train_sentences)

In [114]:
## show the vocabulary mapping and the encoded training sentences, one per line
print(
    f"Word index -> {word_index}",
    f"Sequences of words -> {sequences}",
    sep="\n",
)

Word index -> {'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'raining': 6, 'cloudy': 7, 'hi': 8}
Sequences of words -> [[1, 2, 3, 5, 4], [1, 2, 3, 6, 4], [1, 2, 3, 7, 4, 8]]


In [115]:
## show the first training sentence followed by its encoded form
print(train_sentences[0], sequences[0], sep="\n")

It is a sunny day
[1, 2, 3, 5, 4]


Tokenizing new data using the same tokenizer

In [116]:
## sentences that were not part of the training corpus; they mix words from
## train_sentences with words that do not appear there
new_sentences = [
    "Will it be raining today?",
    "It is a pleasant day.",
    "Today is a nice day with the blue night!",
]

In [117]:
## encode the new sentences with the tokenizer that was fitted on train_sentences
new_sequences = tokenizer.texts_to_sequences(new_sentences)

In [118]:
## show the raw new sentences followed by their encoded sequences
print(new_sentences, new_sequences, sep="\n")

['Will it be raining today?', 'It is a pleasant day.', 'Today is a nice day with the blue night!']
[[1, 6], [1, 2, 3, 4], [2, 3, 4]]


Replacing newly encountered words with special values

In [126]:
## rebuild the tokenizer, this time registering "<oov>" as the oov_token
tokenizer = Tokenizer(
    oov_token="<oov>",
    num_words=100,
)

## fit the fresh tokenizer on the same training sentences
tokenizer.fit_on_texts(train_sentences)

## refresh word_index so it reflects the new tokenizer's vocabulary
word_index = tokenizer.word_index

In [120]:
## encode the new sentences with the current tokenizer
new_sequences = tokenizer.texts_to_sequences(new_sentences)

## show the vocabulary mapping and the resulting sequences, one per line
print(word_index, new_sequences, sep="\n")

{'<oov>': 1, 'it': 2, 'is': 3, 'a': 4, 'day': 5, 'sunny': 6, 'raining': 7, 'cloudy': 8, 'hi': 9}
[[1, 2, 1, 7, 1], [2, 3, 4, 1, 5], [1, 3, 4, 1, 5, 1, 1, 1, 1]]


Without padding sequences

In [121]:
## sequences before padding: each inner list keeps its own length
print(new_sequences)

[[1, 2, 1, 7, 1], [2, 3, 4, 1, 5], [1, 3, 4, 1, 5, 1, 1, 1, 1]]


Pad Sequences

In [122]:
## pad sequences with the default settings so every row ends up the same length
padded_seqs = pad_sequences(new_sequences)
## display the padded result
print(padded_seqs)

[[0 0 0 0 1 2 1 7 1]
 [0 0 0 0 2 3 4 1 5]
 [1 3 4 1 5 1 1 1 1]]


Customising your padded sequences with parameters

In [123]:
## pad sequences with an explicit target length, padding side and truncation side
padded_seqs_custom = pad_sequences(
    new_sequences,
    maxlen=7,
    padding="post",
    truncating="pre",
)

In [124]:
## display the custom-padded sequences
print(padded_seqs_custom)

[[1 2 1 7 1 0 0]
 [2 3 4 1 5 0 0]
 [4 1 5 1 1 1 1]]
