In [1]:
# Import Necessary packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Tokenization example: learn a word -> index vocabulary from a tiny corpus
sentences = [
    'Dogs are very loyal to their owners.',
    'Nature is the one of the most beautiful gift',
]

# Fit the tokenizer on the corpus so it builds its vocabulary
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Mapping from each (lower-cased) word in `sentences` to its integer index
word_indexes = tokenizer.word_index
print(word_indexes)

{'the': 1, 'dogs': 2, 'are': 3, 'very': 4, 'loyal': 5, 'to': 6, 'their': 7, 'owners': 8, 'nature': 9, 'is': 10, 'one': 11, 'of': 12, 'most': 13, 'beautiful': 14, 'gift': 15}


In [3]:
# Sequences
# Encode every sentence as the list of integer indexes of its words,
# using the vocabulary the tokenizer learned above.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[2, 3, 4, 5, 6, 7, 8], [9, 10, 1, 11, 12, 1, 13, 14, 15]]


In [4]:
# Tokenize unseen sentences with the tokenizer that was fitted on `sentences`.
# The encoded result is stored under its own name so that `new_sentences`
# keeps holding the raw text instead of being overwritten with lists of ints.
new_sentences = ['Spending time with the nature is very soulful moment', 'Sleeping is the best medicine']
new_sequences = tokenizer.texts_to_sequences(new_sentences)
print(new_sequences)


[[1, 9, 10, 4], [10, 1]]


In [5]:
# The cell above shows that words missing from the vocabulary get no token at all.
# To keep a placeholder for such words, configure the tokenizer with an
# oov_token (out-of-vocabulary token) that stands in for every unseen word.

tokenizer = Tokenizer(oov_token='<oov>', num_words=100)
tokenizer.fit_on_texts(sentences)

# The OOV token is now part of the vocabulary
word_index = tokenizer.word_index
print(word_index)

# Re-encode the training sentences with the new vocabulary
out_sequences = tokenizer.texts_to_sequences(sentences)
print(out_sequences)

{'<oov>': 1, 'the': 2, 'dogs': 3, 'are': 4, 'very': 5, 'loyal': 6, 'to': 7, 'their': 8, 'owners': 9, 'nature': 10, 'is': 11, 'one': 12, 'of': 13, 'most': 14, 'beautiful': 15, 'gift': 16}
[[3, 4, 5, 6, 7, 8, 9], [10, 11, 2, 12, 13, 2, 14, 15, 16]]


In [6]:
# Now encode the unseen sentences with the OOV-aware tokenizer.
# The tokenizer is deliberately NOT re-fitted on these sentences
# (no fit_on_texts call here), so its vocabulary stays unchanged.
new_sentences = [
    'Spending time with the nature is very soulful moment',
    'Sleeping is the best medicine',
]

out_sequences = tokenizer.texts_to_sequences(new_sentences)
print(out_sequences)
# index 1 is the '<oov>' token, i.e. the word is out of vocabulary

[[1, 1, 1, 2, 10, 11, 5, 1, 1], [1, 11, 2, 1, 1]]


In [7]:
# Pad Sequences 

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
# Padding with pad_sequences' default settings: every sequence is brought
# to the same length (see the zeros added at the front in the output).
padded_sequences = pad_sequences(out_sequences)

# Show the raw text, the vocabulary, the encoded sequences and the padded array,
# one item per line.
print(new_sentences, word_index, out_sequences, padded_sequences, sep="\n")


['Spending time with the nature is very soulful moment', 'Sleeping is the best medicine']
{'<oov>': 1, 'the': 2, 'dogs': 3, 'are': 4, 'very': 5, 'loyal': 6, 'to': 7, 'their': 8, 'owners': 9, 'nature': 10, 'is': 11, 'one': 12, 'of': 13, 'most': 14, 'beautiful': 15, 'gift': 16}
[[1, 1, 1, 2, 10, 11, 5, 1, 1], [1, 11, 2, 1, 1]]
[[ 1  1  1  2 10 11  5  1  1]
 [ 0  0  0  0  1 11  2  1  1]]


In [9]:
# Customise the padding: force a length of 6, with both the padding zeros
# and the truncation applied at the end ("post") of each sequence.
padded_seqs = pad_sequences(
    out_sequences,
    maxlen=6,
    padding="post",
    truncating="post",
)
print(padded_seqs)


[[ 1  1  1  2 10 11]
 [ 1 11  2  1  1  0]]
