In [17]:
# import the tensorflow APIs
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [39]:
# Toy training corpus: three short weather sentences used to fit the tokenizer.
train_sentences = [
    "It is a sunny day",
    "It is a cloudy day",
    "Will it rain today?",
]

In [40]:
# Instantiate the tokenizer. num_words=100 is the vocabulary-size limit passed
# to Keras; the 9-word toy corpus used here stays well below it.
tokenizer = Tokenizer(num_words=100)

# Fit the tokenizer on the training sentences so it learns their vocabulary.
tokenizer.fit_on_texts(train_sentences)

# Keep the learned word -> integer index mapping for inspection in later cells.
word_index = tokenizer.word_index

In [41]:
# Show the learned vocabulary: each (lowercased) word mapped to its integer index.
print(word_index)

{'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9}


In [42]:
# Encode every training sentence as a list of word indices with the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(train_sentences)

In [43]:
# Print the word index dictionary and the encoded sequences for comparison.
print(f"Word Index -->{word_index}")
print(f"Sequence of Words -->{sequences}")

Word Index -->{'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9}
Sequence of Words -->[[1, 2, 3, 5, 4], [1, 2, 3, 6, 4], [7, 1, 8, 9]]


In [47]:
# Show the first training sentence next to its integer-encoded form.
print(train_sentences[0])
print(sequences[0])

It is a sunny day
[1, 2, 3, 5, 4]


In [48]:
# Unseen sentences to encode with the already-fitted tokenizer. They contain
# words ("be", "raining", "pleasant") that are not in the training corpus.
new_sentences = [
    "Will it be raining today?",
    "It is a pleasant day.",
]

In [50]:
# Encode the new sentences with the tokenizer fitted above. That tokenizer has
# no oov_token, so words outside its vocabulary are dropped from the resulting
# sequences instead of receiving an index (see the printed output below).
new_sequences = tokenizer.texts_to_sequences(new_sentences)

In [51]:
# Display the new sentences and their encodings to see which words survived.
print(new_sentences)
print(new_sequences)

['Will it be raining today?', 'It is a pleasant day.']
[[7, 1, 9], [1, 2, 3, 4]]


In [52]:
### Replacing newly encountered words with special values

# Re-create the tokenizer, this time with an out-of-vocabulary token so that
# unseen words are encoded as '<oov>' instead of being silently skipped.
# NOTE: this rebinds `tokenizer`, replacing the one created earlier.
tokenizer = Tokenizer(num_words=100, oov_token='<oov>')

# Fit the new tokenizer on the same training sentences.
tokenizer.fit_on_texts(train_sentences)

# Refresh the word index; '<oov>' takes index 1 and the real words start at 2.
word_index = tokenizer.word_index

In [53]:
# Encode the new sentences with the oov-aware tokenizer, then print the word
# index, the sentences and their sequences. Unseen words now show up as 1
# (the '<oov>' index) rather than disappearing.
new_sequences = tokenizer.texts_to_sequences(new_sentences)
print(word_index)
print(new_sentences)
print(new_sequences)

{'<oov>': 1, 'it': 2, 'is': 3, 'a': 4, 'day': 5, 'sunny': 6, 'cloudy': 7, 'will': 8, 'rain': 9, 'today': 10}
['Will it be raining today?', 'It is a pleasant day.']
[[8, 2, 1, 1, 10], [2, 3, 4, 1, 5]]


## Padding the Sequence

In [55]:
#import the APIs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [57]:
# Training corpus for the padding demo: four sentences of different lengths
# (3 to 6 words), so their encoded sequences will not line up without padding.
train_sentences = [
    "It will rain",
    "The weather is cloudy",
    "Will it be raining today?",
    "It is a super hot day",
]

In [59]:
### Train the tokenizer

# Re-create the tokenizer with the '<oov>' out-of-vocabulary token.
tokenizer = Tokenizer(num_words=100, oov_token='<oov>')

# Fit it on the current training sentences to rebuild the vocabulary.
tokenizer.fit_on_texts(train_sentences)

# Store the word -> index mapping learned from these sentences.
word_index = tokenizer.word_index

In [60]:
# Convert every training sentence into its list of word indices.
sequences = tokenizer.texts_to_sequences(train_sentences)

In [61]:
# Pad with the default settings: zeros are added at the front of shorter
# sequences so every row is as long as the longest one (6 tokens here).
padded_seqs = pad_sequences(sequences)

In [63]:
# Inspect each stage in turn: vocabulary, raw sentences, encoded sequences,
# and the padded array.
print(word_index)
print(train_sentences)
print(sequences)
print(padded_seqs)

{'<oov>': 1, 'it': 2, 'will': 3, 'is': 4, 'rain': 5, 'the': 6, 'weather': 7, 'cloudy': 8, 'be': 9, 'raining': 10, 'today': 11, 'a': 12, 'super': 13, 'hot': 14, 'day': 15}
['It will rain', 'The weather is cloudy', 'Will it be raining today?', 'It is a super hot day']
[[2, 3, 5], [6, 7, 4, 8], [3, 2, 9, 10, 11], [2, 4, 12, 13, 14, 15]]
[[ 0  0  0  2  3  5]
 [ 0  0  6  7  4  8]
 [ 0  3  2  9 10 11]
 [ 2  4 12 13 14 15]]


In [65]:
### Customizing your padded sequence with parameters

# Force a fixed length of 5: shorter sequences get zeros appended at the end,
# and longer ones lose the tokens after the fifth.
padded_seqs = pad_sequences(
    sequences,
    maxlen=5,
    padding="post",
    truncating="post",
)

In [66]:
# Show the fixed-width result: every row now has exactly 5 entries.
print(padded_seqs)

[[ 2  3  5  0  0]
 [ 6  7  4  8  0]
 [ 3  2  9 10 11]
 [ 2  4 12 13 14]]
