In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [10]:
train_sentences = [
    'I will be able to do amazing things with AI',
    'The path will not be easy',
    'Will I master AI easily?'
]

In [15]:
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

In [16]:
print(word_index)

{'will': 1, 'i': 2, 'be': 3, 'ai': 4, 'able': 5, 'to': 6, 'do': 7, 'amazing': 8, 'things': 9, 'with': 10, 'the': 11, 'path': 12, 'not': 13, 'easy': 14, 'master': 15, 'easily': 16}


In [17]:
sequences = tokenizer.texts_to_sequences(train_sentences)

In [18]:
print(f'Word index -->{word_index}')
print(f'Sequences of words -->{sequences}')

Word index -->{'will': 1, 'i': 2, 'be': 3, 'ai': 4, 'able': 5, 'to': 6, 'do': 7, 'amazing': 8, 'things': 9, 'with': 10, 'the': 11, 'path': 12, 'not': 13, 'easy': 14, 'master': 15, 'easily': 16}
Sequences of words -->[[2, 1, 3, 5, 6, 7, 8, 9, 10, 4], [11, 12, 1, 13, 3, 14], [1, 2, 15, 4, 16]]


In [19]:
print(train_sentences[0])
print(sequences[0])

I will be able to do amazing things with AI
[2, 1, 3, 5, 6, 7, 8, 9, 10, 4]


## Tokenizing new data with the same tokenizer

In [20]:
new_sentences = [
    'It will not be easy to do things with AI',
    'AI is amazing'
]

In [21]:
new_sequences = tokenizer.texts_to_sequences(new_sentences)

In [22]:
print(new_sentences)

['It will not be easy to do things with AI', 'AI is amazing']


In [23]:
print(new_sequences)

[[1, 13, 3, 14, 6, 7, 9, 10, 4], [4, 8]]


## Replacing newly encountered words with special values

In [24]:
tokenizer = Tokenizer(num_words=100, oov_token = '<oov>')

In [25]:
tokenizer.fit_on_texts(train_sentences)

In [26]:
word_index = tokenizer.word_index

In [27]:
new_sequences = tokenizer.texts_to_sequences(new_sentences)

In [28]:
print(word_index)

{'<oov>': 1, 'will': 2, 'i': 3, 'be': 4, 'ai': 5, 'able': 6, 'to': 7, 'do': 8, 'amazing': 9, 'things': 10, 'with': 11, 'the': 12, 'path': 13, 'not': 14, 'easy': 15, 'master': 16, 'easily': 17}


In [29]:
print(new_sentences)

['It will not be easy to do things with AI', 'AI is amazing']


In [30]:
print(new_sequences)

[[1, 2, 14, 4, 15, 7, 8, 10, 11, 5], [5, 1, 9]]


# Padding the sequences

In [31]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [32]:
padded_seqs = pad_sequences(sequences)

In [33]:
print(word_index)

{'<oov>': 1, 'will': 2, 'i': 3, 'be': 4, 'ai': 5, 'able': 6, 'to': 7, 'do': 8, 'amazing': 9, 'things': 10, 'with': 11, 'the': 12, 'path': 13, 'not': 14, 'easy': 15, 'master': 16, 'easily': 17}


In [34]:
print(train_sentences)

['I will be able to do amazing things with AI', 'The path will not be easy', 'Will I master AI easily?']


In [35]:
print(sequences)

[[2, 1, 3, 5, 6, 7, 8, 9, 10, 4], [11, 12, 1, 13, 3, 14], [1, 2, 15, 4, 16]]


In [36]:
print(padded_seqs)

[[ 2  1  3  5  6  7  8  9 10  4]
 [ 0  0  0  0 11 12  1 13  3 14]
 [ 0  0  0  0  0  1  2 15  4 16]]


## Customising padded sequence with parameters

In [39]:
padded_seqs = pad_sequences(sequences,
                           padding='post',
                           maxlen=10,
                           truncating='post')

In [40]:
print(padded_seqs)

[[ 2  1  3  5  6  7  8  9 10  4]
 [11 12  1 13  3 14  0  0  0  0]
 [ 1  2 15  4 16  0  0  0  0  0]]
