In [21]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [22]:
# Tiny toy corpus: four short phrases with mixed casing and punctuation,
# two of which share the word "talking".
sentences = [
    "stop talking",
    "Shut up!",
    "Too much talking",
    "Enough...",
]

In [23]:
# num_words caps how many of the most frequent words are used when encoding texts
# (keeps the data smaller and more representative); oov_token is the placeholder
# that stands in for any word outside the vocabulary.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)

In [24]:
# Build the vocabulary from the toy corpus; tokenizer.word_index is read right after this.
tokenizer.fit_on_texts(sentences)

In [25]:
# Mapping word -> integer index learned from the sentences. In the output below the words
# are lower-cased, punctuation is gone, and index 1 is taken by the '<OOV>' token.
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'talking': 2,
 'stop': 3,
 'shut': 4,
 'up': 5,
 'too': 6,
 'much': 7,
 'enough': 8}

In [26]:
# Encode each sentence as a list of word indices taken from word_index
# (e.g. 'stop talking' -> [3, 2]).
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[3, 2], [4, 5], [6, 7, 2], [8]]

In [27]:
# Encode a sentence made up only of words that do not appear in the fitted sentences.
sequence_oov = tokenizer.texts_to_sequences(['Quandale Dingle here!'])

In [28]:
# Show how the unseen sentence was encoded.
sequence_oov

[[1, 1, 1]]

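Words that were not seen while fitting the tokenizer are called OOV (out-of-vocabulary) words. Because the tokenizer was created with `oov_token='<OOV>'`, each of them is encoded as index 1.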

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
# With the default arguments every sequence is brought to the length of the longest one
# (3 here), and the padding zeros are added at the front.
padded = pad_sequences(sequences)
padded

array([[0, 3, 2],
       [0, 4, 5],
       [6, 7, 2],
       [0, 0, 8]], dtype=int32)

In [32]:
# NICE!

In [33]:
# padding='post' moves the padding zeros to the end of each sequence.
padded_post = pad_sequences(sequences, padding='post')
padded_post

array([[3, 2, 0],
       [4, 5, 0],
       [6, 7, 2],
       [8, 0, 0]], dtype=int32)

In [34]:
# here the padding zeros come after the encoded words

In [35]:
# Cap every row at two tokens. truncating='pre' is the documented default, spelled out
# here to make it visible that over-long sequences lose their leading tokens.
padded_short = pad_sequences(sequences, maxlen=2, padding='post', truncating='pre')
padded_short

array([[3, 2],
       [4, 5],
       [7, 2],
       [8, 0]], dtype=int32)

In [36]:
# it cut off the first part of our 3-word sequence (the third sentence): [6, 7, 2] became [7, 2]

In [37]:
# we can also tell it to cut off the end of the sequence instead, with truncating='post'

In [38]:
# Same two-token cap, but truncating='post' drops tokens from the end, so the
# three-word sentence keeps its first two indices.
padded_short = pad_sequences(sequences, maxlen=2, truncating='post', padding='post')
padded_short

array([[3, 2],
       [4, 5],
       [6, 7],
       [8, 0]], dtype=int32)