In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Example corpus: four short sentences that share most of their vocabulary.
sentences = [
    'I love my dog',
    'I love my cat',
    'Do you think my dog is amazing!',
    'You love my dog?',
]

# num_words caps how many of the most frequent words are used when encoding.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Word -> integer index mapping learned from the corpus
# (keys come out lower-cased and without punctuation).
word_index = tokenizer.word_index

# Encode each sentence as a list of word indices.
sequences = tokenizer.texts_to_sequences(sentences)

{'love': 2, 'think': 8, 'cat': 6, 'amazing': 10, 'dog': 3, 'my': 1, 'i': 4, 'do': 7, 'you': 5, 'is': 9}
[[4, 2, 1, 3], [4, 2, 1, 6], [7, 5, 8, 1, 3, 9, 10], [5, 2, 1, 3]]


# Making all sentences the same length using padding

In [6]:
# Hand the encoded sentences to pad_sequences so that every row
# of the result has the same length.
padded = pad_sequences(
    sequences,
)

#The matrix width equals the length of the longest sentence; shorter sentences are padded with zeros at the front

In [8]:
# Show the vocabulary, the encoded sentences and the padded matrix,
# one per line.
print(word_index, sequences, padded, sep='\n')

{'love': 2, 'think': 8, 'cat': 6, 'amazing': 10, 'dog': 3, 'my': 1, 'i': 4, 'do': 7, 'you': 5, 'is': 9}
[[4, 2, 1, 3], [4, 2, 1, 6], [7, 5, 8, 1, 3, 9, 10], [5, 2, 1, 3]]
[[ 0  0  0  4  2  1  3]
 [ 0  0  0  4  2  1  6]
 [ 7  5  8  1  3  9 10]
 [ 0  0  0  5  2  1  3]]


# Padding with zeros after the sentence (post-padding)

In [9]:
# padding='post' appends the zeros after each sentence's tokens
# instead of placing them in front.
padded1 = pad_sequences(
    sequences,
    padding='post',
)
padded1

array([[ 4,  2,  1,  3,  0,  0,  0],
       [ 4,  2,  1,  6,  0,  0,  0],
       [ 7,  5,  8,  1,  3,  9, 10],
       [ 5,  2,  1,  3,  0,  0,  0]], dtype=int32)

# Limiting the maximum sentence length (matrix width) with maxlen

#We lose information from the start of the sentence: by default, sequences longer than maxlen are truncated at the front

In [11]:
# Cap every row at 6 tokens. No truncating argument is given, so the
# 7-token sentence is shortened using the default behaviour, which
# drops its leading token (see the output below).
padded2 = pad_sequences(
    sequences,
    maxlen=6,
    padding='post',
)
padded2

array([[ 4,  2,  1,  3,  0,  0],
       [ 4,  2,  1,  6,  0,  0],
       [ 5,  8,  1,  3,  9, 10],
       [ 5,  2,  1,  3,  0,  0]], dtype=int32)

# Losing info at the end of the sentence instead

In [12]:
# Same 6-token cap, but truncating='post' cuts over-long sentences at
# the end, so their first tokens are kept.
# NOTE: this rebinds padded2, replacing any value it held before.
padded2 = pad_sequences(
    sequences,
    maxlen=6,
    padding='post',
    truncating='post',
)
padded2

array([[4, 2, 1, 3, 0, 0],
       [4, 2, 1, 6, 0, 0],
       [7, 5, 8, 1, 3, 9],
       [5, 2, 1, 3, 0, 0]], dtype=int32)