In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import layers

In [12]:
# Padding sequence data
# When processing sequence data, it is very common for individual samples to have different lengths.
# Consider the following example (text tokenized as words).
# NOTE: this first literal is a bare expression that is not the last one in the cell,
# so it is evaluated and discarded; it is here for illustration only.

[
  ["The", "weather", "will", "be", "nice", "tomorrow"],
  ["How", "are", "you", "doing", "today"],
  ["Hello", "world", "!"]
]


# After vocabulary lookup, the data might be vectorized as integers, e.g.:
# (this literal is the last expression in the cell, so it is the value the cell displays)

[
  [83, 91, 1, 645, 1253, 927],
  [73, 8, 3215, 55, 927],
  [71, 1331, 4231]
]

[[83, 91, 1, 645, 1253, 927], [73, 8, 3215, 55, 927], [71, 1331, 4231]]

In [13]:
# The data is a nested list whose samples have lengths 6, 5 and 3 respectively.
# A deep learning model expects its input as a single rectangular tensor, so the
# shorter samples have to be padded (or the longer ones truncated) to a common length.
#
# Keras provides a helper that does exactly this:
# tf.keras.preprocessing.sequence.pad_sequences.

raw_inputs = [
    [83, 91, 1, 645, 1253, 927],
    [73, 8, 3215, 55, 927],
    [711, 632, 71],
]

# Padding uses 0 unless a different `value` argument is supplied. The pad can be
# placed at the beginning ("pre") or at the end ("post") of each sequence; "post"
# is used here, which is the recommended choice when the result feeds RNN layers.
padded_inputs = pad_sequences(sequences=raw_inputs, padding="post")

print(padded_inputs)

[[  83   91    1  645 1253  927]
 [  73    8 3215   55  927    0]
 [ 711  632   71    0    0    0]]


In [14]:
# Masking : Now that all samples have a uniform length, the model must be informed that some part of 
# the data is actually padding and should be ignored. That mechanism is masking.

# There are three ways to introduce input masks in Keras models:
# 1. Add a keras.layers.Masking layer.
# 2. Configure a keras.layers.Embedding layer with mask_zero=True.
# 3. Pass a mask argument manually when calling layers that support this argument (e.g. RNN layers)

In [29]:
# 1. Mask-generating layers: Embedding and Masking

# Embedding's input_dim is the vocabulary size and must be strictly greater than the
# largest token index the layer is asked to look up. padded_inputs contains indices
# up to 3215, so a smaller table (e.g. 1000 rows) would leave some tokens out of range.
vocab_size = 5000
assert padded_inputs.max() < vocab_size, "token index outside the embedding vocabulary"

# mask_zero=True treats index 0 as padding and makes the layer produce a boolean mask.
embedding = layers.Embedding(input_dim=vocab_size, output_dim=16, mask_zero=True)
masked_output = embedding(padded_inputs)

# The mask is attached to the layer's output tensor; it has one boolean per
# (sample, timestep) position, derived from a "not equal to 0" test on the input.
print(masked_output._keras_mask)

Tensor("embedding_12/NotEqual:0", shape=(3, 6), dtype=bool)
