imports

In [2]:
import tensorflow as tf

sample inputs

In [3]:
# Toy corpus used throughout the notebook: four short sentences with mixed
# casing and trailing punctuation.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

Initialize the layer and build the vocabulary, taking special tokens/characters into account

In [4]:
# Create a TextVectorization layer with its default settings.
vectorize_layer = tf.keras.layers.TextVectorization()

# Build the vocabulary from the sample sentences.
vectorize_layer.adapt(sentences)

# Keep the learned vocabulary for inspection; `include_special_tokens=True`
# is the default and is spelled out here only for clarity.
vocabulary = vectorize_layer.get_vocabulary(include_special_tokens=True)

I0000 00:00:1754251400.174784   26726 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2946 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [5]:
# List every vocabulary entry next to the integer index it is encoded as.
for index, word in enumerate(vocabulary):
  print(f'{index} {word}')

0 
1 [UNK]
2 my
3 love
4 dog
5 you
6 i
7 think
8 is
9 do
10 cat
11 amazing


Converting an input sentence to an integer sequence

In [6]:
# A single sentence to encode.
sample_input = 'I love my dog'

# Calling the layer on one string returns a tensor of token indices.
sequence = vectorize_layer(sample_input)
print(sequence)

tf.Tensor([6 3 2 4], shape=(4,), dtype=int64)


getting new tokens

In [None]:
# Print the vocabulary index/token pairs (uses the `vocabulary` list that
# was fetched from the layer in an earlier cell).
for index, word in enumerate(vocabulary):
  print(f'{index} {word}')

including the special tokens

In [None]:
# Re-fetch the vocabulary from the layer; `include_special_tokens=True` is
# the default and is written out so the intent of this cell is explicit.
vocabulary = vectorize_layer.get_vocabulary(include_special_tokens=True)

# Print each token next to its integer index.
for index, word in enumerate(vocabulary):
  print(f'{index} {word}')

In [7]:
# Wrap the sentences in a tf.data.Dataset and map the vectorization layer
# over its elements, giving one integer sequence per sentence.
sentences_dataset = tf.data.Dataset.from_tensor_slices(sentences)
sequences = sentences_dataset.map(vectorize_layer)

# Show each original sentence beside its encoded form.
for sentence, sequence in zip(sentences, sequences):
  print(f'{sentence} ---> {sequence}')

I love my dog ---> [6 3 2 4]
I love my cat ---> [ 6  3  2 10]
You love my dog! ---> [5 3 2 4]
Do you think my dog is amazing? ---> [ 9  5  7  2  4  8 11]


Using padding: vectorizing the whole list at once post-pads the shorter sequences with zeros

In [8]:
# Vectorize the whole list in one call; the result is a single tensor.
sequences_post = vectorize_layer(sentences)

# Each print emits its label and value on separate lines.
print('INPUT:', sentences, sep='\n')
print()

print('OUTPUT:', sequences_post, sep='\n')

INPUT:
['I love my dog', 'I love my cat', 'You love my dog!', 'Do you think my dog is amazing?']

OUTPUT:
tf.Tensor(
[[ 6  3  2  4  0  0  0]
 [ 6  3  2 10  0  0  0]
 [ 5  3  2  4  0  0  0]
 [ 9  5  7  2  4  8 11]], shape=(4, 7), dtype=int64)


pre-padding

In [9]:
# Pre-pad: zeros are inserted in front of the shorter sequences so that all
# rows end up as long as the longest one.
sequences_pre = tf.keras.utils.pad_sequences(sequences, padding='pre')

print('INPUT:')
# Plain loop rather than a list comprehension: printing is a side effect, so
# there is no reason to build (and discard) a list of None values.
for sequence in sequences:
  print(sequence.numpy())
print()

print('OUTPUT:')
print(sequences_pre)

INPUT:
[6 3 2 4]
[ 6  3  2 10]
[5 3 2 4]
[ 9  5  7  2  4  8 11]

OUTPUT:
[[ 0  0  0  6  3  2  4]
 [ 0  0  0  6  3  2 10]
 [ 0  0  0  5  3  2  4]
 [ 9  5  7  2  4  8 11]]


2025-08-04 01:38:46.450077: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-08-04 01:38:46.530711: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


pre-padding with truncation to a maximum length of 5

In [10]:
# NOTE(review): despite "post" in the variable name, this call PRE-pads.
# The name is kept so that any other cell referring to it keeps working;
# switch to padding='post' (and re-run) if post-padding was intended.
#
# maxlen=5 caps every row at five tokens. truncating='pre' is the default and
# is written out here: sequences longer than maxlen lose their leading tokens.
sequences_post_trunc = tf.keras.utils.pad_sequences(
    sequences, maxlen=5, padding='pre', truncating='pre')

print('INPUT:')
# Plain loop rather than a list comprehension: printing is a side effect, so
# there is no reason to build (and discard) a list of None values.
for sequence in sequences:
  print(sequence.numpy())
print()

print('OUTPUT:')
print(sequences_post_trunc)

INPUT:
[6 3 2 4]
[ 6  3  2 10]
[5 3 2 4]
[ 9  5  7  2  4  8 11]

OUTPUT:
[[ 0  6  3  2  4]
 [ 0  6  3  2 10]
 [ 0  5  3  2  4]
 [ 7  2  4  8 11]]


2025-08-04 01:39:30.224342: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


using ragged tensors instead

In [11]:
# Recreate the layer with ragged=True so a batch of sentences comes back as a
# RaggedTensor instead of a zero-padded tensor.
# NOTE: this rebinds the global `vectorize_layer`; cells executed after this
# one use the ragged variant.
vectorize_layer = tf.keras.layers.TextVectorization(ragged=True)
vectorize_layer.adapt(sentences)

# Every row keeps the length of its own sentence.
ragged_sequences = vectorize_layer(sentences)
print(ragged_sequences)

<tf.RaggedTensor [[6, 3, 2, 4], [6, 3, 2, 10], [5, 3, 2, 4], [9, 5, 7, 2, 4, 8, 11]]>


In [12]:
# Convert the ragged rows to NumPy and pad them to a common length.
# padding='pre' is the default and is spelled out for clarity.
# Note that this reassigns `sequences_pre`.
sequences_pre = tf.keras.utils.pad_sequences(
    ragged_sequences.numpy(), padding='pre')
print(sequences_pre)

[[ 0  0  0  6  3  2  4]
 [ 0  0  0  6  3  2 10]
 [ 0  0  0  5  3  2  4]
 [ 9  5  7  2  4  8 11]]


Out-of-vocabulary (OOV) tokens: words not seen during `adapt` are mapped to index 1 (`[UNK]`)

In [13]:
# Sentences containing words ('really', 'loves', 'manatee') that are not in
# the vocabulary listed earlier in the notebook.
sentences_with_oov = [
    'i really love my dog',
    'my dog loves my manatee',
]

# Encode them with the current layer.
sequences_with_oov = vectorize_layer(sentences_with_oov)

# Show each sentence beside its encoded form.
for sentence, sequence in zip(sentences_with_oov, sequences_with_oov):
  print(f'{sentence} ---> {sequence}')

i really love my dog ---> [6 1 3 2 4]
my dog loves my manatee ---> [2 4 1 2 1]
