#### **➡️Tokenization**

In [9]:
import tensorflow as tf
from tensorflow.keras import layers, models , preprocessing

#### **➡️Custom Text Data**

In [2]:
# Toy corpus: six short sentences that the vectorization layer is fitted on.
sentences = [
    "I love my dog",
    "I love my cat",
    "I hate my dog",
    "I love programming",
    "I hate programming",
    "I love my cat and dog",
]

#### **➡️Create Vocabulary**

In [3]:
# Create a TextVectorization layer (constructed with no arguments) and fit
# its vocabulary on the toy corpus.
vect_layer = layers.TextVectorization()
vect_layer.adapt(sentences)

# Index -> token table learned by adapt(). In the printed output, index 0 is
# the empty string and index 1 is "[UNK]".
vocab = vect_layer.get_vocabulary()

for i in range(len(vocab)):
    word = vocab[i]
    print(f"{i}: {word}")

0: 
1: [UNK]
2: i
3: my
4: love
5: dog
6: programming
7: hate
8: cat
9: and


In [4]:
# Total number of entries in the learned vocabulary list.
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

Vocabulary size: 10


#### **➡️Text to Sequence**

In [7]:
# Wrap the raw strings in a tf.data pipeline, then run every element through
# the fitted vectorization layer to obtain its integer token ids.
sentence_dataset = tf.data.Dataset.from_tensor_slices(sentences)
sequence = sentence_dataset.map(vect_layer)

# Walk both datasets in lockstep so each sentence is shown beside its ids.
paired = zip(sentence_dataset, sequence)
for sentence, seq in paired:
    print(f'{sentence}----> {seq.numpy()}')

b'I love my dog'----> [2 4 3 5]
b'I love my cat'----> [2 4 3 8]
b'I hate my dog'----> [2 7 3 5]
b'I love programming'----> [2 4 6]
b'I hate programming'----> [2 7 6]
b'I love my cat and dog'----> [2 4 3 8 9 5]


#### **➡️Padding**

In [11]:
# Post-padding: materialise the variable-length id tensors as NumPy arrays,
# then let pad_sequences add zeros AFTER each sequence so all rows share the
# length of the longest one.
sequences_post = preprocessing.sequence.pad_sequences(
    [seq.numpy() for seq in sequence],
    padding='post',
)
sequences_post

array([[2, 4, 3, 5, 0, 0],
       [2, 4, 3, 8, 0, 0],
       [2, 7, 3, 5, 0, 0],
       [2, 4, 6, 0, 0, 0],
       [2, 7, 6, 0, 0, 0],
       [2, 4, 3, 8, 9, 5]], dtype=int32)

In [12]:
# Pre-padding: materialise the variable-length id tensors as NumPy arrays,
# then let pad_sequences add zeros BEFORE each sequence so all rows share the
# length of the longest one.
sequences_pre = preprocessing.sequence.pad_sequences(
    [seq.numpy() for seq in sequence],
    padding='pre',
)
sequences_pre

array([[0, 0, 2, 4, 3, 5],
       [0, 0, 2, 4, 3, 8],
       [0, 0, 2, 7, 3, 5],
       [0, 0, 0, 2, 4, 6],
       [0, 0, 0, 2, 7, 6],
       [2, 4, 3, 8, 9, 5]], dtype=int32)

#### **➡️Test on New Data**

In [15]:
# Held-out sentences: some words appear in the training corpus, others do
# not (those show up as id 1, the "[UNK]" slot, in the output).
test_data = [
    "I love my dog and cat",
    "I love python programming and tensorflow",
    "I love NLP and computer vision",
    "I want to be a data scientist",
]

# Calling the layer directly on a batch of strings returns a single id
# tensor; shorter rows are zero-filled on the right in the output.
test_seq = vect_layer(test_data)
test_seq

<tf.Tensor: shape=(4, 7), dtype=int64, numpy=
array([[2, 4, 3, 5, 9, 8, 0],
       [2, 4, 1, 6, 9, 1, 0],
       [2, 4, 1, 9, 1, 1, 0],
       [2, 1, 1, 1, 1, 1, 1]])>