<a href="https://colab.research.google.com/github/WambuiMunene/Tensorflow-Notebooks/blob/main/TF2_0_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

In [3]:
from tensorflow.keras.layers import TextVectorization


In [4]:
# Toy corpus: three short sentences of different lengths (5, 5 and 3 words),
# which is enough to show vocabulary building and padding.
sentences = [
    "I like eggs and ham",
    "I love chocolate and bunnies",
    "I hate onions",
]


In [21]:
# Cap on the vocabulary size; passed as `max_tokens` to every TextVectorization layer below.
MAX_VOCAB_SIZE = 20_000


In [22]:
# Text -> integer-id layer. Only the vocabulary cap is set explicitly; the
# remaining arguments are left at their defaults, listed here for reference:
#   standardize="lower_and_strip_punctuation"
#   split="whitespace"
#   ngrams=None
#   output_mode="int"
vectorization_layer = TextVectorization(max_tokens=MAX_VOCAB_SIZE)

In [23]:
# "Fit" step: adapt() builds the layer's vocabulary from the example sentences.
vectorization_layer.adapt(sentences)

In [24]:
# "Predict" step: calling the adapted layer maps each sentence to a row of
# integer token ids. Rows shorter than the longest sentence are filled with 0
# at the end, so the result is a rectangular tensor.
sequences = vectorization_layer(sentences)
print(sequences)

tf.Tensor(
[[ 2  6  9  3  8]
 [ 2  5 10  3 11]
 [ 2  7  4  0  0]], shape=(3, 5), dtype=int64)


In [25]:
# The learned vocabulary as a list: a token's position in the list is its
# integer id. Position 0 is the empty string (the id used for padding) and
# position 1 is the '[UNK]' token.
vectorization_layer.get_vocabulary()

['',
 '[UNK]',
 np.str_('i'),
 np.str_('and'),
 np.str_('onions'),
 np.str_('love'),
 np.str_('like'),
 np.str_('hate'),
 np.str_('ham'),
 np.str_('eggs'),
 np.str_('chocolate'),
 np.str_('bunnies')]

In [26]:
# How do we get the word-to-index mapping?
# A token's position in the vocabulary list is its integer id, so pairing each
# token with its position inverts the list into a lookup table.
vocabulary = vectorization_layer.get_vocabulary()
word2idx = {token: index for index, token in enumerate(vocabulary)}
print(word2idx)

{'': 0, '[UNK]': 1, np.str_('i'): 2, np.str_('and'): 3, np.str_('onions'): 4, np.str_('love'): 5, np.str_('like'): 6, np.str_('hate'): 7, np.str_('ham'): 8, np.str_('eggs'): 9, np.str_('chocolate'): 10, np.str_('bunnies'): 11}


In [28]:
# Truncation: force every output row to a fixed length of 3 token ids,
# so the two 5-word sentences lose their last two tokens.
vectorization_layer_truncated = TextVectorization(
    output_sequence_length=3,
    max_tokens=MAX_VOCAB_SIZE,
)

# fit: learn the vocabulary from the corpus
vectorization_layer_truncated.adapt(sentences)

# vectorize: the result has shape (number of sentences, 3)
sequences_truncated = vectorization_layer_truncated(sentences)
print(sequences_truncated)


tf.Tensor(
[[ 2  6  9]
 [ 2  5 10]
 [ 2  7  4]], shape=(3, 3), dtype=int64)


In [29]:
# Ragged output (TF backend only): no padding is added, so each row keeps
# exactly as many token ids as its sentence has words.
vectorization_layer_ragged = TextVectorization(
    ragged=True,
    max_tokens=MAX_VOCAB_SIZE,
)

# fit: learn the vocabulary from the corpus
vectorization_layer_ragged.adapt(sentences)

# predict: returns a tf.RaggedTensor with rows of differing lengths
sequences_ragged = vectorization_layer_ragged(sentences)
print(sequences_ragged)

<tf.RaggedTensor [[2, 6, 9, 3, 8], [2, 5, 10, 3, 11], [2, 7, 4]]>


In [32]:
# Convert the ragged tensor into plain nested Python lists (one list per sentence).
sequences_ragged.to_list()

[[2, 6, 9, 3, 8], [2, 5, 10, 3, 11], [2, 7, 4]]

In [33]:
# Pad at the front instead of the back.
# Front-padding is not supported by the TextVectorization layer itself (its
# dense output above has the zeros at the end), so pad the rows with
# pad_sequences instead.
from tensorflow.keras.utils import pad_sequences

# Start from the unpadded rows. The dense `sequences` tensor is already
# rectangular, so passing it to pad_sequences would leave nothing to pad and
# the `padding` argument would have no visible effect.
token_rows = sequences_ragged.to_list()

# padding='post' appends the zeros, reproducing TextVectorization's dense output
print(pad_sequences(token_rows, padding='post'))

# defaults:
# tf.keras.utils.pad_sequences(
#     sequences,
#     maxlen=None,
#     dtype='int32',
#     padding='pre',
#     truncating='pre',
#     value=0.0
# )

# the default padding='pre' puts the zeros in front of the shorter rows
padded = pad_sequences(token_rows)
print(padded)

[[ 2  6  9  3  8]
 [ 2  5 10  3 11]
 [ 2  7  4  0  0]]
[[ 2  6  9  3  8]
 [ 2  5 10  3 11]
 [ 0  0  2  7  4]]
