In [2]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
dataset = ['The cat sat on the mat.', 'The dog ate my homework.']
vocabulary = {}
for text in dataset:   
    text = standardize(text)
    tokens = tokenize(text)
    for token in tokens:
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)

def one_hot_encode_token(token):
    vector = np.zeros((len(vocabulary),))
    token_index = vocabulary[token]
    vector[token_index] = 1
    return vector

NameError: name 'standardize' is not defined

In [16]:

import string
class Vectorizer:
    
    def standardize(self, text):
        text = text.lower()
        return "".join(char for char in text
                       if char not in string.punctuation)
    
    def tokenize(self, text):
        text = self.standardize(text)
        return text.split()
    
    def make_vocabulary(self, dataset):
        self.vocabulary = {"": 0, "[UNK]": 1}
        for text in dataset:
            text = self.standardize(text)
            tokens = self.tokenize(text)
            for token in tokens:
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        self.inverse_vocabulary = dict((v, k) for k, v in self.vocabulary.items())
        
    def encode(self, text):
        text = self.standardize(text)
        tokens = self.tokenize(text)
        return [self.vocabulary.get(token, 1) for token in tokens]
    
    def decode(self, int_sequence):
        return " ".join(self.inverse_vocabulary.get(i, "[UNK]") for i in int_sequence)
    
vectorizer = Vectorizer()
dataset = ["I write, erase, rewrite",
            "Erase again, and then",
            "A poppy blooms.",]

vectorizer.make_vocabulary(dataset)

    

In [17]:
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)   

[2, 3, 5, 7, 1, 5, 6]


In [18]:
decoded_sentence = vectorizer.decode(encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


# Preparing text data with keras TextVectorization layer 

In [22]:
from tensorflow.keras.layers import TextVectorization
text_vectorization = TextVectorization(output_mode="int",)

In [23]:
text_vectorization

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x25067d0d5b0>

the default layer behavior is equivalent to the following:
-

In [24]:
import re
import string
import tensorflow as tf

def custom_standardization_fn(string_tensor):
    lowercase_string = tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(lowercase_string, f"[{re.escape(string.punctuation)}]", "")

def custom_split_fn(string_tensor):
    return tf.strings.split(string_tensor)

text_vectorization = TextVectorization(
 output_mode="int",
 standardize=custom_standardization_fn,
 split=custom_split_fn,)

#just call the adapt() method of the layer with a Dataset object that yields strings

dataset = [
 "I write, erase, rewrite",
 "Erase again, and then",
 "A poppy blooms.",]

text_vectorization.adapt(dataset)

Displaying the vocabulary
-

In [16]:
data = text_vectorization.get_vocabulary()
print(data)

['', '[UNK]', 'erase', 'write', 'then', 'rewrite', 'poppy', 'i', 'blooms', 'and', 'again', 'a']


In [26]:
data[5]

'rewrite'

encode and then decode an example sentence
-

In [33]:
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

inverse_vocab = dict(enumerate(vocabulary))
decoded_sentence = " ".join(inverse_vocab[int(i)] for i in encoded_sentence)
print(decoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)
i write rewrite and [UNK] rewrite again
