# Vectorizations

This notebook will be used to perform some vectorization experiments - and to practice my coding skills :)

### Table of Contents

- Standalone vectorization
    - independent, uses pure python
    - standardizes into lowercase and removes punctuation
    - splits by space
    - builds the vocabulary from multiple input strings
    - returns integer sequences
- TextVectorization Layer
    - Faster, more efficient, and integrated into TensorFlow's data pipeline and Keras models
    - You can give custom behavior (though the default is standardize=lowercase, remove punctuation; tokenize=whitespace)
        - Remember that it uses tf.strings!
    - its `adapt` method indexes the vocabulary of a list of sentences; call `get_vocabulary()` to retrieve the vocabulary

In [13]:
# imports
import string
import re
import tensorflow as tf
from tensorflow import keras
from keras import layers

### Standalone Vectorizer

In [4]:
# standalone vectorizer
class StandaloneVectorizer:
    """Pure-Python text vectorizer that maps words to integer indices.

    Text is lowercased, stripped of the characters in ``string.punctuation``
    and split on whitespace. Index 0 is reserved for the empty-string token
    and index 1 for words that are not in the vocabulary.
    """

    # Reserved vocabulary entries (same tokens/indices the vocabulary has always used).
    EMPTY_TOKEN = ""
    UNK_TOKEN = "[UNK]"
    EMPTY_INDEX = 0
    UNK_INDEX = 1

    # Translation table that deletes every punctuation character; built once
    # instead of testing membership character by character on each call.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self, dictionary: list[str]):
        """Initialize the vocabulary and inverse vocabulary from ``dictionary``.

        Args:
            dictionary: sentences whose words make up the vocabulary.
        """
        self.vocab: dict[str, int] = {}
        self.inverse_vocab: dict[int, str] = {}
        self.make_vocabulary(dictionary)

    @staticmethod
    def standardize(text: str) -> str:
        """Standardize the text: convert it to lowercase and remove punctuation."""
        return text.lower().translate(StandaloneVectorizer._PUNCTUATION_TABLE)

    @staticmethod
    def tokenize(text: str) -> list[str]:
        """Tokenize the text: standardize it, then split it on whitespace.

        Returns:
            The list of word tokens (strings); empty for blank input.
        """
        return StandaloneVectorizer.standardize(text).split()

    def make_vocabulary(self, dictionary: list[str]) -> None:
        """Rebuild the vocabulary from the sentences in ``dictionary``.

        Each new word receives the next free index, in order of first
        appearance. Any previously built vocabulary is discarded.

        Args:
            dictionary: sentences to index. A single bare string is treated
                as one sentence.
        """
        if isinstance(dictionary, str):
            # Iterating a bare string would yield single characters and
            # silently produce a per-character vocabulary.
            dictionary = [dictionary]

        vocab = {
            self.EMPTY_TOKEN: self.EMPTY_INDEX,
            self.UNK_TOKEN: self.UNK_INDEX,
        }
        for sentence in dictionary:
            for token in StandaloneVectorizer.tokenize(sentence):
                if token not in vocab:
                    # len(vocab) is always the next unused index
                    vocab[token] = len(vocab)

        self.vocab = vocab
        # inverse_vocab takes an integer index and returns the word
        self.inverse_vocab = {index: token for token, index in vocab.items()}

    def encode(self, text: str) -> list[int]:
        """Encode the text into an integer sequence.

        Words missing from the vocabulary map to ``UNK_INDEX``.
        """
        tokens = StandaloneVectorizer.tokenize(text)
        return [self.vocab.get(token, self.UNK_INDEX) for token in tokens]

    def decode(self, int_sequence: list[int]) -> str:
        """Decode an integer sequence into its space-joined word form.

        Indices missing from the inverse vocabulary decode to ``UNK_TOKEN``.
        """
        return " ".join(
            self.inverse_vocab.get(index, self.UNK_TOKEN) for index in int_sequence
        )

# Small corpus used to build the vocabulary; `dataset` is reused by the Keras cell further down.
dataset = ["I write, erase, rewrite", "Erase again, and then", "A poppy blooms."]
vectorizer = StandaloneVectorizer(dataset)
print(f"Vocabulary: {vectorizer.vocab}")

# "still" never appears in `dataset`, so it should encode to the [UNK] index (1).
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(f"Encoded sentence {encoded_sentence}")
# Decoding goes through the inverse vocabulary, so the result is the
# standardized (lowercase, punctuation-free) form of the sentence.
decoded_sentence = vectorizer.decode(encoded_sentence)
print(f"Decoded sentence: {decoded_sentence}")

Vocabulary: {'': 0, '[UNK]': 1, 'i': 2, 'write': 3, 'erase': 4, 'rewrite': 5, 'again': 6, 'and': 7, 'then': 8, 'a': 9, 'poppy': 10, 'blooms': 11}
Encoded sentence [2, 3, 5, 7, 1, 5, 6]
Decoded sentence: i write rewrite and [UNK] rewrite again


### TextVectorization Layer, built into Keras

In [14]:
# default text vectorization: no arguments, so the layer's built-in standardization and split are used
default_text = layers.TextVectorization()

# custom standardization and split functions
def custom_standardization(string_tensor: tf.Tensor) -> tf.Tensor:
    """Lowercase a string tensor and delete punctuation from it.

    Args:
        string_tensor: tensor of strings to standardize.

    Returns:
        The lowercased input with every character in ``string.punctuation``
        removed, as produced by ``tf.strings.regex_replace``.
    """
    lowercase_string = tf.strings.lower(string_tensor)
    # Character class matching any single punctuation mark; re.escape keeps
    # characters such as "]" and "^" from being read as character-class syntax.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    return tf.strings.regex_replace(lowercase_string, punctuation_pattern, "")

def custom_split(string_tensor: tf.Tensor) -> "tf.Tensor | tf.RaggedTensor":
    """Split the strings in a tensor into tokens.

    Args:
        string_tensor: tensor of strings to split.

    Returns:
        The result of ``tf.strings.split`` with its default separator
        (whitespace, per the TensorFlow documentation). That function is
        documented to return a ragged tensor for non-scalar input, hence the
        union annotation.
    """
    return tf.strings.split(string_tensor)

# create a custom text vectorizer that emits integer sequences and uses the
# custom_standardization / custom_split callables in place of the defaults
custom_text = layers.TextVectorization(output_mode="int", standardize=custom_standardization, split=custom_split)

# build each layer's vocabulary from the same corpus
default_text.adapt(dataset)
custom_text.adapt(dataset)

# print the learned vocabulary, then encode the same sentence with both layers
# so their integer outputs can be compared
print(default_text.get_vocabulary())
print(default_text(test_sentence))
print(custom_text(test_sentence))

['', '[UNK]', 'erase', 'write', 'then', 'rewrite', 'poppy', 'i', 'blooms', 'and', 'again', 'a']
tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)
tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)
