<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/Preprocessing/Text%20Preprocessing%20layers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

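## Working with preprocessing layers

This notebook demonstrates the Keras text preprocessing layers `TextVectorization` and `StringLookup`, and shows two ways to use a fitted preprocessing layer: in the `tf.data` input pipeline before the model, or inside the model itself for end-to-end inference on raw strings.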

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [4]:
# Toy corpus: three short strings, the last one carrying trailing punctuation.
data = [
    "How",
    "are",
    "you ...",
]

### Creating the text processing layer

In [5]:
# Step 1: build a TextVectorization layer with its default settings.
layer = layers.TextVectorization()

# Step 2: let the layer learn its vocabulary from the toy corpus.
layer.adapt(data)

# Step 3: encode the same corpus. Each string becomes a row of integer ids;
# for this corpus the printed result is a (3, 1) int64 tensor.
vectorized_text = layer(data)
print(vectorized_text)

tf.Tensor(
[[3]
 [4]
 [2]], shape=(3, 1), dtype=int64)


### Mapping strings to integer indices with a predefined vocabulary (`StringLookup`)

In [6]:
# Look up string tokens in a fixed, user-supplied vocabulary.
# The lookup layer and its input get their own names here so this cell does
# not overwrite `data` and `layer` from the TextVectorization example;
# otherwise re-running that earlier cell would silently use these objects.
vocab = ["a", "b", "c", "d"]
lookup_data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])

# No adapt() step is needed because the vocabulary is passed in directly.
string_lookup = layers.StringLookup(vocabulary=vocab)

# In the printed result the vocabulary entries map to 1..4 and "z", which is
# not in `vocab`, maps to 0.
vectorized_data = string_lookup(lookup_data)
print(vectorized_data)

tf.Tensor(
[[1 3 4]
 [4 0 2]], shape=(2, 3), dtype=int64)


### Applying the preprocessing layer to the dataset before training the model

In [11]:
# Corpus used only to fit the vectorizer's vocabulary.
adapt_data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)

# Fit an integer-output TextVectorization layer on that corpus.
text_vectorizer = layers.TextVectorization(output_mode="int")
text_vectorizer.adapt(adapt_data)

# Sanity check: encode one sentence that contains words absent from the corpus.
sample_encoding = text_vectorizer(["The Brain is deeper than the sea"])
print("Encoded text:\n", sample_encoding.numpy())

# Small model over variable-length int sequences: Embedding -> GRU -> Dense.
vocab_size = text_vectorizer.vocabulary_size()
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=vocab_size, output_dim=16)(inputs)
encoded = layers.GRU(8)(embedded)
outputs = layers.Dense(1)(encoded)
model = keras.Model(inputs=inputs, outputs=outputs)

# Labeled string dataset (which includes unknown tokens).
train_texts = ["The Brain is deeper than the sea", "for if they are held Blue to Blue"]
train_labels = [1, 0]
train_dataset = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))


def vectorize_batch(texts, labels):
    """Encode a batch of strings as int sequences; labels pass through unchanged."""
    return text_vectorizer(texts), labels


# Batch first, then vectorize, so the model receives int sequences.
train_dataset = train_dataset.batch(2).map(vectorize_batch)
print(train_dataset)

# Train the model on the int sequences.
print("\nTraining model...")
model.compile(optimizer="rmsprop", loss="mse")
model.fit(train_dataset)


Encoded text:
 [[ 2 19 14  1  9  2  1]]
<MapDataset shapes: ((None, None), (None,)), types: (tf.int64, tf.int32)>

Training model...


<keras.callbacks.History at 0x7f784dd9d4d0>

### Including the preprocessing layer inside the model (end-to-end inference on raw strings)

In [12]:
# For inference, you can export a model that accepts strings as input.
# The fitted vectorizer runs inside the graph and its int sequences are fed to
# the trained `model`. The wrapper is bound to its own name rather than
# overwriting `model`: rebinding `model` would make this cell non-idempotent,
# since a second run would pass int sequences to a model that expects strings.
inputs = keras.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
outputs = model(x)
end_to_end_model = keras.Model(inputs, outputs)

# Call the end-to-end model on test data (which includes unknown tokens)
print("\nCalling end-to-end model on test string...")
test_data = tf.constant(["The one the other will absorb"])
test_output = end_to_end_model(test_data)
print("Model output:", test_output)


Calling end-to-end model on test string...
Model output: tf.Tensor([[0.0418027]], shape=(1, 1), dtype=float32)
