# Week 6 Assignment

### Dataset Loading and Preprocessing
This section loads the English-Finnish dataset, preprocesses it by appending start and end tokens to Finnish sentences, and shuffles the data. The data is split into training and validation sets, with 20% used for validation.


In [1]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import numpy as np
import string

# Load and preprocess the dataset
text_file = "fin-eng/fin.txt"

with open(text_file, encoding='utf-8') as f:
    lines = f.read().split("\n")[:-1]

text_pairs = []
for line in lines:
    english, finnish, _ = line.split("\t")
    finnish = "[start] " + finnish + " [end]"
    text_pairs.append((english, finnish))

# Shuffle and split into train and validation sets
np.random.shuffle(text_pairs)
num_val_samples = int(len(text_pairs) * 0.2)
train_pairs = text_pairs[:-num_val_samples]
val_pairs = text_pairs[-num_val_samples:]


### Text Vectorization
The text vectorization layers tokenize and convert English and Finnish sentences into integer sequences, with a vocabulary limit of 20,000 tokens and a fixed sequence length of 20. These layers are adapted to the training data.


In [2]:
# Text vectorization setup
strip_chars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\t\n"

# Ensure sequence lengths match
sequence_length = 20

# Define source vectorization layer
source_vectorization = TextVectorization(
    max_tokens=20000,
    output_sequence_length=sequence_length,
    standardize=lambda text: tf.strings.regex_replace(text, f"[{strip_chars}]", "")
)

# Define target vectorization layer
target_vectorization = TextVectorization(
    max_tokens=20000,
    output_sequence_length=sequence_length,
    standardize=lambda text: tf.strings.regex_replace(text, f"[{strip_chars}]", "")
)



### Dataset Preparation
This section defines functions to format the dataset into input-output pairs for the Transformer model. It creates TensorFlow datasets for training and validation with batching and prefetching for optimized training.


In [3]:
# Data pipeline
batch_size = 64

def format_dataset(eng, fin):
    eng = source_vectorization(eng)
    fin = target_vectorization(fin)
    return {"encoder_inputs": eng, "decoder_inputs": fin[:, :-1]}, fin[:, 1:]

def make_dataset(pairs):
    eng_texts, fin_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(fin_texts)))
    dataset = dataset.shuffle(2048).batch(batch_size).map(format_dataset).prefetch(16)
    return dataset

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)


### Transformer Encoder and Decoder
This section implements the Transformer Encoder and Decoder using multi-head attention, dense projection layers, and residual connections with layer normalization. The Decoder includes a causal attention mask for autoregressive generation.


In [4]:
from tensorflow.keras.layers import Layer, LayerNormalization, Dense

class TransformerEncoder(Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([
            Dense(dense_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()

    def call(self, inputs, mask=None):
        attention_output = self.attention(query=inputs, value=inputs, key=inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

class TransformerDecoder(Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.attention_1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([
            Dense(dense_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.layernorm_3 = LayerNormalization()

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        attention_output_1 = self.attention_1(query=inputs, value=inputs, key=inputs, attention_mask=causal_mask)
        proj_input = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(query=proj_input, value=encoder_outputs, key=encoder_outputs, attention_mask=mask)
        proj_input = self.layernorm_2(proj_input + attention_output_2)

        proj_output = self.dense_proj(proj_input)
        return self.layernorm_3(proj_input + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        seq_len = input_shape[1]
        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return mask


### Model Building
This section constructs the Transformer model by stacking the Encoder and Decoder, using embeddings for both input and output sequences, and applying a dense softmax layer for final predictions.


In [5]:
embed_dim = 256
dense_dim = 512
num_heads = 8

encoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = tf.keras.layers.Embedding(20000, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
x = tf.keras.layers.Embedding(20000, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
outputs = Dense(20000, activation="softmax")(x)

model = tf.keras.Model([encoder_inputs, decoder_inputs], outputs)





### Training
The model is compiled with the RMSprop optimizer and trained for 10 epochs using the training and validation datasets.


In [10]:
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(train_ds, epochs=10, validation_data=val_ds)


Epoch 1/10


FailedPreconditionError: Graph execution error:

Detected at node text_vectorization_1/None_Lookup/LookupTableFindV2 defined at (most recent call last):
<stack traces unavailable>
Error in user-defined function passed to MapDataset:3 transformation with iterator: Iterator::Root::Prefetch::Map: Table not initialized.
	 [[{{node text_vectorization_1/None_Lookup/LookupTableFindV2}}]]
	 [[IteratorGetNext]] [Op:__inference_one_step_on_iterator_25740]

### Translation Testing
This section defines a function to test sentence translation. It generates translations for example sentences using the trained Transformer model.


In [None]:
def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(20):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = model.predict({"encoder_inputs": tokenized_input_sentence, "decoder_inputs": tokenized_target_sentence})
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = target_vectorization.get_vocabulary()[sampled_token_index]
        if sampled_token == "[end]":
            break
        decoded_sentence += " " + sampled_token
    return decoded_sentence

# Examples
examples = ["I love programming.", "How are you?", "This is a test."]
for example in examples:
    print(f"{example} -> {decode_sequence(example)}")


FailedPreconditionError: Exception encountered when calling TextVectorization.call().

[1m{{function_node __wrapped__LookupTableFindV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Table not initialized. [Op:LookupTableFindV2] name: [0m

Arguments received by TextVectorization.call():
  • inputs=["'I love programming.'"]