# Week 6 Assignment

### I tried a couple of ways to do this, but this was the only way to get it to work

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

# Load the dataset: one sentence pair per line, tab-separated as
# "<english>\t<finnish>[\t<extra fields such as attribution>]".
text_file = "fin-eng/fin.txt"

with open(text_file, encoding='utf-8') as f:
    # Filter out blank lines rather than blindly dropping the last element, so
    # the final sentence is kept even when the file has no trailing newline.
    lines = [line for line in f.read().split("\n") if line.strip()]

# Limit the data to a smaller subset (the first 1000 lines) for quick testing
text_pairs = []
for line in lines[:1000]:
    fields = line.split("\t")
    if len(fields) < 2:
        # No translation on this line; skip it instead of crashing on unpack.
        continue
    # Only the first two columns are used; any further columns are ignored.
    english, finnish = fields[0], fields[1]
    # Wrap the target sentence in explicit start/end markers for the decoder.
    finnish = "[start] " + finnish + " [end]"
    text_pairs.append((english, finnish))

# Split into English and Finnish sentences
english_sentences = [pair[0] for pair in text_pairs]
finnish_sentences = [pair[1] for pair in text_pairs]

# Build one word-level vocabulary per language and convert every sentence into
# a list of integer word indices.
# NOTE(review): the default Tokenizer filters appear to strip "[" and "]", so
# the Finnish markers would be stored as the plain words "start" / "end" —
# confirm against the Keras Tokenizer documentation.
english_tokenizer = Tokenizer()
finnish_tokenizer = Tokenizer()

english_tokenizer.fit_on_texts(english_sentences)
finnish_tokenizer.fit_on_texts(finnish_sentences)

# Vocabulary size = number of distinct words + 1 (extra slot for padding)
english_vocab_size = 1 + len(english_tokenizer.word_index)
finnish_vocab_size = 1 + len(finnish_tokenizer.word_index)

# Integer-encode the sentences with the fitted vocabularies
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
finnish_sequences = finnish_tokenizer.texts_to_sequences(finnish_sentences)

# Longest tokenized sentence per language. Used only to report truncation
# below; `default=0` keeps this safe when the dataset is empty.
longest_english = max((len(seq) for seq in english_sequences), default=0)
longest_finnish = max((len(seq) for seq in finnish_sequences), default=0)

# Fixed sequence length fed to the model (a cutoff, not the measured maximum).
# Both languages use the same value.
max_english_length = 68
max_finnish_length = 68

if longest_english > max_english_length or longest_finnish > max_finnish_length:
    print(
        "Warning: some sentences exceed the length cutoff and will be truncated "
        f"(longest English: {longest_english}, longest Finnish: {longest_finnish})"
    )

# Pad the sequences with trailing zeros. Truncate at the end as well
# (`truncating='post'`): the default 'pre' would cut tokens from the front of
# an over-long sentence, removing the leading start marker of Finnish targets.
english_sequences = pad_sequences(
    english_sequences, maxlen=max_english_length, padding='post', truncating='post'
)
finnish_sequences = pad_sequences(
    finnish_sequences, maxlen=max_finnish_length, padding='post', truncating='post'
)

# Prepare the model components (Positional Encoding, Transformer layers)
def get_positional_encoding(seq_len, d_model):
    """Return a sinusoidal positional-encoding table as a float32 tensor.

    Even columns hold ``sin(position * frequency)`` and odd columns hold
    ``cos(position * frequency)``, where the frequencies decay geometrically
    from 1 towards 1/10000 across the embedding dimension.

    Args:
        seq_len: Number of positions (rows) in the table.
        d_model: Embedding width (columns). May be even or odd.

    Returns:
        A ``tf.float32`` tensor of shape ``(seq_len, d_model)``.
    """
    position = np.arange(seq_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    # One angle per (position, frequency); shape (seq_len, ceil(d_model / 2)).
    angles = position * div_term

    pos_enc = np.zeros((seq_len, d_model))
    pos_enc[:, 0::2] = np.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns, so
    # trim the angles to match; for an even d_model the slice is a no-op.
    pos_enc[:, 1::2] = np.cos(angles[:, : d_model // 2])
    return tf.cast(pos_enc, dtype=tf.float32)

# Transformer Encoder Layer
def transformer_encoder(inputs, d_model, num_heads, ff_dim, dropout=0.1):
    """Apply one encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is added back onto its own input and then
    layer-normalized.

    Args:
        inputs: Symbolic tensor holding the embedded source sequence.
        d_model: Width of the model; also used as ``key_dim`` of every head.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout rate applied inside both sub-layers.

    Returns:
        The encoded sequence tensor.
    """
    # Sub-layer 1: the sequence attends to itself
    self_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
    attended = self_attention(inputs, inputs)
    attended = layers.Dropout(dropout)(attended)
    normed = layers.LayerNormalization(epsilon=1e-6)(inputs + attended)

    # Sub-layer 2: position-wise feed-forward network
    hidden = layers.Dense(ff_dim, activation='relu')(normed)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Dense(d_model)(hidden)

    return layers.LayerNormalization(epsilon=1e-6)(normed + projected)

# Transformer Decoder Layer
def transformer_decoder(encoder_output, decoder_input, d_model, num_heads, ff_dim, dropout=0.1):
    """Apply one decoder block: masked self-attention, cross-attention, feed-forward.

    Each sub-layer is added back onto its own input and then layer-normalized.

    Args:
        encoder_output: Symbolic tensor produced by the encoder.
        decoder_input: Symbolic tensor holding the embedded target sequence.
        d_model: Width of the model; also used as ``key_dim`` of every head.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout rate applied inside every sub-layer.

    Returns:
        The decoded sequence tensor.
    """
    # Self-attention over the target sequence. The causal mask stops position t
    # from attending to positions after t; without it the decoder can read the
    # very tokens it is trained to predict.
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(
        decoder_input, decoder_input, use_causal_mask=True
    )
    attention = layers.Dropout(dropout)(attention)
    attention = layers.LayerNormalization(epsilon=1e-6)(decoder_input + attention)

    # Cross-attention: target positions (query) attend to the encoder output.
    attention2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(attention, encoder_output)
    attention2 = layers.Dropout(dropout)(attention2)
    attention2 = layers.LayerNormalization(epsilon=1e-6)(attention + attention2)

    # Position-wise feed-forward network
    ff = layers.Dense(ff_dim, activation='relu')(attention2)
    ff = layers.Dropout(dropout)(ff)
    ff = layers.Dense(d_model)(ff)

    return layers.LayerNormalization(epsilon=1e-6)(attention2 + ff)

# Build the Transformer model
def build_transformer_model(english_vocab_size, finnish_vocab_size, d_model=128, num_heads=8, ff_dim=512, max_english_length=20, max_finnish_length=20):
    """Build a single-layer encoder-decoder Transformer for English -> Finnish.

    Args:
        english_vocab_size: Size of the source vocabulary (embedding rows).
        finnish_vocab_size: Size of the target vocabulary; also the width of
            the softmax output.
        d_model: Embedding / model width.
        num_heads: Number of attention heads in every attention layer.
        ff_dim: Hidden width of the feed-forward sub-layers.
        max_english_length: Fixed length of the source token sequences.
        max_finnish_length: Fixed length of the target token sequences.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[english_tokens,
        finnish_tokens]`` and producing a probability distribution over the
        Finnish vocabulary for every target position.
    """
    english_input = layers.Input(shape=(max_english_length,))
    finnish_input = layers.Input(shape=(max_finnish_length,))

    english_embedding = layers.Embedding(english_vocab_size, d_model)(english_input)
    finnish_embedding = layers.Embedding(finnish_vocab_size, d_model)(finnish_input)

    # Each side gets a positional table sized for its own sequence length, so
    # the source and target lengths are allowed to differ.
    english_embedding += get_positional_encoding(max_english_length, d_model)
    finnish_embedding += get_positional_encoding(max_finnish_length, d_model)

    encoder_output = transformer_encoder(english_embedding, d_model, num_heads, ff_dim)
    decoder_output = transformer_decoder(encoder_output, finnish_embedding, d_model, num_heads, ff_dim)

    # Per-position distribution over the Finnish vocabulary
    output = layers.Dense(finnish_vocab_size, activation='softmax')(decoder_output)

    model = tf.keras.Model(inputs=[english_input, finnish_input], outputs=output)
    return model

# Instantiate the network with the vocabulary sizes and sequence lengths
# derived from the data above.
model = build_transformer_model(
    english_vocab_size,
    finnish_vocab_size,
    max_english_length=max_english_length,
    max_finnish_length=max_finnish_length,
)

# Targets are integer word indices, hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Show the layer/parameter overview as a sanity check
model.summary()

# Training data preparation: the decoder receives the full Finnish sequence and
# is trained to output the same sequence shifted left by one token.
english_input_data = english_sequences
finnish_input_data = finnish_sequences  # Full-length sequences (last token is kept)

# Targets: drop the first token, then pad back to the model's sequence length
# so inputs and targets have the same number of positions.
shifted_targets = finnish_sequences[:, 1:]
finnish_output_data = pad_sequences(shifted_targets, maxlen=max_finnish_length, padding='post')

# Train for 3 epochs; the targets get a trailing axis of size 1.
training_inputs = [english_input_data, finnish_input_data]
training_targets = finnish_output_data[..., np.newaxis]
model.fit(training_inputs, training_targets, batch_size=32, epochs=3, validation_split=0.2)



Epoch 1/3




25/25 ━━━━━━━━━━━━━━━━━━━━ 7s 134ms/step - accuracy: 0.8144 - loss: 1.9860 - val_accuracy: 0.9552 - val_loss: 0.4127
Epoch 2/3
25/25 ━━━━━━━━━━━━━━━━━━━━ 3s 121ms/step - accuracy: 0.9564 - loss: 0.3885 - val_accuracy: 0.9552 - val_loss: 0.4105
Epoch 3/3
25/25 ━━━━━━━━━━━━━━━━━━━━ 3s 121ms/step - accuracy: 0.9564 - loss: 0.3806 - val_accuracy: 0.9552 - val_loss: 0.4109


<keras.src.callbacks.history.History at 0x29197a6f410>