<a href="https://colab.research.google.com/github/Scaglione-Nick/ECGR4106/blob/main/HW5_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

# Data Loading Function
def load_data_from_txt(file_path, encoding='ISO-8859-1'):
    """Extract sentence pairs from a text file containing ``("...", "...")`` tuples.

    The file is read as a single string and scanned with a regular
    expression. A pair is recognised only when its two double-quoted strings
    are separated by exactly a comma followed by one space, and neither
    string is empty or contains a double quote.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            'ISO-8859-1', the value that used to be hard-coded.

    Returns:
        A tuple ``(english_sentences, french_sentences)`` of two lists of
        equal length: the first and the second string of every matched pair,
        in the order the pairs appear in the file.
    """
    # Read the whole file in one call; the handle is closed before parsing.
    with open(file_path, 'r', encoding=encoding) as file:
        content = file.read()

    # Regular expression to find all ("first", "second") pairs
    pattern = r'\("([^"]+)", "([^"]+)"\)'
    matches = re.findall(pattern, content)

    english_sentences = [first for first, _ in matches]
    french_sentences = [second for _, second in matches]
    return english_sentences, french_sentences

# NOTE(review): load_data_from_txt returns (english_sentences, french_sentences),
# but the result is unpacked here in the opposite order, so `french_sentences`
# receives the first string of every pair and `english_sentences` the second.
# Presumably a deliberate swap to reverse the translation direction -- confirm,
# because every later `_en` / `_fr` name and printed label follows these names.
french_sentences, english_sentences = load_data_from_txt('E2F_dataset.txt')

In [None]:
# One vocabulary per side; words not seen during fitting map to the '<OOV>' entry
tokenizer_en = Tokenizer(oov_token='<OOV>')
tokenizer_en.fit_on_texts(english_sentences)

tokenizer_fr = Tokenizer(oov_token='<OOV>')
tokenizer_fr.fit_on_texts(french_sentences)

# Integer-encode every sentence with its own tokenizer
encoder_input = tokenizer_en.texts_to_sequences(english_sentences)
decoder_input = tokenizer_fr.texts_to_sequences(french_sentences)

# Pick ids for the <start>/<end> markers (the two ids after the current
# vocabulary unless the entries already exist), then register both in the
# decoder-side word_index so later lookups by name succeed.
fr_word_index = tokenizer_fr.word_index
start_token = fr_word_index.get('<start>', len(fr_word_index) + 1)
end_token = fr_word_index.get('<end>', len(fr_word_index) + 2)
fr_word_index['<start>'] = start_token
fr_word_index['<end>'] = end_token

# Wrap every decoder sequence in <start> ... <end>
decoder_input = [[start_token, *seq, end_token] for seq in decoder_input]

# Longest sequence on each side (decoder length includes the two markers)
max_encoder_len = max(map(len, encoder_input))
max_decoder_len = max(map(len, decoder_input))

In [None]:
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one encoder block: self-attention then a feed-forward network.

    Each sub-layer is followed by a residual addition and layer
    normalisation. No attention mask is passed to the attention layer.

    Args:
        inputs: Tensor the block attends over (used as both query and value).
        head_size: `key_dim` of the multi-head attention layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used after attention and inside the feed-forward
            network.

    Returns:
        A tensor whose last dimension equals that of `inputs`.
    """
    model_dim = inputs.shape[-1]

    # Self-attention sub-layer with residual connection
    attn_out = layers.MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(inputs, inputs)
    attn_out = layers.Dropout(dropout)(attn_out)
    attended = layers.LayerNormalization(epsilon=1e-6)(inputs + attn_out)

    # Position-wise feed-forward sub-layer, projected back to model_dim
    hidden = layers.Dense(ff_dim, activation='relu')(attended)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Dense(model_dim)(hidden)

    return layers.LayerNormalization(epsilon=1e-6)(attended + projected)

def transformer_decoder(inputs, encoder_output, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one decoder block: cross-attention then a feed-forward network.

    The decoder tensor is the query and `encoder_output` is the value. The
    block contains no self-attention over the decoder positions and passes no
    attention mask. Each sub-layer is followed by a residual addition and
    layer normalisation.

    Args:
        inputs: Decoder-side tensor (attention query).
        encoder_output: Encoder-side tensor (attention value).
        head_size: `key_dim` of the multi-head attention layer.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used after attention and inside the feed-forward
            network.

    Returns:
        A tensor whose last dimension equals that of `inputs`.
    """
    model_dim = inputs.shape[-1]

    # Cross-attention sub-layer with residual connection
    cross_out = layers.MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(inputs, encoder_output)
    cross_out = layers.Dropout(dropout)(cross_out)
    attended = layers.LayerNormalization(epsilon=1e-6)(inputs + cross_out)

    # Position-wise feed-forward sub-layer, projected back to model_dim
    hidden = layers.Dense(ff_dim, activation='relu')(attended)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Dense(model_dim)(hidden)

    return layers.LayerNormalization(epsilon=1e-6)(attended + projected)

def build_transformer(vocab_size_en, vocab_size_fr, max_len_encoder, max_len_decoder, head_size=128, num_heads=1, ff_dim=256, num_layers=1):
    """Assemble the encoder-decoder Keras model.

    Both token inputs are embedded into `head_size` dimensions (the same
    value is also used as the attention `key_dim`), passed through
    `num_layers` encoder / decoder blocks, and the decoder output is mapped
    to a softmax over `vocab_size_fr` classes at every position.

    Args:
        vocab_size_en: `input_dim` of the encoder embedding.
        vocab_size_fr: `input_dim` of the decoder embedding and size of the
            output softmax.
        max_len_encoder: Length of the encoder input sequence.
        max_len_decoder: Length of the decoder input sequence.
        head_size: Embedding width and attention `key_dim`.
        num_heads: Attention heads per block.
        ff_dim: Hidden width of each feed-forward network.
        num_layers: Number of stacked encoder blocks and of decoder blocks.

    Returns:
        An uncompiled `Model` taking `[encoder_tokens, decoder_tokens]`.
    """
    # Encoder branch: token ids -> embeddings -> stacked encoder blocks
    enc_tokens = Input(shape=(max_len_encoder,))
    enc_state = Embedding(input_dim=vocab_size_en, output_dim=head_size)(enc_tokens)
    for _ in range(num_layers):
        enc_state = transformer_encoder(enc_state, head_size, num_heads, ff_dim)

    # Decoder branch: token ids -> embeddings -> stacked decoder blocks,
    # every block attending to the final encoder state
    dec_tokens = Input(shape=(max_len_decoder,))
    dec_state = Embedding(input_dim=vocab_size_fr, output_dim=head_size)(dec_tokens)
    for _ in range(num_layers):
        dec_state = transformer_decoder(dec_state, enc_state, head_size, num_heads, ff_dim)

    # Per-position distribution over the decoder vocabulary
    token_probs = layers.Dense(vocab_size_fr, activation='softmax')(dec_state)

    return Model([enc_tokens, dec_tokens], token_probs)

# Vocabulary sizes: number of word_index entries plus one
vocab_size_en = 1 + len(tokenizer_en.word_index)
vocab_size_fr = 1 + len(tokenizer_fr.word_index)

# Baseline model: 1 layer, 2 heads (both vocabularies padded by 20 extra ids)
model = build_transformer(
    vocab_size_en + 20,
    vocab_size_fr + 20,
    max_encoder_len,
    max_decoder_len,
    head_size=128,
    num_heads=2,
    ff_dim=256,
    num_layers=1,
)

# Integer targets -> sparse categorical cross-entropy
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Pad the sequences with trailing zeros so that each side has a single fixed length
encoder_input_padded = pad_sequences(encoder_input, maxlen=max_encoder_len, padding='post')
decoder_input_padded = pad_sequences(decoder_input, maxlen=max_decoder_len, padding='post')

# Decoder targets: the decoder input shifted left by one position, i.e. the
# leading <start> id is dropped so position t is trained to predict token t+1.
decoder_output_data = [seq[1:] for seq in decoder_input]

# Pad the targets to the same length as the decoder input
decoder_output_padded = pad_sequences(decoder_output_data, maxlen=max_decoder_len, padding='post')

# Split all three arrays in ONE call so every row of X, decoder input and
# target is guaranteed to land in the same partition. (Two separate calls
# only stay aligned as long as they share random_state and row count.)
(
    X_train, X_val,
    decoder_input_train, decoder_input_val,
    y_train, y_val,
) = train_test_split(
    encoder_input_padded,
    decoder_input_padded,
    decoder_output_padded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the baseline model: encoder and decoder token ids in, shifted decoder ids as targets
history = model.fit(
    x=[X_train, decoder_input_train],
    y=y_train,
    validation_data=([X_val, decoder_input_val], y_val),
    batch_size=64,           # Adjust batch size as needed
    epochs=10,               # Set the number of epochs
)

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 447ms/step - accuracy: 0.0925 - loss: 5.0690 - val_accuracy: 0.4251 - val_loss: 3.6829
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.4419 - loss: 3.6621 - val_accuracy: 0.4058 - val_loss: 3.4510
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.3985 - loss: 3.3583 - val_accuracy: 0.4058 - val_loss: 3.3023
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - accuracy: 0.4196 - loss: 3.1552 - val_accuracy: 0.4493 - val_loss: 3.0962
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.4712 - loss: 2.8706 - val_accuracy: 0.4686 - val_loss: 2.9721
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.5004 - loss: 2.7250 - val_accuracy: 0.5266 - val_loss: 2.8263
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━

In [None]:
# Sweep grid of (num_layers, num_heads): every depth in {1, 2, 4} paired with
# 2 and 4 heads, ordered by depth first.
configs = [
    (depth, heads)
    for depth in (1, 2, 4)
    for heads in (2, 4)
]

# Function to build and train the model for a given configuration
def train_transformer_with_config(num_layers, num_heads, epochs=10, batch_size=64):
    """Build, compile and train a fresh transformer for one configuration.

    Reads the notebook-level training/validation arrays, vocabulary sizes and
    maximum sequence lengths; the trained model itself is not returned.

    Args:
        num_layers: Number of encoder blocks and of decoder blocks.
        num_heads: Attention heads per block.
        epochs: Number of training epochs (defaults to 10).
        batch_size: Mini-batch size (defaults to 64).

    Returns:
        The history object returned by `model.fit`.
    """
    print(f"Training model with {num_layers} layers and {num_heads} heads...")

    # Build the model with the given configuration
    model = build_transformer(
        vocab_size_en+2,
        vocab_size_fr+2,
        max_encoder_len,
        max_decoder_len,
        num_heads=num_heads,
        num_layers=num_layers,
        head_size=128,
        ff_dim=256,
    )

    # Compile the model
    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model and hand the history back for analysis
    return model.fit(
        [X_train, decoder_input_train],
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=([X_val, decoder_input_val], y_val)
    )

# Train one model per configuration, keyed by a "<layers>_layers_<heads>_heads" label
history_results = {}
for num_layers, num_heads in configs:
    run_label = f"{num_layers}_layers_{num_heads}_heads"
    history_results[run_label] = train_transformer_with_config(num_layers, num_heads)


Training model with 1 layers and 2 heads...
Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 465ms/step - accuracy: 0.0981 - loss: 5.6757 - val_accuracy: 0.4058 - val_loss: 3.6669
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.4026 - loss: 3.6326 - val_accuracy: 0.4155 - val_loss: 3.4007
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - accuracy: 0.4106 - loss: 3.3720 - val_accuracy: 0.4058 - val_loss: 3.3248
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 0.4054 - loss: 3.1726 - val_accuracy: 0.4493 - val_loss: 3.1936
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.4592 - loss: 2.9930 - val_accuracy: 0.4348 - val_loss: 2.9943
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 92ms/step - accuracy: 0.4643 - loss: 2.7501 - val_accuracy: 0.4493 - val_loss: 2.9031

In [None]:
# Rebuild the 1-layer / 2-head model from scratch for a longer run
model = build_transformer(
    vocab_size_en+2,
    vocab_size_fr+2,
    max_encoder_len,
    max_decoder_len,
    head_size=128,
    num_heads=2,
    ff_dim=256,
    num_layers=1,
)

# Compile the model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

# Train the model
history = model.fit(
    x=[X_train, decoder_input_train],
    y=y_train,
    validation_data=([X_val, decoder_input_val], y_val),
    batch_size=64,           # Adjust batch size as needed
    epochs=50,               # Set the number of epochs
)

Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 475ms/step - accuracy: 0.0822 - loss: 5.2050 - val_accuracy: 0.4058 - val_loss: 3.6505
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - accuracy: 0.4003 - loss: 3.6739 - val_accuracy: 0.4058 - val_loss: 3.3868
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - accuracy: 0.4014 - loss: 3.3417 - val_accuracy: 0.4106 - val_loss: 3.2113
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.4178 - loss: 3.1306 - val_accuracy: 0.4106 - val_loss: 3.0767
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.4320 - loss: 2.8878 - val_accuracy: 0.5411 - val_loss: 2.9155
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.5249 - loss: 2.7029 - val_accuracy: 0.4686 - val_loss: 2.8499
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━

In [None]:
def decode_sequence(input_seq, tokenizer_en, tokenizer_fr, model, max_decoder_len):
    """
    Greedily decode one encoded source sentence into target-side text.

    Starting from the <start> token, the model is called once per step on the
    tokens generated so far; the highest-scoring token at the last position
    is appended and fed back, until <end> is predicted or `max_decoder_len`
    steps have run.

    Args:
        input_seq: 1-D array of source token ids (a batch axis is added here).
        tokenizer_en: Unused; kept so existing callers keep working.
        tokenizer_fr: Target-side tokenizer providing `word_index` (must
            contain '<end>') and `index_word`.
        model: Object whose `predict([encoder_ids, decoder_ids], verbose=0)`
            returns per-position scores over target token ids -- assumed to be
            shaped (1, steps, vocab), as implied by the `[0, -1, :]` indexing.
        max_decoder_len: Maximum number of decoding steps.

    Returns:
        The decoded words joined by single spaces. Predicted ids with no
        entry in `index_word` (for example padding id 0) contribute no word.

    Raises:
        KeyError: If '<end>' is missing from `tokenizer_fr.word_index`.
    """
    # Add the batch axis expected by the model
    encoder_input_seq = np.expand_dims(input_seq, axis=0)

    # Seed the decoder with <start>, falling back to <OOV> if it is not registered
    start_token = tokenizer_fr.word_index.get('<start>', tokenizer_fr.word_index.get('<OOV>'))
    # Loop-invariant: look the stop id up once instead of on every step
    end_token = tokenizer_fr.word_index['<end>']
    decoder_input_seq = np.array([[start_token]])

    # Store the decoded sequence (target-side words)
    decoded_sentence = []

    for _ in range(max_decoder_len):
        # verbose=0: one predict call per generated token would otherwise
        # print a progress bar for every step.
        preds = model.predict([encoder_input_seq, decoder_input_seq], verbose=0)

        # Greedy choice: highest-scoring id at the most recent position
        predicted_token = int(np.argmax(preds[0, -1, :]))

        # If we reach the <end> token, stop generating
        if predicted_token == end_token:
            break

        # Ids without a vocabulary entry are still fed back to the model but
        # add no word, so the joined sentence never contains empty tokens.
        predicted_word = tokenizer_fr.index_word.get(predicted_token, "")
        if predicted_word:
            decoded_sentence.append(predicted_word)

        # Extend the decoder input with the token just predicted
        decoder_input_seq = np.concatenate([decoder_input_seq, np.array([[predicted_token]])], axis=-1)

    return ' '.join(decoded_sentence)

# Select a few test sentences from the validation set (the first 5 rows).
# X_val / y_val come from a shuffled split, so their rows do NOT line up with
# the first rows of the original sentence lists. The displayed input and
# reference are therefore rebuilt from the validation token ids themselves.
def _ids_to_text(token_ids, tokenizer):
    """Join the vocabulary words for `token_ids`, skipping ids with no entry
    in `tokenizer.index_word` (such as padding or manually added markers)."""
    words = (tokenizer.index_word.get(int(token_id)) for token_id in token_ids)
    return ' '.join(word for word in words if word)

test_sentences = X_val[:5]
true_translations = [_ids_to_text(target_ids, tokenizer_fr) for target_ids in y_val[:5]]

# Decode each input sentence and compare it with the true translation
for i, sentence in enumerate(test_sentences):
    print(f"Input (English): {_ids_to_text(sentence, tokenizer_en)}")
    predicted_translation = decode_sequence(sentence, tokenizer_en, tokenizer_fr, model, max_decoder_len)
    print(f"Predicted Translation (French): {predicted_translation}")
    print(f"True Translation (French): {true_translations[i]}")
    print("-" * 50)

Input (English): J'ai froid
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Predicted Translation (French): we greet our neighbors
True Translation (French): I am cold
--------------------------------------------------
Input (English): Tu es fatigué
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Predicted Translation (French): we climb the world
True Translation (French)