In [1]:
import re 
import torch 
import math
import copy
from torch import nn
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import torch.nn.functional as F
import torch.multiprocessing as mp

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader , Dataset
mp.set_start_method('spawn', force=True)

import nltk
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Loading data

In [2]:
def load_data():
    
    """Loads data from specified paths into separate lists for train, dev, and test sets."""
    base_path="/kaggle/input/datasets2"
    
    path_dict = {
        "train_en": f"{base_path}/train.en",
        "train_fr": f"{base_path}/train.fr",
        "dev_en": f"{base_path}/dev.en",
        "dev_fr": f"{base_path}/dev.fr",
        "test_en": f"{base_path}/test.en",
        "test_fr": f"{base_path}/test.fr"
    }

    data = {
        "train_en": [],
        "train_fr": [],
        "dev_en": [],
        "dev_fr": [],
        "test_en": [],
        "test_fr": []
    }

    for key, path in path_dict.items():
        with open(path, "r", encoding="utf-8") as f:
            data[key] = [line.strip() for line in f.readlines()]

    return data

data = load_data()

train_en = data["train_en"]
train_fr = data["train_fr"]
dev_en = data["dev_en"]
dev_fr = data["dev_fr"]
test_en = data["test_en"]
test_fr = data["test_fr"]

# Print the first three English training sentences with a break after each sentence
print("English training sentences:")
for sentence in train_en[:3]:
    print(sentence)  # Print each sentence on a new line
print()  # Extra newline for spacing

# Print the first three French training sentences with a break after each sentence
print("French training sentences:")
for sentence in train_fr[:3]:
    print(sentence)  # Print each sentence on a new line
print()  # Extra newline for spacing

English training sentences:
David Gallo: This is Bill Lange. I'm Dave Gallo.
And we're going to tell you some stories from the sea here in video.
We've got some of the most incredible video of Titanic that's ever been seen, and we're not going to show you any of it.

French training sentences:
David Gallo: Voici Bill Lange. Je suis Dave Gallo.
Nous allons vous raconter quelques histoires de la mer en vidéo.
Nous avons des vidéos du Titanic parmi les plus spectaculaires jamais vues. et nous n'allons pas vous en montrer une image.



## Preprocessing

In [3]:
def clean_and_preprocess(sentence):
    sentence = sentence.lower()
    
    # remove punctuation except spaces
    punctuation_pattern = r"[!\"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]"
    sentence = re.sub(punctuation_pattern, " ", sentence)
    
    # Replace multiple spaces with a single space
    sentence = re.sub(r" +", " ", sentence).strip()
    return sentence

def preprocess_corpus(sentences):
    """Preprocess a list of sentences."""
    preprocessed_sentences = []
    
    for sentence in sentences:
        preprocessed_sentence = clean_and_preprocess(sentence)
        preprocessed_sentences.append(preprocessed_sentence)
    
    return preprocessed_sentences

train_en_processed = preprocess_corpus(train_en)  
train_fr_processed = preprocess_corpus(train_fr)  
dev_en_processed = preprocess_corpus(dev_en)      
dev_fr_processed = preprocess_corpus(dev_fr) 
test_en_processed = preprocess_corpus(test_en)
test_fr_processed = preprocess_corpus(test_fr)

train_en_processed[0]

'david gallo this is bill lange i m dave gallo'

## Building vocab

In [4]:
def build_vocab(sentences, min_freq=1):
    """Build a vocabulary from the list of preprocessed sentences."""

    # Special tokens
    SPECIAL_TOKENS = ["<s>", "</s>", "<unk>", "<pad>"]
    
    # Count word frequencies
    word_counter = Counter()
    
    for sentence in sentences:
        word_counter.update(sentence.split())  # Split sentence into words
    
    vocab = [word for word, freq in word_counter.items() if freq >= min_freq]
    
    # Add special tokens at the beginning of the vocab list
    vocab = SPECIAL_TOKENS + vocab
    
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for word, idx in word2idx.items()}
    
    return vocab, word2idx, idx2word

en_vocab, en_word2idx, en_idx2word = build_vocab(train_en_processed)
fr_vocab, fr_word2idx, fr_idx2word = build_vocab(train_fr_processed)

print(f"English Vocab Size: {len(en_vocab)}")
print(f"French Vocab Size: {len(fr_vocab)}")

print(f"Sample English Vocab: {en_vocab[:10]}")
print(f"Sample English word2idx: {list(en_word2idx.items())[:10]}")
print(f"Sample English idx2word: {list(en_idx2word.items())[:10]}")

English Vocab Size: 21644
French Vocab Size: 29456
Sample English Vocab: ['<s>', '</s>', '<unk>', '<pad>', 'david', 'gallo', 'this', 'is', 'bill', 'lange']
Sample English word2idx: [('<s>', 0), ('</s>', 1), ('<unk>', 2), ('<pad>', 3), ('david', 4), ('gallo', 5), ('this', 6), ('is', 7), ('bill', 8), ('lange', 9)]
Sample English idx2word: [(0, '<s>'), (1, '</s>'), (2, '<unk>'), (3, '<pad>'), (4, 'david'), (5, 'gallo'), (6, 'this'), (7, 'is'), (8, 'bill'), (9, 'lange')]


## Tokenization

In [5]:
PAD_IDX = en_word2idx["<pad>"]
UNK_IDX = en_word2idx["<unk>"]
START_IDX = en_word2idx["<s>"]
END_IDX = en_word2idx["</s>"]

def tokenize_sentence(sentence, word2idx):
    """Tokenizes a sentence and converts it to a list of word indices."""
    
    tokens = sentence.split()  # Split sentence into words
    tokenized_sentence = [word2idx.get(word, UNK_IDX) for word in tokens]  # Convert to indices
    
    # Add <s> at the beginning and </s> at the end of the sentence
    tokenized_sentence = [START_IDX] + tokenized_sentence + [END_IDX]
    
    return tokenized_sentence

def tokenize_corpus(sentences, word2idx):
    """Tokenize a list of sentences."""
    
    tokenized_corpus = [tokenize_sentence(sentence, word2idx) for sentence in sentences]
    return tokenized_corpus

# Tokenizing English and French datasets
train_en_tokenized = tokenize_corpus(train_en_processed, en_word2idx)
train_fr_tokenized = tokenize_corpus(train_fr_processed, fr_word2idx)
dev_en_tokenized = tokenize_corpus(dev_en_processed, en_word2idx)
dev_fr_tokenized = tokenize_corpus(dev_fr_processed, fr_word2idx)
test_en_tokenized = tokenize_corpus(test_en_processed, en_word2idx)
test_fr_tokenized = tokenize_corpus(test_fr_processed, fr_word2idx)

# Convert tokenized sentences to tensors and pad them
def pad_sentences(tokenized_sentences, max_length=50):
    """Pads tokenized sentences to the same length and truncates them to max_length."""
    
    tensor_sentences = [torch.tensor(sentence[:max_length]) for sentence in tokenized_sentences]
    padded_sentences = pad_sequence(tensor_sentences, batch_first=True, padding_value=PAD_IDX)
    
    return padded_sentences


train_en_padded = pad_sentences(train_en_tokenized)
train_fr_padded = pad_sentences(train_fr_tokenized)
dev_en_padded = pad_sentences(dev_en_tokenized)
dev_fr_padded = pad_sentences(dev_fr_tokenized)
test_en_padded = pad_sentences(test_en_tokenized)
test_fr_padded = pad_sentences(test_fr_tokenized)

# Display shapes to verify padding
print(f"Train EN padded shape: {train_en_padded.shape}")
print(f"Train FR padded shape: {train_fr_padded.shape}")
train_en_padded[0]

Train EN padded shape: torch.Size([30000, 50])
Train FR padded shape: torch.Size([30000, 50])


tensor([ 0,  4,  5,  6,  7,  8,  9, 10, 11, 12,  5,  1,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3])

In [6]:
train_en_tokenized[1]

[0, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 1]

## Creating Dataloader

In [7]:
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    def __init__(self, src_data, tgt_data):
        """
        src_data: tokenized and padded source sentences (e.g., English)
        tgt_data: tokenized and padded target sentences (e.g., French)
        """
        self.src_data = src_data
        self.tgt_data = tgt_data

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        src_sentence = self.src_data[idx]
        tgt_sentence = self.tgt_data[idx]
        return src_sentence, tgt_sentence
    
train_dataset = TranslationDataset(train_en_padded, train_fr_padded)
dev_dataset = TranslationDataset(dev_en_padded, dev_fr_padded)
test_dataset = TranslationDataset(test_en_padded, test_fr_padded)

from torch.utils.data import DataLoader

BATCH_SIZE = 32

# Create DataLoaders for each dataset
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

for src_batch, tgt_batch in train_loader:
    print(f"Source batch shape: {src_batch.shape}")
    print(f"Target batch shape: {tgt_batch.shape}")
    break  # Just checking the first batch

Source batch shape: torch.Size([32, 50])
Target batch shape: torch.Size([32, 50])


## Multi Head Attention

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "model dimension must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.Q_weight = nn.Linear(d_model, d_model)
        self.K_weight = nn.Linear(d_model, d_model)
        self.V_weight = nn.Linear(d_model, d_model)
        self.Out_weight = nn.Linear(d_model, d_model)

    def split_heads(self, x):
        batch_size, seq_len, d_model = x.size()
        return x.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_len, d_k = x.size()
        return x.transpose(1, 2).reshape(batch_size, seq_len, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.Q_weight(Q))
        K = self.split_heads(self.K_weight(K))
        V = self.split_heads(self.V_weight(V))
        
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        attn_probs = torch.softmax(attn_scores, dim=-1)
        attn_output = torch.matmul(attn_probs, V)
        output = self.Out_weight(self.combine_heads(attn_output))
        return output

## Feed Forward

In [9]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

## Positional encoding

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        p_encoding = torch.zeros(max_seq_length, d_model, device=device)
        position = torch.arange(0, max_seq_length, dtype=torch.float, device=device).unsqueeze(1)
        division = torch.pow(10_000, (-torch.arange(0, d_model, 2, device=device).float() / d_model))

        p_encoding[:, 0::2] = torch.sin(position * division)
        p_encoding[:, 1::2] = torch.cos(position * division)

        return self.register_buffer('pe', p_encoding.unsqueeze(0))


    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

## Encoder

In [11]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attention_output = self.self_attn(x, x, x, mask)
        x = x + self.dropout(attention_output)
        x = self.norm1(x)
        ff_output = self.feed_forward(x)
        x = x + self.dropout(ff_output)
        x = self.norm2(x)
        return x

## Decoder

In [12]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)


    def forward(self, x, enc_output, src_mask, tgt_mask):
        attention_output = self.self_attn(x, x, x, tgt_mask)
        x = x + self.dropout(attention_output)
        x = self.norm1(x)
        attention_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = x + self.dropout(attention_output)
        x = self.norm2(x)
        ff_output = self.feed_forward(x)
        x = x + self.dropout(ff_output)
        x = self.norm3(x)
        return x

## Transformer Model

In [13]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=device), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [17]:
en_vocab_size = len(en_vocab)
fr_vocab_size = len(fr_vocab)
d_model = 300
num_heads = 6
num_layers = 6
d_ff = 300
max_seq_length = 50
dropout = 0.1
num_epochs = 10  

criterion = nn.CrossEntropyLoss(ignore_index=3)
model = Transformer(en_vocab_size, fr_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)

## Training the model

In [18]:
def train_model(model, train_loader, dev_loader, criterion, optimizer, scheduler, num_epochs):
    val_losses = []
    train_losses = []
    for epoch in range(num_epochs):
    
        model.train()
        train_loss = 0
        for data in train_loader:
            src_data, tgt_data = data
            src_data = src_data.to(device)
            tgt_data = tgt_data.to(device)
            optimizer.zero_grad()
            output = model(src_data, tgt_data[:, :-1])
            loss = criterion(output.contiguous().view(-1, fr_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)  # Average training loss for the epoch
        print(f"Epoch: {epoch+1}, Training Loss: {train_loss}")
        train_losses.append(train_loss)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data in dev_loader:
                src_data, tgt_data = data
                src_data = src_data.to(device)
                tgt_data = tgt_data.to(device)
                output = model(src_data, tgt_data[:, :-1])
                loss = criterion(output.contiguous().view(-1, fr_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
                val_loss += loss.item()

        val_loss /= len(dev_loader)  # Average validation loss for the epoch
        print(f"Epoch: {epoch+1}, Validation Loss: {val_loss}")
        val_losses.append(val_loss)
        scheduler.step(val_loss)
        # Save model state
        torch.save(model.state_dict(), f'./transformer_state_dict_epoch_{epoch+1}')
    return train_losses, val_losses

In [20]:
train_losses, val_losses = train_model(model, train_loader, dev_loader, criterion, optimizer, scheduler, num_epochs = 15)

Epoch: 1, Training Loss: 4.81405693483251
Epoch: 1, Validation Loss: 4.3422971452985495
Epoch: 2, Training Loss: 3.785772310899519
Epoch: 2, Validation Loss: 3.730265634400504
Epoch: 3, Training Loss: 3.2715079484463754
Epoch: 3, Validation Loss: 3.3661989058767046
Epoch: 4, Training Loss: 2.9031395530904027
Epoch: 4, Validation Loss: 3.0990612592015947
Epoch: 5, Training Loss: 2.6002680060705905
Epoch: 5, Validation Loss: 2.8785335336412703
Epoch: 6, Training Loss: 2.35000250919033
Epoch: 6, Validation Loss: 2.6937010969434465
Epoch: 7, Training Loss: 2.1501640305081917
Epoch: 7, Validation Loss: 2.6085760380540575
Epoch: 8, Training Loss: 1.9842537837242014
Epoch: 8, Validation Loss: 2.593611168009894
Epoch: 9, Training Loss: 1.8384778738530205
Epoch: 9, Validation Loss: 2.524822869471141
Epoch: 10, Training Loss: 1.7354237241531485
Epoch: 10, Validation Loss: 2.567470831530435
Epoch: 11, Training Loss: 1.6692728111738844
Epoch: 11, Validation Loss: 2.533692342894418
Epoch: 12, Train

In [23]:
val_losses

[4.3422971452985495,
 3.730265634400504,
 3.3661989058767046,
 3.0990612592015947,
 2.8785335336412703,
 2.6937010969434465,
 2.6085760380540575,
 2.593611168009894,
 2.524822869471141,
 2.567470831530435,
 2.533692342894418,
 2.5234274779047285,
 2.504726299217769,
 2.482797018119267,
 2.4892984415803636]

## Bleu score calculation

In [25]:
def calculate_bleu(pred, truth):
    pred = pred.cpu().detach().numpy()
    truth = truth.cpu().detach().numpy()
    
    # Flatten both arrays
    pred = pred.flatten()
    truth = truth.flatten()

    # Convert to list of integers
    preds = [int(item) for item in pred]
    truths = [int(item) for item in truth]
    
    # Filter tokens: remove <pad> (3) in truth corresponding to <sos> (1) in pred
    filtered_truths = []
    filtered_preds = []
    
    for t, p in zip(truths, preds):
        if not (t == 3 and p == 1):  # Check if truth is <pad> and pred is </s>
            filtered_truths.append(t)
            filtered_preds.append(p)
    
    # Calculate BLEU score with filtered tokens
    chencherry = SmoothingFunction()
    bleu_score = sentence_bleu([filtered_truths], filtered_preds, smoothing_function=chencherry.method1)
    
    return bleu_score

## Model evaluation

In [34]:
def evaluate_model(model, test_loader, en_idx2word, fr_idx2word, bleu_file='testbleu.txt', predictions_file='predictions.txt'):
    model.eval()
    model.to(device)  # Ensure model is on the correct device
    
    bleu_scores = []
    sum_bleu_scores = 0
    total_batches = len(test_loader)
    
    # Open the BLEU score output file for writing
    with open(bleu_file, 'w') as bleu_f, open(predictions_file, 'w') as pred_f:
        with torch.no_grad():  # No need to compute gradients
            for i, (src, tgt) in enumerate(test_loader):
                src = src.to(device)
                tgt = tgt.to(device)
                
                # Get model output, ignoring the last token in target (tgt[:, :-1])
                output = model(src, tgt[:, :-1])
                
                # Calculate BLEU score between model's predicted sequence and ground truth
                predicted_seq = output.argmax(dim=-1)  # Convert logits to predicted token ids
                bleu_score = calculate_bleu(predicted_seq, tgt[:, 1:])  # tgt[:, 1:] skips the <sos> token in target
                
                bleu_scores.append(bleu_score)
                sum_bleu_scores += bleu_score
                
                # Write batch number and BLEU score to the BLEU file
                bleu_f.write(f'Batch No: {i + 1}, BLEU score: {bleu_score:.4f}\n')
                
                # Convert source and target sentences back to text
                src_sentences = [" ".join([en_idx2word[idx.item()] for idx in sentence if idx.item() not in [START_IDX, UNK_IDX, PAD_IDX, END_IDX]])
                                 for sentence in src]
                tgt_sentences = [" ".join([fr_idx2word[idx.item()] for idx in sentence if idx.item() not in [START_IDX, UNK_IDX, PAD_IDX, END_IDX]])
                                 for sentence in tgt]
                predicted_sentences = [" ".join([fr_idx2word[idx.item()] for idx in predicted_seq[j] if idx.item() not in [START_IDX, UNK_IDX, PAD_IDX, END_IDX]])
                                       for j in range(len(predicted_seq))]
                
                # Write source, target, and predicted sentences to the predictions file
                for j in range(len(src_sentences)):
                    pred_f.write(f'Source: {src_sentences[j]}\n')
                    pred_f.write(f'Target: {tgt_sentences[j]}\n')
                    pred_f.write(f'Predicted: {predicted_sentences[j]}\n\n')

                # Print the sentences and BLEU score for the first 10 batches
                print(f'Batch No: {i + 1}, BLEU score: {bleu_score:.4f}')
        
        # Calculate average BLEU score over all batches
        average_bleu_score = sum_bleu_scores / total_batches if total_batches > 0 else 0
        print(f'Average BLEU score over {total_batches} sentences: {average_bleu_score:.4f}')
    
    return average_bleu_score  # Return the average BLEU score

In [35]:
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
evaluate_model(model, test_loader, en_idx2word, fr_idx2word)

Batch No: 1, BLEU score: 0.1214
Batch No: 2, BLEU score: 0.0648
Batch No: 3, BLEU score: 0.1980
Batch No: 4, BLEU score: 0.1895
Batch No: 5, BLEU score: 0.7293
Batch No: 6, BLEU score: 0.1087
Batch No: 7, BLEU score: 0.4141
Batch No: 8, BLEU score: 0.1530
Batch No: 9, BLEU score: 0.2750
Batch No: 10, BLEU score: 0.2834
Batch No: 11, BLEU score: 0.1639
Batch No: 12, BLEU score: 0.3355
Batch No: 13, BLEU score: 0.0497
Batch No: 14, BLEU score: 0.2727
Batch No: 15, BLEU score: 0.0105
Batch No: 16, BLEU score: 0.2427
Batch No: 17, BLEU score: 0.2251
Batch No: 18, BLEU score: 0.6274
Batch No: 19, BLEU score: 0.4367
Batch No: 20, BLEU score: 0.5782
Batch No: 21, BLEU score: 0.3783
Batch No: 22, BLEU score: 0.1385
Batch No: 23, BLEU score: 0.3065
Batch No: 24, BLEU score: 0.5774
Batch No: 25, BLEU score: 0.2945
Batch No: 26, BLEU score: 0.3030
Batch No: 27, BLEU score: 0.5969
Batch No: 28, BLEU score: 0.0503
Batch No: 29, BLEU score: 0.2398
Batch No: 30, BLEU score: 0.2487
Batch No: 31, BLEU 

0.27989168142147236

## Hyperparameter Tuning

In [None]:
import csv

embedding_dim = [300, 300, 512]
num_heads = [6, 10, 16]
num_layers = [6, 2, 4]
dropout = [0.1, 0.2]
num_epochs = 10

bleu_values = []

with open('model_results.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    writer.writerow(['Embedding Dim', 'Num Heads', 'Num Layers', 'Dropout', 'BLEU Score'])
    for k in range(0, 3):
        for drop in dropout:
            embed_dim = embedding_dim[k]
            num_head = num_heads[k]
            num_layer = num_layers[k]
            
            model = Transformer(en_vocab_size, fr_vocab_size, embed_dim, num_head, num_layer, embed_dim, max_seq_length, drop).to(device)
            criterion = nn.CrossEntropyLoss(ignore_index=3)
            optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)
            
            train_model(model, train_loader, dev_loader, criterion, optimizer, scheduler, num_epochs)
        
            bleu_score = evaluate_model(model, test_loader, en_idx2word, fr_idx2word)
            bleu_values.append(bleu_score)
        
            writer.writerow([embed_dim, num_head, num_layer, drop, bleu_score])
