# problem 1

In [1]:
text = """“Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text.

At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model.

One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks.

Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time.

Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.

In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology.”"""

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import math
from torch.utils.data import Dataset, DataLoader, random_split
from collections import defaultdict

class CharDataset(Dataset):
    def __init__(self, text, seq_length):
        self.chars = sorted(list(set(text)))
        self.char2idx = {c: i for i, c in enumerate(self.chars)}
        self.idx2char = {i: c for i, c in enumerate(self.chars)}
        self.encoded = [self.char2idx[c] for c in text]
        self.seq_length = seq_length

        self.samples = [
            (self.encoded[i:i+seq_length], self.encoded[i+1:i+seq_length+1])
            for i in range(len(self.encoded) - seq_length)
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return torch.tensor(self.samples[idx][0]), torch.tensor(self.samples[idx][1])

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, nhead=4, num_layers=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoder = PositionalEncoding(embed_dim)
        encoder_layers = nn.TransformerEncoderLayer(embed_dim, nhead, dim_feedforward=256)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        x = self.pos_encoder(x)
        x = self.transformer(x)
        return self.fc(x)

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        return self.fc(x)
class RNNAttentionModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.encoder = nn.Embedding(vocab_size, embed_dim)
        self.encoder_rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.decoder_rnn = nn.RNN(embed_dim + hidden_dim, hidden_dim, batch_first=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, teacher_forcing_ratio=0.5):
        batch_size, seq_len = x.size()

        # Encoder
        enc_emb = self.encoder(x)
        enc_out, enc_hidden = self.encoder_rnn(enc_emb)

        # Decoder with attention
        dec_input = x[:, 0].unsqueeze(1)
        outputs = []

        for t in range(seq_len):
            dec_emb = self.encoder(dec_input)

            # Attention mechanism (fixed)
            expanded_dec = dec_emb.expand(-1, seq_len, -1)
            attention_input = torch.cat((enc_out, expanded_dec), dim=-1)
            attention_scores = self.attention(attention_input).squeeze(-1)
            attn_weights = F.softmax(attention_scores, dim=1)
            context = torch.bmm(attn_weights.unsqueeze(1), enc_out)

            # Decoder step
            dec_out, dec_hidden = self.decoder_rnn(
                torch.cat([dec_emb, context], dim=-1)
            )
            output = self.fc(dec_out.squeeze(1))
            outputs.append(output.unsqueeze(1))

            # Teacher forcing
            if self.training and torch.rand(1) < teacher_forcing_ratio:
                dec_input = x[:, t].unsqueeze(1)
            else:
                dec_input = output.argmax(-1).unsqueeze(1)

        return torch.cat(outputs, dim=1)

def train_model(model, train_loader, val_loader, epochs=10, lr=0.001, device='cpu'):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

    model.to(device)
    history = defaultdict(list)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        start_time = time.time()

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()

            outputs = model(inputs)
            if isinstance(model, RNNAttentionModel):
                outputs = outputs[:, :targets.size(1)]

            loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss, val_acc = evaluate(model, val_loader, device)
        scheduler.step(val_loss)

        history['train_loss'].append(train_loss/len(train_loader))
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['time'].append(time.time() - start_time)

        print(f"Epoch {epoch+1}/{epochs}: "
              f"Train Loss: {history['train_loss'][-1]:.4f}, "
              f"Val Loss: {val_loss:.4f}, "
              f"Val Acc: {val_acc:.4f}, "
              f"Time: {history['time'][-1]:.2f}s")

    return history

def evaluate(model, loader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            if isinstance(model, RNNAttentionModel):
                outputs = outputs[:, :targets.size(1)]

            loss = F.cross_entropy(outputs.view(-1, outputs.size(-1)), targets.view(-1))
            total_loss += loss.item()

            preds = outputs.argmax(-1)
            correct += (preds == targets).sum().item()
            total += targets.numel()

    return total_loss/len(loader), correct/total

def compute_complexity(model):
    params = sum(p.numel() for p in model.parameters())
    return {
        'params': params,
    }


def run_experiment(text, seq_lengths=[10, 20, 30], epochs=15, device='cpu'):
    results = []

    for seq_len in seq_lengths:
        print(f"\n=== Sequence Length: {seq_len} ===")
        dataset = CharDataset(text, seq_len)
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        train_set, val_set = random_split(dataset, [train_size, val_size])

        train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=32)

        vocab_size = len(dataset.chars)

        models = {
            'Transformer': TransformerModel(vocab_size),
            'RNN': RNNModel(vocab_size),
            'RNN+Attention': RNNAttentionModel(vocab_size)
        }

        for name, model in models.items():
            print(f"\nTraining {name}...")
            history = train_model(model, train_loader, val_loader, epochs, device=device)

            complexity = compute_complexity(model)

            results.append({
                'model': name,
                'seq_len': seq_len,
                'best_train_loss': min(history['train_loss']),
                'best_val_acc': max(history['val_acc']),
                'avg_time_per_epoch': sum(history['time'])/epochs,
                **complexity
            })

    return results

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    text = """This is an example text for character-level language modeling.
            We'll compare Transformer vs RNN architectures with different sequence lengths."""

    results = run_experiment(text, device=device)

    print("\n=== Final Results ===")
    for r in results:
        print(f"{r['model']} (seq_len={r['seq_len']}): "
              f"Train Loss={r['best_train_loss']:.4f}, "
              f"Val Acc={r['best_val_acc']:.4f}, "
              f"Time/Epoch={r['avg_time_per_epoch']:.2f}s, "
              f"Params={r['params']:,}, ")


=== Sequence Length: 10 ===

Training Transformer...
Epoch 1/15: Train Loss: 3.2291, Val Loss: 2.8027, Val Acc: 0.1586, Time: 0.15s
Epoch 2/15: Train Loss: 2.5647, Val Loss: 2.4216, Val Acc: 0.2759, Time: 0.10s
Epoch 3/15: Train Loss: 2.2705, Val Loss: 2.2262, Val Acc: 0.3379, Time: 0.03s
Epoch 4/15: Train Loss: 2.0413, Val Loss: 2.0470, Val Acc: 0.3379, Time: 0.03s
Epoch 5/15: Train Loss: 1.9100, Val Loss: 1.9097, Val Acc: 0.3862, Time: 0.03s
Epoch 6/15: Train Loss: 1.8138, Val Loss: 1.8330, Val Acc: 0.3517, Time: 0.03s
Epoch 7/15: Train Loss: 1.7346, Val Loss: 1.7914, Val Acc: 0.3379, Time: 0.03s
Epoch 8/15: Train Loss: 1.7017, Val Loss: 1.7298, Val Acc: 0.3517, Time: 0.03s
Epoch 9/15: Train Loss: 1.6765, Val Loss: 1.7037, Val Acc: 0.3483, Time: 0.03s
Epoch 10/15: Train Loss: 1.6469, Val Loss: 1.6646, Val Acc: 0.3621, Time: 0.03s
Epoch 11/15: Train Loss: 1.6238, Val Loss: 1.6649, Val Acc: 0.3690, Time: 0.03s
Epoch 12/15: Train Loss: 1.6145, Val Loss: 1.6625, Val Acc: 0.3448, Time: 0

# Problem 2: transformer model, for the tiny Shakespeare dataset

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import math
from torch.utils.data import Dataset, DataLoader, random_split
from collections import defaultdict
import pandas as pd

def train_model(model, train_loader, val_loader, epochs=10, lr=0.001, device='cpu'):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

    model.to(device)
    history = defaultdict(list)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        start_time = time.time()

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss, val_acc = evaluate(model, val_loader, device)
        scheduler.step(val_loss)

        history['train_loss'].append(train_loss/len(train_loader))
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['time'].append(time.time() - start_time)

        print(f"Epoch {epoch+1}/{epochs}: "
              f"Train Loss: {history['train_loss'][-1]:.4f}, "
              f"Val Loss: {val_loss:.4f}, "
              f"Val Acc: {val_acc:.4f}, "
              f"Time: {history['time'][-1]:.2f}s")

    return history

def evaluate(model, loader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            loss = F.cross_entropy(outputs.view(-1, outputs.size(-1)), targets.view(-1))
            total_loss += loss.item()

            preds = outputs.argmax(-1)
            correct += (preds == targets).sum().item()
            total += targets.numel()

    return total_loss/len(loader), correct/total

def run_transformer_experiments(text, seq_lengths=[20, 30, 50],
                              layer_configs=[1, 2, 4],
                              head_configs=[2, 4],
                              epochs=15, device='cpu'):
    results = []

    for seq_len in seq_lengths:
        print(f"\n=== Sequence Length: {seq_len} ===")
        dataset = CharDataset(text, seq_len)
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        train_set, val_set = random_split(dataset, [train_size, val_size])

        train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=32)

        vocab_size = len(dataset.chars)

        for num_layers in layer_configs:
            for nhead in head_configs:
                if nhead > num_layers * 2:
                    continue

                print(f"\nTraining Transformer with {num_layers} layers and {nhead} heads...")
                model = TransformerModel(vocab_size, nhead=nhead, num_layers=num_layers)

                history = train_model(model, train_loader, val_loader, epochs, device=device)

                complexity = compute_complexity(model)

                results.append({
                    'model': 'Transformer',
                    'seq_len': seq_len,
                    'num_layers': num_layers,
                    'num_heads': nhead,
                    'best_train_loss': min(history['train_loss']),
                    'best_val_loss': min(history['val_loss']),
                    'best_val_acc': max(history['val_acc']),
                    'total_training_time': sum(history['time']),
                    'avg_time_per_epoch': sum(history['time'])/epochs,
                    **complexity
                })

    return pd.DataFrame(results)

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    results_df = run_transformer_experiments(text, device=device)

    results_df.to_csv('transformer_experiment_results.csv', index=False)
    print("\n=== Experiment Results ===")
    print(results_df)

    print("\n=== Summary of Results ===")
    print("Best validation accuracy per sequence length:")
    print(results_df.groupby('seq_len')['best_val_acc'].max())

    print("\nTraining time comparison:")
    print(results_df.groupby(['seq_len', 'num_layers', 'num_heads'])['total_training_time'].mean())


=== Sequence Length: 20 ===

Training Transformer with 1 layers and 2 heads...




Epoch 1/15: Train Loss: 2.6019, Val Loss: 2.2996, Val Acc: 0.2796, Time: 1.34s
Epoch 2/15: Train Loss: 2.2803, Val Loss: 2.2318, Val Acc: 0.2866, Time: 0.36s
Epoch 3/15: Train Loss: 2.2397, Val Loss: 2.2182, Val Acc: 0.2769, Time: 0.35s
Epoch 4/15: Train Loss: 2.2214, Val Loss: 2.2036, Val Acc: 0.2862, Time: 0.40s
Epoch 5/15: Train Loss: 2.2183, Val Loss: 2.1953, Val Acc: 0.2738, Time: 0.36s
Epoch 6/15: Train Loss: 2.2108, Val Loss: 2.1896, Val Acc: 0.2769, Time: 0.33s
Epoch 7/15: Train Loss: 2.2087, Val Loss: 2.1908, Val Acc: 0.2803, Time: 0.41s
Epoch 8/15: Train Loss: 2.2022, Val Loss: 2.1837, Val Acc: 0.2814, Time: 0.40s
Epoch 9/15: Train Loss: 2.2020, Val Loss: 2.1840, Val Acc: 0.2831, Time: 0.41s
Epoch 10/15: Train Loss: 2.2017, Val Loss: 2.1794, Val Acc: 0.2780, Time: 0.39s
Epoch 11/15: Train Loss: 2.1978, Val Loss: 2.1771, Val Acc: 0.2783, Time: 0.38s
Epoch 12/15: Train Loss: 2.1964, Val Loss: 2.1811, Val Acc: 0.2806, Time: 0.36s
Epoch 13/15: Train Loss: 2.1983, Val Loss: 2.1779



Epoch 1/15: Train Loss: 2.5903, Val Loss: 2.3053, Val Acc: 0.2808, Time: 0.44s
Epoch 2/15: Train Loss: 2.2790, Val Loss: 2.2304, Val Acc: 0.2823, Time: 0.56s
Epoch 3/15: Train Loss: 2.2384, Val Loss: 2.2129, Val Acc: 0.2869, Time: 0.52s
Epoch 4/15: Train Loss: 2.2259, Val Loss: 2.2048, Val Acc: 0.2816, Time: 0.43s
Epoch 5/15: Train Loss: 2.2163, Val Loss: 2.1920, Val Acc: 0.2805, Time: 0.40s
Epoch 6/15: Train Loss: 2.2093, Val Loss: 2.1954, Val Acc: 0.2777, Time: 0.38s
Epoch 7/15: Train Loss: 2.2052, Val Loss: 2.1889, Val Acc: 0.2805, Time: 0.39s
Epoch 8/15: Train Loss: 2.2027, Val Loss: 2.1896, Val Acc: 0.2871, Time: 0.51s
Epoch 9/15: Train Loss: 2.2009, Val Loss: 2.1851, Val Acc: 0.2791, Time: 0.52s
Epoch 10/15: Train Loss: 2.1982, Val Loss: 2.1855, Val Acc: 0.2848, Time: 0.54s
Epoch 11/15: Train Loss: 2.1976, Val Loss: 2.1875, Val Acc: 0.2772, Time: 0.56s
Epoch 12/15: Train Loss: 2.1970, Val Loss: 2.1826, Val Acc: 0.2828, Time: 0.37s
Epoch 13/15: Train Loss: 2.1973, Val Loss: 2.1786

# problem 3 and 4: Transformer-based encoder-decoder architecture for English to French Translation and French to English Translation

In [1]:
pairs_en_fr = [

    ("I am cold", "J'ai froid"),
    ("You are tired", "Tu es fatigué"),
    ("He is hungry", "Il a faim"),
    ("She is happy", "Elle est heureuse"),
    ("We are friends", "Nous sommes amis"),
    ("They are students", "Ils sont étudiants"),
    ("The cat is sleeping", "Le chat dort"),
    ("The sun is shining", "Le soleil brille"),
    ("We love music", "Nous aimons la musique"),
    ("She speaks French fluently", "Elle parle français couramment"),
    ("He enjoys reading books", "Il aime lire des livres"),
    ("They play soccer every weekend", "Ils jouent au football chaque week-end"),
    ("The movie starts at 7 PM", "Le film commence à 19 heures"),
    ("She wears a red dress", "Elle porte une robe rouge"),
    ("We cook dinner together", "Nous cuisinons le dîner ensemble"),
    ("He drives a blue car", "Il conduit une voiture bleue"),
    ("They visit museums often", "Ils visitent souvent des musées"),
    ("The restaurant serves delicious food", "Le restaurant sert une délicieuse cuisine"),
    ("She studies mathematics at university", "Elle étudie les mathématiques à l'université"),
    ("We watch movies on Fridays", "Nous regardons des films le vendredi"),
    ("He listens to music while jogging", "Il écoute de la musique en faisant du jogging"),
    ("They travel around the world", "Ils voyagent autour du monde"),
    ("The book is on the table", "Le livre est sur la table"),
    ("She dances gracefully", "Elle danse avec grâce"),
    ("We celebrate birthdays with cake", "Nous célébrons les anniversaires avec un gâteau"),
    ("He works hard every day", "Il travaille dur tous les jours"),
    ("They speak different languages", "Ils parlent différentes langues"),
    ("The flowers bloom in spring", "Les fleurs fleurissent au printemps"),
    ("She writes poetry in her free time", "Elle écrit de la poésie pendant son temps libre"),
    ("We learn something new every day", "Nous apprenons quelque chose de nouveau chaque jour"),
    ("The dog barks loudly", "Le chien aboie bruyamment"),
    ("He sings beautifully", "Il chante magnifiquement"),
    ("They swim in the pool", "Ils nagent dans la piscine"),
    ("The birds chirp in the morning", "Les oiseaux gazouillent le matin"),
    ("She teaches English at school", "Elle enseigne l'anglais à l'école"),
    ("We eat breakfast together", "Nous prenons le petit déjeuner ensemble"),
    ("He paints landscapes", "Il peint des paysages"),
    ("They laugh at the joke", "Ils rient de la blague"),
    ("The clock ticks loudly", "L'horloge tic-tac bruyamment"),
    ("She runs in the park", "Elle court dans le parc"),
    ("We travel by train", "Nous voyageons en train"),
    ("He writes a letter", "Il écrit une lettre"),
    ("They read books at the library", "Ils lisent des livres à la bibliothèque"),
    ("The baby cries", "Le bébé pleure"),
    ("She studies hard for exams", "Elle étudie dur pour les examens"),
    ("We plant flowers in the garden", "Nous plantons des fleurs dans le jardin"),
    ("He fixes the car", "Il répare la voiture"),
    ("They drink coffee in the morning", "Ils boivent du café le matin"),
    ("The sun sets in the evening", "Le soleil se couche le soir"),
    ("She dances at the party", "Elle danse à la fête"),
    ("We play music at the concert", "Nous jouons de la musique au concert"),
    ("He cooks dinner for his family", "Il cuisine le dîner pour sa famille"),
    ("They study French grammar", "Ils étudient la grammaire française"),
    ("The rain falls gently", "La pluie tombe doucement"),
    ("She sings a song", "Elle chante une chanson"),
    ("We watch a movie together", "Nous regardons un film ensemble"),
    ("He sleeps deeply", "Il dort profondément"),
    ("They travel to Paris", "Ils voyagent à Paris"),
    ("The children play in the park", "Les enfants jouent dans le parc"),
    ("She walks along the beach", "Elle se promène le long de la plage"),
    ("We talk on the phone", "Nous parlons au téléphone"),
    ("He waits for the bus", "Il attend le bus"),
    ("They visit the Eiffel Tower", "Ils visitent la tour Eiffel"),
    ("The stars twinkle at night", "Les étoiles scintillent la nuit"),
    ("She dreams of flying", "Elle rêve de voler"),
    ("We work in the office", "Nous travaillons au bureau"),
    ("He studies history", "Il étudie l'histoire"),
    ("They listen to the radio", "Ils écoutent la radio"),
    ("The wind blows gently", "Le vent souffle doucement"),
    ("She swims in the ocean", "Elle nage dans l'océan"),
    ("We dance at the wedding", "Nous dansons au mariage"),
    ("He climbs the mountain", "Il gravit la montagne"),
    ("They hike in the forest", "Ils font de la randonnée dans la forêt"),
    ("The cat meows loudly", "Le chat miaule bruyamment"),
    ("She paints a picture", "Elle peint un tableau"),
    ("We build a sandcastle", "Nous construisons un château de sable"),
    ("He sings in the choir", "Il chante dans le chœur"),
    ("They ride bicycles", "Ils font du vélo"),
    ("The coffee is hot", "Le café est chaud"),
    ("She wears glasses", "Elle porte des lunettes"),
    ("We visit our grandparents", "Nous rendons visite à nos grands-parents"),
    ("He plays the guitar", "Il joue de la guitare"),
    ("They go shopping", "Ils font du shopping"),
    ("The teacher explains the lesson", "Le professeur explique la leçon"),
    ("She takes the train to work", "Elle prend le train pour aller au travail"),
    ("We bake cookies", "Nous faisons des biscuits"),
    ("He washes his hands", "Il se lave les mains"),
    ("They enjoy the sunset", "Ils apprécient le coucher du soleil"),
    ("The river flows calmly", "La rivière coule calmement"),
    ("She feeds the cat", "Elle nourrit le chat"),
    ("We visit the museum", "Nous visitons le musée"),
    ("He fixes his bicycle", "Il répare son vélo"),
    ("They paint the walls", "Ils peignent les murs"),
    ("The baby sleeps peacefully", "Le bébé dort paisiblement"),
    ("She ties her shoelaces", "Elle attache ses lacets"),
    ("We climb the stairs", "Nous montons les escaliers"),
    ("He shaves in the morning", "Il se rase le matin"),
    ("They set the table", "Ils mettent la table"),
    ("The airplane takes off", "L'avion décolle"),
    ("She waters the plants", "Elle arrose les plantes"),
    ("We practice yoga", "Nous pratiquons le yoga"),
    ("He turns off the light", "Il éteint la lumière"),
    ("They play video games", "Ils jouent aux jeux vidéo"),
    ("The soup smells delicious", "La soupe sent délicieusement bon"),
    ("She locks the door", "Elle ferme la porte à clé"),
    ("We enjoy a picnic", "Nous profitons d'un pique-nique"),
    ("He checks his email", "Il vérifie ses emails"),
    ("They go to the gym", "Ils vont à la salle de sport"),
    ("The moon shines brightly", "La lune brille intensément"),
    ("She catches the bus", "Elle attrape le bus"),
    ("We greet our neighbors", "Nous saluons nos voisins"),
    ("He combs his hair", "Il se peigne les cheveux"),
    ("They wave goodbye", "Ils font un signe d'adieu")

]

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import math
from collections import defaultdict
import numpy as np

BATCH_SIZE = 32
D_MODEL = 64
NUM_EPOCHS = 20
FFN_HID_DIM = 256
DROPOUT = 0.1
LEARNING_RATE = 0.001
MAX_SEQ_LEN = 20
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class BidirectionalTranslationDataset(Dataset):
    def __init__(self, pairs, src_vocab, tgt_vocab, direction='en-fr'):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.direction = direction

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        if self.direction == 'en-fr':
            src, tgt = self.pairs[idx]
        else:
            tgt, src = self.pairs[idx]

        src_tokens = ['<sos>'] + src.split() + ['<eos>']
        tgt_tokens = ['<sos>'] + tgt.split() + ['<eos>']

        src_ids = [self.src_vocab.get(token, self.src_vocab['<unk>']) for token in src_tokens]
        tgt_ids = [self.tgt_vocab.get(token, self.tgt_vocab['<unk>']) for token in tgt_tokens]

        return torch.tensor(src_ids), torch.tensor(tgt_ids)

# Build both vocabs
all_en = [p[0] for p in pairs_en_fr]
all_fr = [p[1] for p in pairs_en_fr]
en_vocab = build_vocab(all_en)
fr_vocab = build_vocab(all_fr)

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, num_layers, num_heads):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab_size, D_MODEL)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, D_MODEL)
        self.pos_encoder = PositionalEncoding(D_MODEL)

        self.transformer = nn.Transformer(
            d_model=D_MODEL,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=FFN_HID_DIM,
            dropout=DROPOUT,
            batch_first=True
        )
        self.fc = nn.Linear(D_MODEL, tgt_vocab_size)

    def forward(self, src, tgt):
        src = self.pos_encoder(self.src_embed(src) * math.sqrt(D_MODEL))
        tgt = self.pos_encoder(self.tgt_embed(tgt) * math.sqrt(D_MODEL))

        src_mask = self.transformer.generate_square_subsequent_mask(src.size(1)).to(device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(device)

        output = self.transformer(src, tgt,
                                src_mask=src_mask,
                                tgt_mask=tgt_mask)
        return self.fc(output)

def train_bidirectional(config, direction='en-fr'):
    dataset = BidirectionalTranslationDataset(
        pairs_en_fr,
        src_vocab=en_vocab if direction == 'en-fr' else fr_vocab,
        tgt_vocab=fr_vocab if direction == 'en-fr' else en_vocab,
        direction=direction
    )
    train_loader = DataLoader(dataset, batch_size=BATCH_SIZE,
                            shuffle=True, collate_fn=collate_fn)

    model = Transformer(
        src_vocab_size=len(en_vocab) if direction == 'en-fr' else len(fr_vocab),
        tgt_vocab_size=len(fr_vocab) if direction == 'en-fr' else len(en_vocab),
        num_layers=config['num_layers'],
        num_heads=config['num_heads']
    ).to(device)

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    best_acc = 0
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0
        for src, tgt in train_loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:, :-1]

            optimizer.zero_grad()
            output = model(src, tgt_input)
            loss = criterion(output.reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        val_loss, correct, total = 0, 0, 0
        with torch.no_grad():
            for src, tgt in train_loader:
                src, tgt = src.to(device), tgt.to(device)
                tgt_input = tgt[:, :-1]
                output = model(src, tgt_input)
                val_loss += criterion(output.reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1)).item()

                preds = output.argmax(-1)
                mask = (tgt[:, 1:] != 0)
                correct += ((preds == tgt[:, 1:]) & mask).sum().item()
                total += mask.sum().item()

        print(f"Direction {direction} | Config {config} | Epoch {epoch+1}")
        print(f"Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(train_loader):.4f} | Val Acc: {correct/total:.4f}")

    return model

# Configuration Testing for both directions
configurations = [
    {'num_layers': 1, 'num_heads': 2},
    {'num_layers': 1, 'num_heads': 4},
    {'num_layers': 2, 'num_heads': 2},
    {'num_layers': 2, 'num_heads': 4},
    {'num_layers': 4, 'num_heads': 2},
    {'num_layers': 4, 'num_heads': 4},
]

trained_models = {'en-fr': [], 'fr-en': []}

for config in configurations:
    # Train English→French
    print(f"\nTraining EN→FR with config: {config}")
    model_en_fr = train_bidirectional(config, direction='en-fr')
    trained_models['en-fr'].append(model_en_fr)

    # Train French→English
    print(f"\nTraining FR→EN with config: {config}")
    model_fr_en = train_bidirectional(config, direction='fr-en')
    trained_models['fr-en'].append(model_fr_en)

def bidirectional_translate(model, sentence, src_vocab, tgt_vocab, device, direction='en-fr', config=None):
    model.eval()
    tokens = ['<sos>'] + sentence.split() + ['<eos>']
    src = torch.tensor([src_vocab.get(token, src_vocab['<unk>']) for token in tokens]).unsqueeze(0).to(device)

    tgt = torch.tensor([[tgt_vocab['<sos>']]]).to(device)
    for _ in range(MAX_SEQ_LEN):
        output = model(src, tgt)
        next_token = output.argmax(-1)[:, -1].unsqueeze(1)
        tgt = torch.cat([tgt, next_token], dim=1)
        if next_token.item() == tgt_vocab['<eos>']:
            break

    translated = [tgt_vocab_inv.get(i, '<unk>') for i in tgt.squeeze().tolist()[1:-1]]
    return {
        'direction': direction,
        'config': config,
        'translation': ' '.join(translated)
    }

print("\nBidirectional Qualitative Evaluation:")
for idx, config in enumerate(configurations):
    print(f"\n{'='*40}")
    print(f"Evaluating Configuration: {config}")
    print(f"{'='*40}")

    model_en_fr = trained_models['en-fr'][idx]
    model_fr_en = trained_models['fr-en'][idx]

    for en, fr in test_sentences:
        # English→French translation
        fr_result = bidirectional_translate(
            model_en_fr, en, en_vocab, fr_vocab, device,
            direction='en-fr', config=config
        )

        # French→English translation
        en_result = bidirectional_translate(
            model_fr_en, fr, fr_vocab, en_vocab, device,
            direction='fr-en', config=config
        )

        print(f"\nConfiguration: {config}")
        print(f"Direction: {fr_result['direction']}")
        print(f"Source EN: {en}")
        print(f"Predicted FR: {fr_result['translation']}")
        print(f"Actual FR: {fr}")

        print(f"\nConfiguration: {en_result['config']}")
        print(f"Direction: {en_result['direction']}")
        print(f"Source FR: {fr}")
        print(f"Predicted EN: {en_result['translation']}")
        print(f"Actual EN: {en}")
        print("-"*60)


Training EN→FR with config: {'num_layers': 1, 'num_heads': 2}
Direction en-fr | Config {'num_layers': 1, 'num_heads': 2} | Epoch 1
Train Loss: 5.5308 | Val Loss: 5.1443 | Val Acc: 0.1644
Direction en-fr | Config {'num_layers': 1, 'num_heads': 2} | Epoch 2
Train Loss: 5.0591 | Val Loss: 4.8219 | Val Acc: 0.1736
Direction en-fr | Config {'num_layers': 1, 'num_heads': 2} | Epoch 3
Train Loss: 4.8056 | Val Loss: 4.6205 | Val Acc: 0.1751
Direction en-fr | Config {'num_layers': 1, 'num_heads': 2} | Epoch 4
Train Loss: 4.6055 | Val Loss: 4.4512 | Val Acc: 0.2197
Direction en-fr | Config {'num_layers': 1, 'num_heads': 2} | Epoch 5
Train Loss: 4.4618 | Val Loss: 4.2803 | Val Acc: 0.2581
Direction en-fr | Config {'num_layers': 1, 'num_heads': 2} | Epoch 6
Train Loss: 4.2981 | Val Loss: 4.1390 | Val Acc: 0.2488
Direction en-fr | Config {'num_layers': 1, 'num_heads': 2} | Epoch 7
Train Loss: 4.1614 | Val Loss: 3.9944 | Val Acc: 0.2750
Direction en-fr | Config {'num_layers': 1, 'num_heads': 2} | E