# Notebook de comparaison d'hyperparamètres : architecture, learning rate, etc.


## Comparaison taille de transformer
Comparaison de quatre tailles de transformer (dimension du modèle : 16, 32, 64 et 128) afin de retenir la moins lourde.


In [None]:
import pandas as pd
from torch.utils.data import DataLoader, random_split
from transformer import Transformer
from PommierDataset import PommierDataset, collate_fn
import torch.optim as optim
import torch.nn as nn
import torch
import wandb

In [None]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise,
# and report which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU is not available")

In [None]:

# Load the dataset
dataset_path = "out/sequence_analysis_generated_dataset10000.csv"
dataset = PommierDataset(dataset_path)

# Split the dataset into train and validation sets.
# NOTE: despite its name, VAL_SPLIT is the fraction of samples assigned to the
# *training* set (0.8 -> 80 % train / 20 % validation). The name is kept
# unchanged because the training cells read it when filling the wandb config.
VAL_SPLIT = 0.8
train_size = int(VAL_SPLIT * len(dataset))
# Validation gets the remainder so the two sizes always sum to len(dataset).
val_size = len(dataset) - train_size
train_split, val_split = random_split(dataset, [train_size, val_size])

# Create DataLoaders
BATCH_SIZE = 512
train_loader = DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
# The validation loader is deliberately NOT shuffled: the training cells log
# and average one loss per validation batch, so keeping the batches identical
# from one epoch (and one run) to the next makes those values comparable.
val_loader = DataLoader(val_split, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# List of transformer sizes (model dimensions) compared by the size sweep
transformer_sizes = [16, 32, 64, 128]


In [None]:


# Model-size sweep: train one Transformer per entry of `transformer_sizes`,
# log every run to Weights & Biases and save the final weights to disk.

# Hyperparameters shared by all runs of this sweep (loop-invariant, set once).
NUM_EPOCHS = 12
LEARNING_RATE = 0.00005
MAX_D_MODEL = 128  # not referenced in this cell; kept in case another cell reads it
# Second positional argument of Transformer. The layer-count sweep passes its
# `num_layers` in that position, so this is presumably the number of layers
# -- TODO confirm against Transformer's signature.
NUM_LAYERS = 12

# Iterate over transformer sizes
for size in transformer_sizes:
    num_heads = max(1, size // 32)  # Ensure at least 1 head
    exp_name = f"Transformer_{size}"

    # The run is used as a context manager so that it is always closed (and
    # flagged as failed when training raises) instead of being left open if
    # the cell aborts half-way through.
    with wandb.init(
        project="Topologie-Pommiers",
        name=exp_name,
        config={
            "learning_rate": LEARNING_RATE,
            "Val split": VAL_SPLIT,
            "architecture": exp_name,
            "dataset": "sequence_analysis_generated_dataset10000.csv",
            "batch size": BATCH_SIZE,
            "Dimension model": size,
            "Number of heads": num_heads,
            # Logged under the same key as in the layer-count sweep.
            "Number of layers": NUM_LAYERS,
            "epochs": NUM_EPOCHS,
            "dynamic": False,
        },
    ) as run:
        # Initialize the model
        model = Transformer(17, NUM_LAYERS, size, num_heads, 0)
        model.to("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): nn.Module has no `device` attribute, so this is
        # presumably provided by the project's Transformer class -- confirm.
        run_device = model.device

        # Add number of parameters to wandb config
        run.config.update({"num_parameters": sum(p.numel() for p in model.parameters())})

        # Initialize the optimizer and loss function.
        # Targets equal to 0 are excluded from the loss; 0 is also the value
        # used below to build the padding mask.
        optimizer = optim.Adam(model.parameters(), LEARNING_RATE)
        criterion = nn.CrossEntropyLoss(ignore_index=0)

        # Training loop
        for epoch in range(NUM_EPOCHS):
            # ---- training pass ----
            model.train()
            total_train_loss = 0
            for enc_inp, dec_inp, dec_target in train_loader:
                enc_inp = enc_inp.to(run_device)
                dec_inp = dec_inp.to(run_device)
                dec_target = dec_target.to(run_device)
                # 1.0 where the decoder input is 0, 0.0 elsewhere. Built from
                # dec_inp, so it already lives on the same device.
                padding_mask = (dec_inp == 0).to(torch.float32)

                logits = model(enc_inp, dec_inp, padding_mask)
                # Collapse all leading dimensions so the loss sees one row of
                # class scores per target entry.
                logits = logits.view(-1, logits.size(-1))
                dec_target = dec_target.view(-1)
                loss = criterion(logits, dec_target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                total_train_loss += batch_loss
                run.log({"train_loss": batch_loss})

            # ---- validation pass (no gradient tracking) ----
            model.eval()
            total_eval_loss = 0
            with torch.no_grad():
                for enc_inp, dec_inp, dec_target in val_loader:
                    enc_inp = enc_inp.to(run_device)
                    dec_inp = dec_inp.to(run_device)
                    dec_target = dec_target.to(run_device)
                    padding_mask = (dec_inp == 0).to(torch.float32)

                    logits = model(enc_inp, dec_inp, padding_mask)
                    logits = logits.view(-1, logits.size(-1))
                    dec_target = dec_target.view(-1)
                    loss = criterion(logits, dec_target)

                    batch_loss = loss.item()
                    total_eval_loss += batch_loss
                    run.log({"val_loss": batch_loss})

            # Epoch summaries are the mean of the per-batch losses.
            avg_train_loss = total_train_loss / len(train_loader)
            avg_val_loss = total_eval_loss / len(val_loader)
            print(f"[INFO] Epoch {epoch} : train loss = {avg_train_loss:.4f}, val loss = {avg_val_loss:.4f}")

        # Persist the trained weights before the run is closed.
        torch.save(model.state_dict(), f"transformer_{size}.pth")

In [None]:
## Comparaison du nombre de couches du transformer
Dans cette section, nous comparons les performances du transformer en fonction de son nombre de couches. La dimension du modèle est fixée (32, avec une seule tête d'attention) et seul le nombre de couches varie (1, 2, 3 puis 6), afin d'observer son impact sur les performances du modèle.

In [None]:
# Layer-count sweep: with the model dimension and head count held fixed, train
# one Transformer per entry of `num_layers_list`, log every run to Weights &
# Biases and save the final weights to disk.

# List of number of layers
num_layers_list = [1, 2, 3, 6]
fixed_size = 32
num_heads = 1

# Hyperparameters shared by all runs of this sweep (loop-invariant, set once).
NUM_EPOCHS = 12
LEARNING_RATE = 0.00005

# Iterate over number of layers
for num_layers in num_layers_list:
    exp_name = f"Transformer_{fixed_size}_layers_{num_layers}"

    # The run is used as a context manager so that it is always closed (and
    # flagged as failed when training raises) instead of being left open if
    # the cell aborts half-way through.
    with wandb.init(
        project="Topologie-Pommiers",
        name=exp_name,
        config={
            "learning_rate": LEARNING_RATE,
            "Val split": VAL_SPLIT,
            "architecture": exp_name,
            "dataset": "sequence_analysis_generated_dataset10000.csv",
            "batch size": BATCH_SIZE,
            "Dimension model": fixed_size,
            "Number of heads": num_heads,
            "Number of layers": num_layers,
            "epochs": NUM_EPOCHS,
            "dynamic": False,
        },
    ) as run:
        # Initialize the model
        model = Transformer(17, num_layers, fixed_size, num_heads, 0)
        model.to("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): nn.Module has no `device` attribute, so this is
        # presumably provided by the project's Transformer class -- confirm.
        run_device = model.device

        # Add number of parameters to wandb config
        run.config.update({"num_parameters": sum(p.numel() for p in model.parameters())})

        # Initialize the optimizer and loss function.
        # Targets equal to 0 are excluded from the loss; 0 is also the value
        # used below to build the padding mask.
        optimizer = optim.Adam(model.parameters(), LEARNING_RATE)
        criterion = nn.CrossEntropyLoss(ignore_index=0)

        # Training loop
        for epoch in range(NUM_EPOCHS):
            # ---- training pass ----
            model.train()
            total_train_loss = 0
            for enc_inp, dec_inp, dec_target in train_loader:
                enc_inp = enc_inp.to(run_device)
                dec_inp = dec_inp.to(run_device)
                dec_target = dec_target.to(run_device)
                # 1.0 where the decoder input is 0, 0.0 elsewhere. Built from
                # dec_inp, so it already lives on the same device.
                padding_mask = (dec_inp == 0).to(torch.float32)

                logits = model(enc_inp, dec_inp, padding_mask)
                # Collapse all leading dimensions so the loss sees one row of
                # class scores per target entry.
                logits = logits.view(-1, logits.size(-1))
                dec_target = dec_target.view(-1)
                loss = criterion(logits, dec_target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                total_train_loss += batch_loss
                run.log({"train_loss": batch_loss})

            # ---- validation pass (no gradient tracking) ----
            model.eval()
            total_eval_loss = 0
            with torch.no_grad():
                for enc_inp, dec_inp, dec_target in val_loader:
                    enc_inp = enc_inp.to(run_device)
                    dec_inp = dec_inp.to(run_device)
                    dec_target = dec_target.to(run_device)
                    padding_mask = (dec_inp == 0).to(torch.float32)

                    logits = model(enc_inp, dec_inp, padding_mask)
                    logits = logits.view(-1, logits.size(-1))
                    dec_target = dec_target.view(-1)
                    loss = criterion(logits, dec_target)

                    batch_loss = loss.item()
                    total_eval_loss += batch_loss
                    run.log({"val_loss": batch_loss})

            # Epoch summaries are the mean of the per-batch losses.
            avg_train_loss = total_train_loss / len(train_loader)
            avg_val_loss = total_eval_loss / len(val_loader)
            print(f"[INFO] Epoch {epoch} : train loss = {avg_train_loss:.4f}, val loss = {avg_val_loss:.4f}")

        # Persist the trained weights before the run is closed.
        torch.save(model.state_dict(), f"transformer_{fixed_size}_layers_{num_layers}.pth")