In [1]:
from model import build_transformer

import torch
import torch.nn as nn
from torch.utils.data import random_split, Dataset, DataLoader

from tqdm import tqdm

import os

# Make CUDA the default device for tensors created without an explicit `device=`.
torch.set_default_device(torch.device('cuda'))

In [None]:
# Build the network. Every hyper-parameter is forwarded as-is to the
# project-local `build_transformer` factory (imported from model.py), which
# defines their exact meaning.
model = build_transformer(dropout=0.1,
						source_vocab_size=8000+1, target_vocab_size=292, context_length=900 - 3 + 1,
						encoder_block_count=6,
						encoder_self_attention_head_count=8,
						encoder_self_attention_abstraction_coef=0.15,
						encoder_feed_forward_abstraction_coef=4,
						dim=256, epsilon=1e-9)

# Count only the parameters that receive gradient updates.
total_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(f"Nombre total de paramètres apprenables : {total_params}")

# Optional warm start: point `model_path` at a checkpoint file to resume from it.
model_path = None
if model_path:
	# weights_only=True restricts unpickling to tensors and plain containers.
	checkpoint = torch.load(model_path, weights_only=True)
	# training_loop writes {"state": state_dict}; a bare state_dict is accepted too.
	if isinstance(checkpoint, dict) and "state" in checkpoint:
		checkpoint = checkpoint["state"]
	model.load_state_dict(checkpoint)

In [None]:
# Load the pre-processed tensors and move them to the GPU.
# NOTE(review): X looks like token ids, L like per-sample lengths (training_loop
# uses it to pick a mask) and Y like the targets — confirm against the script
# that produced the .pt files.
X = torch.load('X.pt', weights_only=True).int().to("cuda")
L = torch.load('L.pt', weights_only=True).int().to("cuda")
Y = torch.load('Y.pt', weights_only=True).half().to("cuda")

# The three tensors describe the same samples, so they must have the same length.
if not (len(X) == len(L) == len(Y)):
	raise ValueError(
		f"X, L and Y must have the same number of samples, got {len(X)}, {len(L)}, {len(Y)}"
	)

train_ratio = 0.9
train_size = int(len(X) * train_ratio)
test_size = len(X) - train_size

# Draw ONE permutation and reuse it for all three tensors. Calling random_split
# once per tensor with a shared generator advances the generator between calls,
# so each tensor would be shuffled differently and X_train[i], L_train[i] and
# Y_train[i] would no longer describe the same sample.
generator = torch.Generator(device="cuda").manual_seed(42)
permutation = torch.randperm(len(X), generator=generator, device="cuda").tolist()
train_indices = permutation[:train_size]
test_indices = permutation[train_size:]

# Subset is the same wrapper type random_split returns, so downstream code
# (len(), integer indexing) keeps working unchanged.
X_train = torch.utils.data.Subset(X, train_indices)
X_test = torch.utils.data.Subset(X, test_indices)
L_train = torch.utils.data.Subset(L, train_indices)
L_test = torch.utils.data.Subset(L, test_indices)
Y_train = torch.utils.data.Subset(Y, train_indices)
Y_test = torch.utils.data.Subset(Y, test_indices)

print(len(X_train), len(X_test))
print(len(L_train), len(L_test))
print(len(Y_train), len(Y_test))

In [4]:
# Training settings shared by the DataLoader cell and training_loop.
config = dict(
	batch_size=16,              # mini-batch size given to the DataLoader
	epochs=100,                 # number of passes over the training loader
	lr=1e-4,                    # AdamW learning rate
	epsilon=1e-9,               # AdamW `eps`
	weigths_folder="weights/",  # checkpoint directory; training_loop looks up this exact (misspelled) key
	weights_file="tr_model_",   # checkpoint file prefix, the epoch index is appended
)

In [5]:
class CustomDataset(Dataset):
    """Map-style dataset serving three parallel, indexable collections.

    Item ``idx`` is the triple ``(X[idx], L[idx], Y[idx])``. The dataset length
    is taken from ``X`` alone; ``L`` and ``Y`` are not checked against it.
    """

    def __init__(self, X, L, Y):
        """Store references to the three collections; nothing is copied."""
        self.X, self.L, self.Y = X, L, Y

    def __len__(self):
        """Return the number of entries in ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``idx``-th entry of ``X``, ``L`` and ``Y`` as a 3-tuple."""
        sample_x = self.X[idx]
        sample_l = self.L[idx]
        sample_y = self.Y[idx]
        return sample_x, sample_l, sample_y


# Wrap the training split and serve it in shuffled mini-batches.
dataset = CustomDataset(X_train, L_train, Y_train)

# The shuffling generator is created on the GPU.
# NOTE(review): presumably required because the notebook makes CUDA the default
# device — confirm before changing.
shuffle_generator = torch.Generator(device='cuda')
loader = DataLoader(
    dataset,
    batch_size=config["batch_size"],
    shuffle=True,
    generator=shuffle_generator,
)

In [None]:
import torch

# Toy 5x5x5 illustration of the mask layout: slice i keeps columns 0..i in
# every row and zeroes the columns to their right (the last slice is all ones).
demo_size = 5
keep_columns = torch.tril(torch.ones((demo_size, demo_size), device="cuda"))
masks = keep_columns.unsqueeze(1).repeat(1, demo_size, 1)

# Display the slices selected by indices 0, 2 and 4.
print(masks[torch.tensor([0, 2, 4])])

In [7]:
def training_loop(model, data, config, info):
	"""Train `model` on the batches yielded by `data`, saving a checkpoint per epoch.

	Args:
		model: module called as ``model(x, mask=m)``; its output is fed to
			``nn.BCEWithLogitsLoss`` together with ``y``.
		data: iterable yielding ``(x, l, y)`` batches. ``l`` is an integer
			tensor that selects one mask per sample: mask ``l - 1`` keeps the
			first ``l`` columns and zeroes the rest.
			NOTE(review): presumably ``l`` is the unpadded sequence length — confirm.
		config: dict providing ``"epochs"``, ``"lr"``, ``"epsilon"`` (AdamW eps),
			``"weigths_folder"`` (sic, key spelling kept for compatibility) and
			``"weights_file"``.
		info: dict providing ``"context_length"``. An optional ``"device"``
			entry (default ``"cuda"``) selects where the mask table is stored.

	Side effects:
		Creates the weights folder if it does not exist and, after every epoch,
		writes ``{"state": model.state_dict()}`` to
		``<weigths_folder>/<weights_file><epoch>``.
	"""
	# exist_ok avoids the check-then-create race of exists() + makedirs().
	os.makedirs(config["weigths_folder"], exist_ok=True)

	optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"], eps=config["epsilon"], weight_decay=1e-5)
	loss_fn = nn.BCEWithLogitsLoss()

	context_length = info["context_length"]
	device = info.get("device", "cuda")

	# Row i of the table keeps columns 0..i. Only this (T, T) table is stored;
	# a dense (T, 1, T, T) table would hold T**3 floats (about 2.9 GB for
	# T = 898) although every row of a given mask is identical.
	keep_table = torch.tril(torch.ones((context_length, context_length), device=device))

	for epoch in range(config["epochs"]):
		model.train()
		batch_iterator = tqdm(data, desc=f"Prcessing epoch {epoch:02d}")
		for x, l, y in batch_iterator:
			# Pick each sample's row, then broadcast it to (..., 1, T, T): same
			# values, shape and dtype as indexing a dense mask table with l - 1.
			keep = keep_table[l - 1]
			m = keep[..., None, None, :].expand(
				*keep.shape[:-1], 1, context_length, context_length
			).contiguous()
			pred = model(x, mask=m)

			# Cast the targets to the prediction dtype so the loss does not
			# depend on mixed-dtype promotion (targets may be half precision).
			loss = loss_fn(pred, y.to(pred.dtype))
			batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

			loss.backward()
			optimizer.step()
			optimizer.zero_grad()

		# os.path.join also works when the folder has no trailing separator.
		model_path = os.path.join(config["weigths_folder"], config["weights_file"] + str(epoch))
		torch.save({"state": model.state_dict()}, model_path)

In [None]:
# training_loop sizes its masks from this value; it is the same expression as
# the context_length passed to build_transformer when the model was created.
info = dict(context_length=900 - 3 + 1)

# Start training on the shuffled training loader.
training_loop(model, loader, config, info)