In [None]:
!pip uninstall -y transformers optimum-quanto accelerate


Found existing installation: transformers 4.57.3
Uninstalling transformers-4.57.3:
  Successfully uninstalled transformers-4.57.3
[0mFound existing installation: accelerate 1.12.0
Uninstalling accelerate-1.12.0:
  Successfully uninstalled accelerate-1.12.0


In [None]:
!pip install "transformers==4.44.2"


Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.2)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
  

In [None]:
import json
import numpy as np
import time
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [None]:
# 1) Load the prepared (input, output) pairs
def _read_pairs(path):
    """Return the parsed content of the UTF-8 encoded JSON file at ``path``."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


train_pairs = _read_pairs("/content/bart_train_pairsfinal.json")
dev_pairs = _read_pairs("/content/bart_dev_pairsfinal.json")

print("Train size:", len(train_pairs))
print("Dev size  :", len(dev_pairs))


# 2) Sequence-to-sequence dataset of (input text, target SQL) pairs
class Text2SQLDataset(Dataset):
    """Map-style dataset that tokenizes one text-to-SQL pair per item.

    Every element of ``pairs`` is indexed with the keys ``"input"`` and
    ``"output"``. Both texts go through ``tokenizer`` with
    ``padding="max_length"`` and ``truncation=True``, using
    ``max_input_len`` for the input and ``max_output_len`` for the output.
    """

    def __init__(self, pairs, tokenizer, max_input_len=512, max_output_len=256):
        self.pairs, self.tokenizer = pairs, tokenizer
        self.max_input_len, self.max_output_len = max_input_len, max_output_len

    def __len__(self):
        """Number of available pairs."""
        return len(self.pairs)

    def _tokenize(self, text, limit):
        """Run the tokenizer on ``text`` with ``limit`` as ``max_length``."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=limit,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for pair ``idx``."""
        example = self.pairs[idx]
        source = self._tokenize(example["input"], self.max_input_len)
        target = self._tokenize(example["output"], self.max_output_len)

        # squeeze(0) drops the leading axis of the tokenizer output
        # (presumably a batch axis of size 1 for a single text).
        sample = {
            field: source[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        # The target token ids are used as labels; padding is left as-is here.
        sample["labels"] = target["input_ids"].squeeze(0)
        return sample


Train size: 8659
Dev size  : 1034


In [None]:
# 1) Load the T5-small checkpoint together with its tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Keep the special-token ids of the model config aligned with the tokenizer
# (author's note: otherwise the loss occasionally raises errors).
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
print("Device :", device)

# 2) Wrap both splits in datasets and batch them; only training is shuffled
train_dataset = Text2SQLDataset(train_pairs, tokenizer)
val_dataset = Text2SQLDataset(dev_pairs, tokenizer)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=2)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=4)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Device : cuda


In [None]:
def compute_exact_match(pred_texts, label_texts):
    """Return the fraction of predictions equal to their reference.

    Both sides are compared after stripping surrounding whitespace and
    lower-casing. Pairs are formed positionally with ``zip``, so extra
    elements of the longer sequence are not scored. Returns ``0.0`` when
    there is nothing to compare.
    """
    def _normalise(texts):
        return [text.strip().lower() for text in texts]

    predictions = _normalise(pred_texts)
    references = _normalise(label_texts)

    scored = list(zip(predictions, references))
    if not scored:
        return 0.0

    hits = sum(1 for predicted, reference in scored if predicted == reference)
    return hits / len(scored)

num_epochs = 3   # start with 1 or 2 for a quick test, then raise it to 3
pad_token_id = tokenizer.pad_token_id
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

start_time = time.time()

for epoch in range(1, num_epochs + 1):
    print(f"\n===== ÉPOCH {epoch}/{num_epochs} =====")

    # ---------- training pass ----------
    model.train()
    total_loss = 0.0

    for step_number, batch in enumerate(train_loader, start=1):
        optimizer.zero_grad()

        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        )

        # Padding positions are replaced by -100 so they are ignored in the loss
        labels_for_loss = labels.masked_fill(labels == pad_token_id, -100)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels_for_loss,
        )
        outputs.loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        step_loss = outputs.loss.item()
        total_loss += step_loss

        # Progress line every 500 batches
        if step_number % 500 == 0:
            print(f"  Step {step_number}/{len(train_loader)} - Loss: {step_loss:.4f}")

    avg_train_loss = total_loss / len(train_loader)
    print(f"Loss moyen entraînement : {avg_train_loss:.4f}")

    # ---------- evaluation pass on the dev set ----------
    model.eval()
    all_preds, all_labels_text = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (
                batch[key].to(device)
                for key in ("input_ids", "attention_mask", "labels")
            )

            # Beam-search generation of the predicted SQL
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=256,
                num_beams=4,
            )

            # Predictions and references are both decoded back to text
            all_preds += tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            all_labels_text += tokenizer.batch_decode(labels, skip_special_tokens=True)

    exact_match = compute_exact_match(all_preds, all_labels_text)
    print(f"Exact Match dev : {exact_match:.4f}")

# The measured duration covers the training passes and the per-epoch evaluations
end_time = time.time()
training_duration = end_time - start_time
print("\n==============================")
print("  ENTRAÎNEMENT TERMINÉ")
print("==============================")
print(f"Temps total d'entraînement : {training_duration/60:.2f} minutes")
print(f"                         = {training_duration:.2f} secondes")
print("==============================")



===== ÉPOCH 1/3 =====
  Step 500/4330 - Loss: 1.9375
  Step 1000/4330 - Loss: 0.6897
  Step 1500/4330 - Loss: 1.0437
  Step 2000/4330 - Loss: 0.8351
  Step 2500/4330 - Loss: 1.4498
  Step 3000/4330 - Loss: 0.6293
  Step 3500/4330 - Loss: 0.6679
  Step 4000/4330 - Loss: 0.7673
Loss moyen entraînement : 1.1253
Exact Match dev : 0.0580

===== ÉPOCH 2/3 =====
  Step 500/4330 - Loss: 0.3741
  Step 1000/4330 - Loss: 0.9603
  Step 1500/4330 - Loss: 0.2762
  Step 2000/4330 - Loss: 0.6203
  Step 2500/4330 - Loss: 0.2882
  Step 3000/4330 - Loss: 0.2996
  Step 3500/4330 - Loss: 0.4034
  Step 4000/4330 - Loss: 0.1742
Loss moyen entraînement : 0.5530
Exact Match dev : 0.1489

===== ÉPOCH 3/3 =====
  Step 500/4330 - Loss: 0.3949
  Step 1000/4330 - Loss: 0.3600
  Step 1500/4330 - Loss: 0.1634
  Step 2000/4330 - Loss: 0.5782
  Step 2500/4330 - Loss: 0.6786
  Step 3000/4330 - Loss: 0.2752
  Step 3500/4330 - Loss: 0.2366
  Step 4000/4330 - Loss: 0.4430
Loss moyen entraînement : 0.4040
Exact Match dev :

In [None]:
model.eval()

def show_dev_example(idx, max_length=256):
    """Print one dev example: its model input, the generated SQL and the gold SQL.

    Args:
        idx: Position of the example in ``dev_pairs``.
        max_length: Value forwarded as ``max_length`` to ``model.generate``.
    """
    example = dev_pairs[idx]
    source_text = example["input"]
    reference_sql = example["output"]
    database_name = example.get("db_id", "N/A")  # "N/A" when the pair has no db_id

    print(f"===== EXEMPLE DEV #{idx} =====")
    print(f"DB_ID : {database_name}\n")
    print("----- INPUT T5 -----")
    print(source_text)

    # Tokenize the single input (truncated at 512 tokens) and move it to the device
    encoded = tokenizer(source_text, return_tensors="pt", truncation=True, max_length=512)
    model_inputs = {}
    for key, tensor in encoded.items():
        model_inputs[key] = tensor.to(device)

    # Beam search with 4 beams, without tracking gradients
    with torch.no_grad():
        generated = model.generate(
            **model_inputs,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
        )

    predicted_sql = tokenizer.decode(generated[0], skip_special_tokens=True)

    print("\n----- SQL GÉNÉRÉ -----")
    print(predicted_sql)

    print("\n----- SQL GOLD (Spider) -----")
    print(reference_sql)
    print("=" * 60)


In [None]:
# Spot-check a few dev examples after the first three epochs
for dev_idx in (0, 10, 25):
    show_dev_example(dev_idx)


===== EXEMPLE DEV #0 =====
DB_ID : concert_singer

----- INPUT T5 -----
DB: concert_singer
QUESTION: how many singers do we have

TABLES:
- stadium: stadium id, location, name, capacity, highest, lowest, average
- singer: singer id, name, country, song name, song release year, age, is male
- concert: concert id, concert name, theme, stadium id, year
- singer in concert: concert id, singer id

FOREIGN_KEYS:
- concert.stadium id -> stadium.stadium id
- singer in concert.singer id -> singer.singer id
- singer in concert.concert id -> concert.concert id

ENTITIES:
- TABLE 'singers' -> singer


----- SQL GÉNÉRÉ -----
SELECT count(*) FROM singer

----- SQL GOLD (Spider) -----
SELECT count(*) FROM singer
===== EXEMPLE DEV #10 =====
DB_ID : concert_singer

----- INPUT T5 -----
DB: concert_singer
QUESTION: show all countries and the number of singers in each country

TABLES:
- stadium: stadium id, location, name, capacity, highest, lowest, average
- singer: singer id, name, country, song name, 

“Avec T5-small, la loss d’entraînement diminue de 1.12 à 0.40 en trois époques, ce qui montre une bonne capacité d’apprentissage.
L’Exact Match sur le jeu de validation augmente de manière monotone, de 5.8 % à 16.6 %, ce qui indique que le modèle généralise mieux que BART-base dans notre configuration expérimentale.

Malgré un score qui reste modeste au regard des modèles d’état de l’art sur Spider, ce résultat est cohérent avec la taille du modèle (T5-small), les ressources matérielles limitées (un seul GPU Colab) et la difficulté du benchmark.”

In [None]:
# Directory that receives the model and tokenizer after the first training run
output_dir = "/content/t5_small_text2sql_finetuned"

# Both objects are written into the same directory, model first
for component in (model, tokenizer):
    component.save_pretrained(output_dir)

print("Modèle T5-small et tokenizer sauvegardés dans", output_dir)


Modèle T5-small et tokenizer sauvegardés dans /content/t5_small_text2sql_finetuned


In [None]:
# Resume training from the model's current state.
# NOTE: this cell reuses `optimizer`, `train_loader`, `val_loader` and
# `compute_exact_match` created by earlier cells; it cannot run on its own.
extra_epochs = 2          # e.g. two more epochs (4 and 5)
start_epoch = 3           # three epochs have already been run
pad_token_id = tokenizer.pad_token_id

last_epoch = start_epoch + extra_epochs

for epoch in range(start_epoch + 1, last_epoch + 1):
    print(f"\n===== ÉPOCH {epoch}/{last_epoch} (CONTINUATION) =====")

    # ---------- training pass ----------
    model.train()
    total_loss = 0.0

    for step_number, batch in enumerate(train_loader, start=1):
        optimizer.zero_grad()

        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        )

        # Padding positions are replaced by -100 before computing the loss
        labels_for_loss = labels.masked_fill(labels == pad_token_id, -100)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels_for_loss,
        )
        outputs.loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        step_loss = outputs.loss.item()
        total_loss += step_loss

        # Progress line every 500 batches
        if step_number % 500 == 0:
            print(f"  Step {step_number}/{len(train_loader)} - Loss: {step_loss:.4f}")

    avg_train_loss = total_loss / len(train_loader)
    print(f"Loss moyen entraînement (epoch {epoch}) : {avg_train_loss:.4f}")

    # ---------- evaluation pass on the dev set ----------
    model.eval()
    all_preds, all_labels_text = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (
                batch[key].to(device)
                for key in ("input_ids", "attention_mask", "labels")
            )

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=256,
                num_beams=4,
            )

            # Predictions and references are both decoded back to text
            all_preds += tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            all_labels_text += tokenizer.batch_decode(labels, skip_special_tokens=True)

    exact_match = compute_exact_match(all_preds, all_labels_text)
    print(f"Exact Match dev (epoch {epoch}) : {exact_match:.4f}")



===== ÉPOCH 4/5 (CONTINUATION) =====
  Step 500/4330 - Loss: 0.5954
  Step 1000/4330 - Loss: 0.0899
  Step 1500/4330 - Loss: 0.2190
  Step 2000/4330 - Loss: 0.2122
  Step 2500/4330 - Loss: 0.5448
  Step 3000/4330 - Loss: 0.3613
  Step 3500/4330 - Loss: 0.1303
  Step 4000/4330 - Loss: 0.2580
Loss moyen entraînement (epoch 4) : 0.3260
Exact Match dev (epoch 4) : 0.1692

===== ÉPOCH 5/5 (CONTINUATION) =====
  Step 500/4330 - Loss: 0.2008
  Step 1000/4330 - Loss: 0.4557
  Step 1500/4330 - Loss: 0.3844
  Step 2000/4330 - Loss: 0.3646
  Step 2500/4330 - Loss: 0.3186
  Step 3000/4330 - Loss: 0.2226
  Step 3500/4330 - Loss: 0.2245
  Step 4000/4330 - Loss: 0.0297
Loss moyen entraînement (epoch 5) : 0.2712
Exact Match dev (epoch 5) : 0.1867


In [None]:
# Directory that receives the model and tokenizer after the continuation run
output_dir = "/content/t5_small_text2sql_5epochs"

# Both objects are written into the same directory, model first
for component in (model, tokenizer):
    component.save_pretrained(output_dir)

print("✅ Modèle et tokenizer T5-small sauvegardés dans :", output_dir)


✅ Modèle et tokenizer T5-small sauvegardés dans : /content/t5_small_text2sql_5epochs


In [None]:
model.eval()  # switch the model to evaluation mode before generating

def show_dev_example(idx, max_length=256):
    """
    Display, for one example of the dev set:
      - the input text (question + schema)
      - the SQL generated by the model
      - the gold SQL (Spider)

    ``idx`` indexes ``dev_pairs``; ``max_length`` is forwarded to
    ``model.generate``.
    """
    record = dev_pairs[idx]
    question_and_schema = record["input"]
    gold_query = record["output"]
    db_name = record.get("db_id", "N/A")  # "N/A" when the pair has no db_id

    print(f"===== EXEMPLE DEV #{idx} =====")
    print(f"DB_ID : {db_name}\n")

    print("----- INPUT T5 -----")
    print(question_and_schema)

    # Build the model input: tokenize (truncated at 512 tokens), then move
    # every tensor to the device
    tokenized = tokenizer(
        question_and_schema,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    on_device = dict((name, value.to(device)) for name, value in tokenized.items())

    # Generation: beam search with 4 beams, without tracking gradients
    with torch.no_grad():
        output_ids = model.generate(
            **on_device,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
        )

    generated_query = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print("\n----- SQL GÉNÉRÉ -----")
    print(generated_query)

    print("\n----- SQL GOLD (Spider) -----")
    print(gold_query)
    print("=" * 70)


In [None]:
# Spot-check a few dev examples after the five epochs of training
for dev_idx in (0, 10, 25, 100):
    show_dev_example(dev_idx)


===== EXEMPLE DEV #0 =====
DB_ID : concert_singer

----- INPUT T5 -----
DB: concert_singer
QUESTION: how many singers do we have

TABLES:
- stadium: stadium id, location, name, capacity, highest, lowest, average
- singer: singer id, name, country, song name, song release year, age, is male
- concert: concert id, concert name, theme, stadium id, year
- singer in concert: concert id, singer id

FOREIGN_KEYS:
- concert.stadium id -> stadium.stadium id
- singer in concert.singer id -> singer.singer id
- singer in concert.concert id -> concert.concert id

ENTITIES:
- TABLE 'singers' -> singer


----- SQL GÉNÉRÉ -----
SELECT count(*) FROM singer

----- SQL GOLD (Spider) -----
SELECT count(*) FROM singer
===== EXEMPLE DEV #10 =====
DB_ID : concert_singer

----- INPUT T5 -----
DB: concert_singer
QUESTION: show all countries and the number of singers in each country

TABLES:
- stadium: stadium id, location, name, capacity, highest, lowest, average
- singer: singer id, name, country, song name, 

“En augmentant le nombre d’époques d’entraînement de 3 à 5 pour T5-small, nous observons une amélioration progressive de l’Exact Match sur le jeu de validation, qui passe de 16.6 % à 18.7 %. La loss d’entraînement continue de diminuer (de 0.40 à 0.27) et l’Exact Match de validation progresse encore : le modèle ne montre donc pas de sur-apprentissage marqué, même si les gains deviennent de plus en plus marginaux. Ces résultats montrent que T5-small exploite mieux notre pipeline schema-aware que BART-base, tout en restant limité par la taille du modèle et la complexité du benchmark Spider.”