In [1]:
! nvidia-smi

Thu Apr 10 15:42:05 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.86                 Driver Version: 551.86         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080      WDDM  |   00000000:01:00.0  On |                  N/A |
| 59%   45C    P8             23W /  350W |     964MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch
print(torch.cuda.is_available())  # Should print True if GPU is available
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 3080


In [3]:
from huggingface_hub import login
from datasets import load_dataset, DatasetDict
import evaluate
from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

login(token="") # place your own token here

In [4]:
codet5 = 'Salesforce/codet5-base' #changed to use CodeT5-base!

tokenizer = AutoTokenizer.from_pretrained(codet5)
model = T5ForConditionalGeneration.from_pretrained(codet5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

name = 'codet5-base-sql-create-context'
path = 'finetuned/codet5-base-sql-create-context'
batch_size = 8

In [5]:
dataset = DatasetDict({ 'train': load_dataset("b-mc2/sql-create-context", split='train[:80%]'),
                            'validation': load_dataset("b-mc2/sql-create-context", split='train[-20%:-10%]'),
                      })

def format_dataset(example):
    return {'input': 'schema: \n' + example['context'][:420] + '\n\ntranslate to SQL: ' + example['question'], 'target': example['answer']}

formatted_dataset = dataset.map(format_dataset, remove_columns=dataset['train'].column_names).shuffle(seed=42) # also shuffles!

In [6]:
formatted_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 62862
    })
    validation: Dataset({
        features: ['input', 'target'],
        num_rows: 7857
    })
})

In [7]:
formatted_dataset['train'][0]

{'input': 'schema: \nCREATE TABLE table_name_42 (decision VARCHAR, series VARCHAR)\n\ntranslate to SQL: Which Decision has a Series of 3 – 3?',
 'target': 'SELECT decision FROM table_name_42 WHERE series = "3 – 3"'}

In [8]:
# map with tokenizer to provide tokenized dataset to the Seq2SeqTrainer
def tokenize_function(example_batch):
    '''use direct tokenizer call, construct encodings dictionary'''
    input_encodings = tokenizer(example_batch['input'], padding='max_length', truncation=True, max_length=256)
    target_encodings = tokenizer(example_batch['target'], padding='max_length', truncation=True, max_length=256)


    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True, remove_columns=formatted_dataset['train'].column_names)

columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']
tokenized_dataset.set_format(type='torch', columns=columns)


In [9]:
# arguments for Seq2SeqTrainer
trainer_args = Seq2SeqTrainingArguments(
    output_dir=path,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub=False
    #fp16=True, 
)

In [10]:
# Metric calculation 
# Exact Match https://huggingface.co/spaces/evaluate-metric/exact_match
# ROUGE2 score https://huggingface.co/spaces/evaluate-metric/rouge
# BLEU score https://huggingface.co/spaces/evaluate-metric/sacrebleu
exact_match = evaluate.load("exact_match")
rouge = evaluate.load("rouge")
sacrebleu = evaluate.load("sacrebleu")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # Decode the predictions and labels
    pred_ids[pred_ids == -100] = tokenizer.pad_token_id
    
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    
    return {
        "exact_match": exact_match.compute(predictions=pred_str, references=label_str)['exact_match'],
        "rouge2": rouge.compute(predictions=pred_str, references=label_str)["rouge2"],
        "bleu": sacrebleu.compute(predictions=pred_str, references=label_str)["score"],
    }

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=trainer_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
) 

In [11]:
#trainer.evaluate()

In [12]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Exact Match,Rouge2,Bleu
1,0.0071,0.004668,0.235968,0.880801,75.189148
2,0.0041,0.003887,0.239277,0.88205,75.406126
3,0.0028,0.003718,0.24475,0.883199,75.586255
4,0.0019,0.003543,0.246914,0.883711,75.712976
5,0.0013,0.003569,0.245132,0.883943,75.678024


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=39290, training_loss=0.0044258481440091445, metrics={'train_runtime': 13060.1177, 'train_samples_per_second': 24.066, 'train_steps_per_second': 3.008, 'total_flos': 9.57007772909568e+16, 'train_loss': 0.0044258481440091445, 'epoch': 5.0})

In [13]:
# store the model and maybe push to huggingface hub?
trainer.save_model()

tokenizer.save_pretrained(path)

trainer.create_model_card()

#trainer.push_to_hub()