In [None]:
import os
import json
import torch
import random
from google.colab import drive
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.optim.lr_scheduler import ReduceLROnPlateau

Montar disco

In [None]:
# Mount Google Drive at /content/drive so the files stored there can be opened as local paths.
drive.mount('/content/drive')
# Drive folder from which train_spider.json and dev.json are read later in the notebook.
SPIDER_PATH = "/content/drive/My Drive/spider"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Cargar datos de entrenamiento y validación

In [None]:
def load_data(filepath):
    """Read a JSON file and return the decoded Python object.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (list, dict, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin the encoding so non-ASCII text is decoded the same way regardless of
    # the platform's default locale.
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)

# Load the training split and the dev split (used as validation data) from SPIDER_PATH.
train_data = load_data(f"{SPIDER_PATH}/train_spider.json")
val_data = load_data(f"{SPIDER_PATH}/dev.json")

Preprocesar datos

In [None]:
def preprocess_spider(data):
    """Turn Spider examples into parallel prompt / SQL lists.

    Args:
        data: Iterable of mappings that each provide a ``"question"`` and a
            ``"query"`` entry.

    Returns:
        An ``(inputs, targets)`` tuple of equally long lists: ``inputs[i]`` is the
        question prefixed with ``"Translate to SQL: "`` and ``targets[i]`` is the
        matching query string.
    """
    # One pass over the examples keeps every prompt aligned with its target.
    pairs = [(f"Translate to SQL: {example['question']}", example["query"]) for example in data]
    prompts = [prompt for prompt, _ in pairs]
    sql_targets = [sql for _, sql in pairs]
    return prompts, sql_targets

# Build the prompt / target lists for the training and validation examples.
train_inputs, train_targets = preprocess_spider(train_data)
val_inputs, val_targets = preprocess_spider(val_data)

Tokenizar Datos

In [None]:
# Checkpoint identifier handed to `from_pretrained` for the tokenizer here and for the model later on.
MODEL_NAME = "SwastikM/bart-large-nl2sql"
# Tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_data(inputs, targets, tokenizer, max_input_len=512, max_target_len=128):
    """Tokenize prompts and SQL targets into fixed-length tensors.

    Args:
        inputs: List of source strings.
        targets: List of target strings, aligned with ``inputs``.
        tokenizer: Callable that accepts a list of strings plus the
            ``padding`` / ``truncation`` / ``max_length`` / ``return_tensors``
            keyword arguments and returns an object exposing ``input_ids``.
            Its ``pad_token_id`` attribute, when set, identifies padding.
        max_input_len: Length every source sequence is padded/truncated to.
        max_target_len: Length every target sequence is padded/truncated to.

    Returns:
        ``(model_inputs, labels)`` where ``model_inputs`` is the tokenizer
        output for ``inputs`` and ``labels`` holds the target token ids with
        every padding position replaced by ``-100``.
    """
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=max_input_len, return_tensors="pt")
    labels = tokenizer(targets, padding="max_length", truncation=True, max_length=max_target_len, return_tensors="pt").input_ids
    # -100 is the index ignored by the cross-entropy loss; without this mask the
    # model is also trained (and evaluated) on predicting the padding tokens.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is not None:
        labels = labels.masked_fill(labels == pad_token_id, -100)
    return model_inputs, labels

# Tokenize both splits with the default maximum lengths of tokenize_data.
train_encodings, train_labels = tokenize_data(train_inputs, train_targets, tokenizer)
val_encodings, val_labels = tokenize_data(val_inputs, val_targets, tokenizer)

Dataset Personalizado

In [None]:
class SpiderDataset(Dataset):
    """Map-style dataset that serves pre-tokenized examples as dicts of tensors.

    Args:
        encodings: Mapping from field name to an indexable batch (tensor or
            nested list); must contain ``"input_ids"``.
        labels: Indexable batch of label id sequences aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, i.e. the number of ``input_ids`` rows."""
        return len(self.encodings["input_ids"])

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as an independent tensor.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning on
        every call, so tensors are copied with ``clone().detach()`` instead;
        anything else (lists, scalars) still goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of every encoding field plus ``"labels"``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self._to_tensor(self.labels[idx])
        return item

# Wrap the tokenized splits so the Trainer can index them example by example.
train_dataset = SpiderDataset(train_encodings, train_labels)
val_dataset = SpiderDataset(val_encodings, val_labels)

Cargar Modelo

In [None]:
# Load the pretrained seq2seq weights for the same checkpoint the tokenizer came from.
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

Parametrización del entrenamiento

In [None]:
# Hyperparameters and bookkeeping options for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/bart_nl2sql_finetuned",
    evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Save a checkpoint at the end of every epoch
    learning_rate=1e-5,  # Learning rate chosen for NL2SQL fine-tuning
    per_device_train_batch_size=8,  # Fixed per-device batch size; lower it if the GPU runs out of memory
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # Accumulate 4 batches per optimizer step to emulate a larger batch on a small GPU
    weight_decay=0.05,  # Regularization against overfitting
    warmup_steps=500,  # Learning-rate warmup steps
    logging_dir="/content/drive/My Drive/logs",
    logging_steps=50,  # Log every 50 steps
    save_total_limit=2,  # Keep at most two checkpoints on disk
    num_train_epochs=10,  # Train for up to 10 epochs
    report_to="none",  # Do not report to external tracking services
    load_best_model_at_end=True,  # Reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",  # Metric used to rank checkpoints
    greater_is_better=False,  # Lower loss is better
    save_on_each_node=True,
)




Scheduler para reducir el learning rate

In [None]:
# Build a ReduceLROnPlateau schedule around a freshly created AdamW optimizer.
# NOTE(review): the Trainer constructed in the visible part of this notebook is not
# given `optimizers=...`, so this scheduler and its optimizer appear to be unused
# during `trainer.train()` — confirm, then either wire them in or drop this cell.
scheduler = ReduceLROnPlateau(
    optimizer=torch.optim.AdamW(model.parameters(), lr=1e-5),
    mode='min',
    factor=0.5,  # Halve the learning rate when the monitored value stops improving
    patience=2,  # Wait 2 non-improving steps (epochs, if stepped once per epoch) before reducing the learning rate
    verbose=True
)

Early Stop

In [None]:
# Trainer with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop after 3 evaluations without improvement
)

Entrenador

Aplicar Entrenamiento

In [None]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss
1,1.1538,0.595448
2,0.2033,0.258317
3,0.0955,0.278421
4,0.0619,0.299231
5,0.0482,0.310771


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1095, training_loss=1.034376820255088, metrics={'train_runtime': 1910.553, 'train_samples_per_second': 36.639, 'train_steps_per_second': 1.141, 'total_flos': 3.792433053696e+16, 'train_loss': 1.034376820255088, 'epoch': 5.0})

Guardar Modelo

In [None]:
# Store model weights and tokenizer files side by side so the folder can be reloaded with from_pretrained.
model.save_pretrained("/content/drive/My Drive/bart_nl2sql_finetuned/final_model")
tokenizer.save_pretrained("/content/drive/My Drive/bart_nl2sql_finetuned/final_model")

('/content/drive/My Drive/bart_nl2sql_finetuned/final_model/tokenizer_config.json',
 '/content/drive/My Drive/bart_nl2sql_finetuned/final_model/special_tokens_map.json',
 '/content/drive/My Drive/bart_nl2sql_finetuned/final_model/vocab.json',
 '/content/drive/My Drive/bart_nl2sql_finetuned/final_model/merges.txt',
 '/content/drive/My Drive/bart_nl2sql_finetuned/final_model/added_tokens.json',
 '/content/drive/My Drive/bart_nl2sql_finetuned/final_model/tokenizer.json')

Métricas de Evaluación

In [None]:
# Evaluate on the validation dataset given to the Trainer and print one metric per line.
eval_results = trainer.evaluate()
print("Resultados de Evaluación:")
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name}: {metric_value}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])


Resultados de Evaluación:
eval_loss: 0.2583167850971222
eval_runtime: 17.2539
eval_samples_per_second: 59.929
eval_steps_per_second: 7.535
epoch: 5.0


Prueba de preguntas aleatorias