Montar Drive

In [2]:
from google.colab import drive
import os

# Montar Google Drive
drive.mount('/content/drive')

# Definir la ruta del dataset Spider en Google Drive
SPIDER_PATH = "/content/drive/My Drive/spider"

# Verificar que los archivos están en la ubicación correcta
print("Archivos en Spider:", os.listdir(SPIDER_PATH))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archivos en Spider: ['train_gold.sql', 'dev_gold.sql', 'dev.json', 'train_others.json', 'train_spider.json', 'tables.json', 'README.txt', 'test_tables.json', 'test.json', 'test_gold.sql', '.DS_Store', 'test_database', 'database', 'train_spider_fixed.json']


Cargar los Datos de Spider

In [3]:
import json

# Cargar datos de entrenamiento y validación
with open(f"{SPIDER_PATH}/train_spider.json", "r") as f:
    train_data = json.load(f)

with open(f"{SPIDER_PATH}/dev.json", "r") as f:
    val_data = json.load(f)

print("Ejemplo de entrenamiento:", train_data[0])
print("Ejemplo de validación:", val_data[0])


Ejemplo de entrenamiento: {'db_id': 'department_management', 'query': 'SELECT count(*) FROM head WHERE age  >  56', 'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'head', 'WHERE', 'age', '>', '56'], 'query_toks_no_value': ['select', 'count', '(', '*', ')', 'from', 'head', 'where', 'age', '>', 'value'], 'question': 'How many heads of the departments are older than 56 ?', 'question_toks': ['How', 'many', 'heads', 'of', 'the', 'departments', 'are', 'older', 'than', '56', '?'], 'sql': {'from': {'table_units': [['table_unit', 1]], 'conds': []}, 'select': [False, [[3, [0, [0, 0, False], None]]]], 'where': [[False, 3, [0, [0, 10, False], None], 56.0, None]], 'groupBy': [], 'having': [], 'orderBy': [], 'limit': None, 'intersect': None, 'union': None, 'except': None}}
Ejemplo de validación: {'db_id': 'concert_singer', 'query': 'SELECT count(*) FROM singer', 'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'singer'], 'query_toks_no_value': ['select', 'count', '(', '*', ')', 'fr

Preprocesar los Datos para los Modelos

In [4]:
import pandas as pd

# Función para convertir Spider a formato de entrenamiento
def preprocess_spider(data):
    inputs = []
    targets = []

    for item in data:
        question = item["question"]  # Pregunta en lenguaje natural
        sql_query = item["query"]  # SQL correspondiente

        # Formato de entrada para los modelos
        inputs.append(f"Translate to SQL: {question}")
        targets.append(sql_query)

    return pd.DataFrame({"input": inputs, "target": targets})

# Convertir datos de entrenamiento y validación
train_df = preprocess_spider(train_data)
val_df = preprocess_spider(val_data)

print("Ejemplo de entrada para T5:", train_df.head())


Ejemplo de entrada para T5:                                                input  \
0  Translate to SQL: How many heads of the depart...   
1  Translate to SQL: List the name, born state an...   
2  Translate to SQL: List the creation year, name...   
3  Translate to SQL: What are the maximum and min...   
4  Translate to SQL: What is the average number o...   

                                              target  
0         SELECT count(*) FROM head WHERE age  >  56  
1  SELECT name ,  born_state ,  age FROM head ORD...  
2  SELECT creation ,  name ,  budget_in_billions ...  
3  SELECT max(budget_in_billions) ,  min(budget_i...  
4  SELECT avg(num_employees) FROM department WHER...  


Función para Tokenización

In [5]:
from transformers import AutoTokenizer

def tokenize_data(df, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(df["input"].tolist(), padding="max_length", truncation=True, return_tensors="pt", max_length=512)
    targets = tokenizer(df["target"].tolist(), padding="max_length", truncation=True, return_tensors="pt", max_length=128)
    return inputs, targets, tokenizer


Crear Dataset PyTorch

In [6]:
import torch
from torch.utils.data import Dataset

class SpiderDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": self.targets["input_ids"][idx]
        }


Configurar y Entrenar los Modelos T5-SQL

In [7]:
from transformers import AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback


# Modelos en Hugging Face
models = {
    "T5-SQL": "suriya7/t5-base-text-to-sql",
    #"BART-LARGE-NL2SQL": "SwastikM/bart-large-nl2sql",
    #"NL2SQL-StarCoder-15B": "gabrielpondc/NL2SQL-StarCoder-15B",
  #  "AutoSQL-nl2sql-8b": "xbrain/AutoSQL-nl2sql-1.0-8b",
   # "T5-Small-NL2SQL": "Shritama/t5-small-finetuned-nl2sql"
}

# Entrenar cada modelo
for model_name, model_path in models.items():
    print(f"\n🔵 Entrenando {model_name}...\n")

    # Tokenizar datos
    train_inputs, train_targets, tokenizer = tokenize_data(train_df, model_path)
    val_inputs, val_targets, _ = tokenize_data(val_df, model_path)

    # Crear dataset
    train_dataset = SpiderDataset(train_inputs, train_targets)
    val_dataset = SpiderDataset(val_inputs, val_targets)

    # Cargar modelo
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    # Configurar entrenamiento
    training_args = TrainingArguments(
        #Fine Tune: se agrega fp16=True para reducir memoria y mejorar el entrenamiento
        #fp16=True,
        output_dir=f"./{model_name}_spider",
        evaluation_strategy="epoch",
        #Fine Tune: Incrementamos el trainin batch y eval batch de 4 a 8 para mejorar el entrenamiento
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        #Fine Tune reducimos el learning_rate=3e-5,para hacerlo mas estable
        learning_rate=1e-5,
        weight_decay=0.01,
        load_best_model_at_end=True, #Requerido para Early Stopping
        #Aumentamos cantidad de epocas de 3 a 5, ahora incrementamos a 7
        num_train_epochs=7,
        logging_dir=f"./logs/{model_name}",
        save_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        #optimizers=(trainer.optimizer, lr_scheduler)
    )

    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))


    # Iniciar entrenamiento
    trainer.train()

    # Guardar modelo en Google Drive
    save_path = f"/content/drive/My Drive/spider_models_fine_Tuned/{model_name}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"✅ Modelo {model_name} guardado en {save_path}")



🔵 Entrenando T5-SQL...



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33moms301082[0m ([33moms301082-citi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.1741,0.53334
2,0.5512,0.440995
3,0.4712,0.420308
4,0.4426,0.408359
5,0.424,0.402459
6,0.4157,0.39895
7,0.409,0.39752


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Modelo T5-SQL guardado en /content/drive/My Drive/spider_models_fine_Tuned/T5-SQL


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
from transformers import AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback


# Modelos en Hugging Face
models = {
    "T5-SQL": "suriya7/t5-base-text-to-sql",
    #"BART-LARGE-NL2SQL": "SwastikM/bart-large-nl2sql",
    #"NL2SQL-StarCoder-15B": "gabrielpondc/NL2SQL-StarCoder-15B",
  #  "AutoSQL-nl2sql-8b": "xbrain/AutoSQL-nl2sql-1.0-8b",
   # "T5-Small-NL2SQL": "Shritama/t5-small-finetuned-nl2sql"
}

# Entrenar cada modelo
for model_name, model_path in models.items():
    print(f"\n🔵 Entrenando {model_name}...\n")

    # Tokenizar datos
    train_inputs, train_targets, tokenizer = tokenize_data(train_df, model_path)
    val_inputs, val_targets, _ = tokenize_data(val_df, model_path)

    # Crear dataset
    train_dataset = SpiderDataset(train_inputs, train_targets)
    val_dataset = SpiderDataset(val_inputs, val_targets)

    # Cargar modelo
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    # Configurar entrenamiento
    training_args = TrainingArguments(
        #Fine Tune: se agrega fp16=True para reducir memoria y mejorar el entrenamiento
        #fp16=True,
        output_dir=f"./{model_name}_spider",
        evaluation_strategy="epoch",
        #Fine Tune: Incrementamos el trainin batch y eval batch de 4 a 8 para mejorar el entrenamiento
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        #Fine Tune reducimos el learning_rate=3e-5,para hacerlo mas estable
        learning_rate=1e-5,
        weight_decay=0.01,
        load_best_model_at_end=True, #Requerido para Early Stopping
        #Aumentamos cantidad de epocas a 15
        num_train_epochs=15,
        logging_dir=f"./logs/{model_name}",
        save_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        #optimizers=(trainer.optimizer, lr_scheduler)
    )

    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))


    # Iniciar entrenamiento
    trainer.train()

    # Guardar modelo en Google Drive
    save_path = f"/content/drive/My Drive/spider_models_fine_Tuned/{model_name}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"✅ Modelo {model_name} guardado en {save_path}")



🔵 Entrenando T5-SQL...





Epoch,Training Loss,Validation Loss
1,2.1622,0.523668
2,0.542,0.435735
3,0.4565,0.412799
4,0.4215,0.398143
5,0.3998,0.387921
6,0.3836,0.381333
7,0.3669,0.375353
8,0.3516,0.371232
9,0.3445,0.368995
10,0.3401,0.366935


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


✅ Modelo T5-SQL guardado en /content/drive/My Drive/spider_models_fine_Tuned/T5-SQL


Probar el Modelo Entrenado

In [10]:
!pip install sacrebleu datasets torch tabulate

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=

In [12]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# 📌 Lista de modelos a evaluar
models_to_test = {

    "T5-SQL": "/content/drive/MyDrive/spider_models_fine_Tuned/T5-SQL",

}

# 📌 Diccionario para almacenar modelos y tokenizers
models_dict = {}

for model_name, model_path in models_to_test.items():
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        models_dict[model_name] = {"model": model, "tokenizer": tokenizer}
        print(f"✅ {model_name} cargado correctamente.")
    except Exception as e:
        print(f"❌ Error al cargar {model_name}: {e}")


✅ T5-SQL cargado correctamente.


Probar el Modelo

In [14]:
# Seleccionar un modelo entrenado
model_name = "T5-SQL"
model_path = f"/content/drive/My Drive/spider_models_fine_Tuned/{model_name}"

# Cargar modelo y tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Función de inferencia
def generate_sql(question):
    input_text = f"Translate to SQL: {question}"
    inputs = tokenizer(input_text, return_tensors="pt")

    # Generar SQL
    output = model.generate(**inputs)
    sql_query = tokenizer.decode(output[0], skip_special_tokens=True)
    return sql_query

# Prueba con una pregunta nueva
#question = "¿Cuántos empleados hay en la base de datos?"
question = "count of employees with age > 30?"
sql_generated = generate_sql(question)

print("Pregunta:", question)
print("SQL Generado:", sql_generated)


Pregunta: count of employees with age > 30?
SQL Generado: SELECT count(*) FROM Employee WHERE age > 30


Generar Predicciones SQL para Validación

In [15]:
import json

SPIDER_PATH = "/content/drive/My Drive/spider"

# 📌 Cargar datos de validación de Spider
with open(f"{SPIDER_PATH}/dev.json", "r") as f:
    val_data = json.load(f)

# 📌 Extraer preguntas y sus SQL correctos
questions = [item["question"] for item in val_data]
true_sql = [item["query"] for item in val_data]

# 📌 Función para generar SQL con cada modelo
def generate_sql(model, tokenizer, question):
    input_text = f"Translate to SQL: {question}"
    inputs = tokenizer(input_text, return_tensors="pt")

    output = model.generate(
        **inputs, max_length=128, temperature=0.7, top_p=0.9, num_return_sequences=1, repetition_penalty=1.2
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# 📌 Generar consultas SQL con cada modelo
predictions = {}
for model_name, data in models_dict.items():
    print(f"\n🚀 Generando SQL con {model_name}...\n")
    model = data["model"]
    tokenizer = data["tokenizer"]
    predictions[model_name] = [generate_sql(model, tokenizer, q) for q in questions]



🚀 Generando SQL con T5-SQL...





Evaluar Modelo

In [16]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [18]:
import evaluate
import sqlite3
from tabulate import tabulate

# 📌 Cargar métricas BLEU
bleu_metric = evaluate.load("sacrebleu")

# 📌 Función para calcular Exact Match
def exact_match(pred, true):
    return int(pred.strip().lower() == true.strip().lower())

# 📌 Función para ejecutar consultas SQL en SQLite
def execute_sql(query, conn):
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        return cursor.fetchall()
    except Exception as e:
        return None  # Si la consulta es inválida, devuelve None

# 📌 Función para calcular Execution Accuracy
def execution_accuracy(predictions, references, conn):
    correct = sum(1 for pred, ref in zip(predictions, references) if execute_sql(pred, conn) == execute_sql(ref, conn))
    return (correct / len(references)) * 100

# 📌 Crear una base de datos en memoria para evaluar Execution Accuracy
conn = sqlite3.connect(":memory:")
cursor = conn.cursor()

# 🚀 Simulación de una base de datos (Asegúrate de usar una DB real en producción)
cursor.execute("CREATE TABLE employees (id INTEGER, name TEXT, salary INTEGER)")
cursor.executemany("INSERT INTO employees (id, name, salary) VALUES (?, ?, ?)", [
    (1, "Alice", 50000),
    (2, "Bob", 60000),
    (3, "Charlie", 70000)
])
conn.commit()

# 📌 Evaluar cada modelo
results = []

for model_name in models_dict.keys():
    pred_sql = predictions[model_name]

    # 📌 Calcular BLEU Score
    bleu_score = bleu_metric.compute(predictions=pred_sql, references=[[x] for x in true_sql])["score"]

    # 📌 Calcular Exact Match (EM)
    em_score = sum(exact_match(pred_sql[i], true_sql[i]) for i in range(len(true_sql))) / len(true_sql)

    # 📌 Calcular Execution Accuracy
    exec_acc = execution_accuracy(pred_sql, true_sql, conn)

    results.append([model_name, round(bleu_score, 2), round(em_score * 100, 2), round(exec_acc, 2)])

# 📌 Mostrar Resultados en Tabla
print("\n📊 **Comparación de Métricas**\n")
print(tabulate(results, headers=["Modelo", "BLEU Score", "Exact Match (%)", "Execution Accuracy (%)"], tablefmt="grid"))

# 📌 Cerrar la conexión con la base de datos
conn.close()



📊 **Comparación de Métricas**

+----------+--------------+-------------------+--------------------------+
| Modelo   |   BLEU Score |   Exact Match (%) |   Execution Accuracy (%) |
| T5-SQL   |        27.51 |              3.09 |                    99.81 |
+----------+--------------+-------------------+--------------------------+



Métrica	Valor	Interpretación
BLEU Score	27.51	Decente, pero aún puede mejorar.

Exact Match (%)	3.09%	Muy bajo, pero no es preocupante si Execution Accuracy es alto.

Execution Accuracy (%)	99.81%	Excelente, el modelo genera consultas funcionales.