<a href="https://colab.research.google.com/github/OsvaldoMS1982/TFM-NLP2SQL/blob/Fase-1/Fase1_EntrenamientodeModelosParte1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Montar Drive

In [None]:
from google.colab import drive
import os

# Attach Google Drive to the Colab filesystem.
drive.mount("/content/drive")

# Folder inside the mounted Drive that holds the Spider dataset.
SPIDER_PATH = "/content/drive/My Drive/spider"

# List the folder so a wrong path fails here rather than in a later cell.
spider_files = os.listdir(SPIDER_PATH)
print("Archivos en Spider:", spider_files)


Mounted at /content/drive
Archivos en Spider: ['train_gold.sql', 'dev_gold.sql', 'dev.json', 'train_others.json', 'train_spider.json', 'tables.json', 'README.txt', 'test_tables.json', 'test.json', 'test_gold.sql', '.DS_Store', 'test_database', 'database', 'train_spider_fixed.json']


Cargar los Datos de Spider

In [None]:
import json

# Load the Spider training and validation splits.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (JSON text is UTF-8).
with open(f"{SPIDER_PATH}/train_spider.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

with open(f"{SPIDER_PATH}/dev.json", "r", encoding="utf-8") as f:
    val_data = json.load(f)

# Print one record from each split as a quick look at the record layout.
print("Ejemplo de entrenamiento:", train_data[0])
print("Ejemplo de validación:", val_data[0])


Ejemplo de entrenamiento: {'db_id': 'department_management', 'query': 'SELECT count(*) FROM head WHERE age  >  56', 'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'head', 'WHERE', 'age', '>', '56'], 'query_toks_no_value': ['select', 'count', '(', '*', ')', 'from', 'head', 'where', 'age', '>', 'value'], 'question': 'How many heads of the departments are older than 56 ?', 'question_toks': ['How', 'many', 'heads', 'of', 'the', 'departments', 'are', 'older', 'than', '56', '?'], 'sql': {'from': {'table_units': [['table_unit', 1]], 'conds': []}, 'select': [False, [[3, [0, [0, 0, False], None]]]], 'where': [[False, 3, [0, [0, 10, False], None], 56.0, None]], 'groupBy': [], 'having': [], 'orderBy': [], 'limit': None, 'intersect': None, 'union': None, 'except': None}}
Ejemplo de validación: {'db_id': 'concert_singer', 'query': 'SELECT count(*) FROM singer', 'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'singer'], 'query_toks_no_value': ['select', 'count', '(', '*', ')', 'fr

Preprocesar los Datos para los Modelos

In [None]:
import pandas as pd

# Convert Spider records into (input, target) training pairs
def preprocess_spider(data, prefix="Translate to SQL: "):
    """Build a DataFrame of prompts and gold SQL from Spider records.

    Args:
        data: Iterable of Spider examples. Each example must provide the
            "question" (natural-language text) and "query" (SQL) keys.
        prefix: Instruction text prepended to every question. Keep it equal
            to the prefix used when prompting the model at inference time.

    Returns:
        pandas.DataFrame with two string columns: "input" (prefix followed
        by the question) and "target" (the SQL query), one row per example.

    Raises:
        KeyError: If an example lacks "question" or "query".
    """
    # Materialize once so one-shot iterables (generators) can be read twice.
    records = list(data)
    return pd.DataFrame({
        "input": [f"{prefix}{item['question']}" for item in records],
        "target": [item["query"] for item in records],
    })

# Turn both Spider splits into input/target DataFrames.
train_df, val_df = (preprocess_spider(split) for split in (train_data, val_data))

# Preview the first rows that will be fed to the models.
train_preview = train_df.head()
print("Ejemplo de entrada para T5:", train_preview)


Ejemplo de entrada para T5:                                                input  \
0  Translate to SQL: How many heads of the depart...   
1  Translate to SQL: List the name, born state an...   
2  Translate to SQL: List the creation year, name...   
3  Translate to SQL: What are the maximum and min...   
4  Translate to SQL: What is the average number o...   

                                              target  
0         SELECT count(*) FROM head WHERE age  >  56  
1  SELECT name ,  born_state ,  age FROM head ORD...  
2  SELECT creation ,  name ,  budget_in_billions ...  
3  SELECT max(budget_in_billions) ,  min(budget_i...  
4  SELECT avg(num_employees) FROM department WHER...  


Función para Tokenización

In [None]:
from transformers import AutoTokenizer

def tokenize_data(df, model_name, max_input_length=512, max_target_length=128):
    """Tokenize the "input" and "target" columns with a checkpoint's tokenizer.

    Args:
        df: DataFrame with string columns "input" and "target".
        model_name: Hub id or local path passed to
            ``AutoTokenizer.from_pretrained``.
        max_input_length: Length every encoded input is padded/truncated to.
        max_target_length: Length every encoded target is padded/truncated to.

    Returns:
        Tuple ``(inputs, targets, tokenizer)``: the two tokenizer outputs
        (requested as PyTorch tensors) and the tokenizer that produced them.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Both calls share everything except the length budget.
    shared_kwargs = dict(padding="max_length", truncation=True, return_tensors="pt")
    inputs = tokenizer(df["input"].tolist(), max_length=max_input_length, **shared_kwargs)
    targets = tokenizer(df["target"].tolist(), max_length=max_target_length, **shared_kwargs)
    return inputs, targets, tokenizer


Crear Dataset PyTorch

In [None]:
import torch
from torch.utils.data import Dataset

class SpiderDataset(Dataset):
    """Torch dataset pairing tokenized questions with tokenized SQL labels.

    Args:
        inputs: Mapping with "input_ids" and "attention_mask" entries, each
            indexable by example.
        targets: Mapping with "input_ids" for the target SQL and, when
            available, the matching "attention_mask".
    """

    # Label value skipped by the loss (PyTorch cross-entropy's default
    # ``ignore_index``).
    IGNORE_INDEX = -100

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        """Number of examples, taken from the encoded inputs."""
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        """Return one example as a dict with input_ids, attention_mask, labels.

        Padded label positions are replaced with ``IGNORE_INDEX`` so the
        loss is computed on real SQL tokens only, not on padding.
        """
        # Copy so the shared target tensor is never modified in place.
        labels = torch.as_tensor(self.targets["input_ids"][idx]).clone()
        if "attention_mask" in self.targets:
            # The attention mask marks padding even when the pad id equals
            # another special token id (e.g. pad set to EOS).
            padding = torch.as_tensor(self.targets["attention_mask"][idx]) == 0
            labels[padding] = self.IGNORE_INDEX
        return {
            "input_ids": self.inputs["input_ids"][idx],
            "attention_mask": self.inputs["attention_mask"][idx],
            "labels": labels
        }


Configurar y Entrenar los Modelos

In [None]:
from transformers import AutoModelForSeq2SeqLM, Trainer, TrainingArguments

# Hugging Face Hub checkpoints to fine-tune, keyed by the short name used for
# output and save folders. Commented-out entries are skipped.
models = {
    "T5-LM-Large-text2sql-spider": "gaussalgo/T5-LM-Large-text2sql-spider",
    "BART-LARGE-NL2SQL": "SwastikM/bart-large-nl2sql",
    #"NL2SQL-StarCoder-15B": "gabrielpondc/NL2SQL-StarCoder-15B",
  #  "AutoSQL-nl2sql-8b": "xbrain/AutoSQL-nl2sql-1.0-8b",
    "T5-Small-NL2SQL": "Shritama/t5-small-finetuned-nl2sql"
}

# Train each model in turn
for model_name, model_path in models.items():
    print(f"\n🔵 Entrenando {model_name}...\n")

    # Tokenize both splits with this checkpoint's own tokenizer
    train_inputs, train_targets, tokenizer = tokenize_data(train_df, model_path)
    val_inputs, val_targets, _ = tokenize_data(val_df, model_path)

    # Wrap the tokenized data in datasets
    train_dataset = SpiderDataset(train_inputs, train_targets)
    val_dataset = SpiderDataset(val_inputs, val_targets)

    # Load the seq2seq model from the same checkpoint
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    # Training configuration: evaluate and checkpoint once per epoch.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=f"./{model_name}_spider",
        evaluation_strategy="epoch",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=3e-5,
        weight_decay=0.01,
        num_train_epochs=3,
        logging_dir=f"./logs/{model_name}",
        save_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Start training
    trainer.train()

    # Save the model and its tokenizer to Google Drive; the inference cells
    # load them back from this same folder.
    save_path = f"/content/drive/My Drive/spider_models/{model_name}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"✅ Modelo {model_name} guardado en {save_path}")



🔵 Entrenando T5-LM-Large-text2sql-spider...



tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

KeyboardInterrupt: 

Probar el Modelo Entrenado

In [None]:
# Choose which fine-tuned checkpoint to try out.
model_name = "T5-LM-Large-text2sql-spider"
model_path = "/content/drive/My Drive/spider_models/" + model_name

# Restore the seq2seq weights and the tokenizer from that Drive folder.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Inference helper
def generate_sql(question, max_new_tokens=128):
    """Translate a natural-language question into SQL with the loaded model.

    Uses the notebook-level ``model`` and ``tokenizer`` loaded in this cell.

    Args:
        question: Question text; it is prefixed with "Translate to SQL: ".
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded SQL string with special tokens removed.
    """
    input_text = f"Translate to SQL: {question}"
    inputs = tokenizer(input_text, return_tensors="pt")

    # Pass an explicit budget: without one, generate() uses the default
    # generation length (20 tokens in Transformers unless the checkpoint
    # overrides it), which can cut longer SQL off.
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Try the model on a new question.
# NOTE(review): the question is Spanish while the Spider sample printed above
# is English — confirm this is the intended test.
#question = "¿Cuántos empleados hay en la base de datos?"
question = "cuantos empleados tienen mas de 30 años?"
sql_generated = generate_sql(question)

# Show the question next to the SQL the model produced.
print(f"Pregunta: {question}")
print(f"SQL Generado: {sql_generated}")


Pregunta: cuantos empleados tienen mas de 30 años?
SQL Generado: SELECT count(*) FROM employees WHERE age > 30


In [None]:
# Choose which fine-tuned checkpoint to try out.
model_name = "BART-LARGE-NL2SQL"
model_path = "/content/drive/My Drive/spider_models/" + model_name

# Restore the seq2seq weights and the tokenizer from that Drive folder.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Inference helper
def generate_sql(question, max_new_tokens=128):
    """Translate a natural-language question into SQL with the loaded model.

    Uses the notebook-level ``model`` and ``tokenizer`` loaded in this cell.

    Args:
        question: Question text; it is prefixed with "Translate to SQL: ".
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded SQL string with special tokens removed.
    """
    input_text = f"Translate to SQL: {question}"
    inputs = tokenizer(input_text, return_tensors="pt")

    # Pass an explicit budget: without one, generate() uses the default
    # generation length (20 tokens in Transformers unless the checkpoint
    # overrides it), which can cut longer SQL off.
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Try the model on a new question.
# NOTE(review): the question is Spanish while the Spider sample printed above
# is English — confirm this is the intended test.
#question = "¿Cuántos empleados hay en la base de datos?"
#question = "How many employees are in the database?"
question = "cuantos empleados tienen mas de 30 años?"
sql_generated = generate_sql(question)

# Show the question next to the SQL the model produced.
print(f"Pregunta: {question}")
print(f"SQL Generado: {sql_generated}")




Pregunta: cuantos empleados tienen mas de 30 años?
SQL Generado: 


In [None]:
# Choose which fine-tuned checkpoint to try out.
model_name = "T5-Small-NL2SQL"
model_path = "/content/drive/My Drive/spider_models/" + model_name

# Restore the seq2seq weights and the tokenizer from that Drive folder.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Inference helper
def generate_sql(question, max_new_tokens=128):
    """Translate a natural-language question into SQL with the loaded model.

    Uses the notebook-level ``model`` and ``tokenizer`` loaded in this cell.

    Args:
        question: Question text; it is prefixed with "Translate to SQL: ".
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded SQL string with special tokens removed.
    """
    input_text = f"Translate to SQL: {question}"
    inputs = tokenizer(input_text, return_tensors="pt")

    # Pass an explicit budget: without one, generate() uses the default
    # generation length (20 tokens in Transformers unless the checkpoint
    # overrides it), which can cut longer SQL off.
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Try the model on a new question.
# NOTE(review): the question is Spanish while the Spider sample printed above
# is English — confirm this is the intended test.
#question = "¿Cuántos empleados hay en la base de datos?"
question = "cuantos empleados tienen mas de 30 años?"
sql_generated = generate_sql(question)

# Show the question next to the SQL the model produced.
print(f"Pregunta: {question}")
print(f"SQL Generado: {sql_generated}")


Pregunta: cuantos empleados tienen mas de 30 años?
SQL Generado: SELECT t1.empleados tienen mas de 30 ao



Entrenar T5-SQL



In [None]:
from transformers import AutoModelForSeq2SeqLM , Trainer, TrainingArguments

from transformers import AutoTokenizer

def tokenize_data1(df, model_name, max_input_length=512, max_target_length=128):
    """Tokenize "input"/"target" columns, ensuring the tokenizer can pad.

    Args:
        df: DataFrame with string columns "input" and "target".
        model_name: Hub id or local path passed to
            ``AutoTokenizer.from_pretrained``.
        max_input_length: Length every encoded input is padded/truncated to.
        max_target_length: Length every encoded target is padded/truncated to.

    Returns:
        Tuple ``(inputs, targets, tokenizer)``: the two tokenizer outputs
        (requested as PyTorch tensors) and the tokenizer that produced them.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Fall back to EOS only when the checkpoint defines no pad token.
    # Overwriting an existing pad token would change the padding id and be
    # persisted when this tokenizer is saved.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    shared_kwargs = dict(padding="max_length", truncation=True, return_tensors="pt")
    inputs = tokenizer(df["input"].tolist(), max_length=max_input_length, **shared_kwargs)
    targets = tokenizer(df["target"].tolist(), max_length=max_target_length, **shared_kwargs)
    return inputs, targets, tokenizer


# Hugging Face Hub checkpoints to fine-tune, keyed by the short name used for
# output and save folders. Only the uncommented entry (T5-SQL) is trained here.
models = {
    #"T5-LM-Large-text2sql-spider": "gaussalgo/T5-LM-Large-text2sql-spider",
    #"BART-LARGE-NL2SQL": "SwastikM/bart-large-nl2sql",
    "T5-SQL": "suriya7/t5-base-text-to-sql",
  #  "AutoSQL-nl2sql-8b": "xbrain/AutoSQL-nl2sql-1.0-8b",
    #"T5-Small-NL2SQL": "Shritama/t5-small-finetuned-nl2sql"
}

# Train each model in turn
for model_name, model_path in models.items():
    print(f"\n🔵 Entrenando {model_name}...\n")

    # Tokenize both splits with this checkpoint's own tokenizer
    train_inputs, train_targets, tokenizer = tokenize_data1(train_df, model_path)
    val_inputs, val_targets, _ = tokenize_data1(val_df, model_path)

    # Wrap the tokenized data in datasets
    train_dataset = SpiderDataset(train_inputs, train_targets)
    val_dataset = SpiderDataset(val_inputs, val_targets)

    # Load the seq2seq model from the same checkpoint
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    # Training configuration: evaluate and checkpoint once per epoch.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    # NOTE(review): no `report_to` is set; the recorded run shows a wandb
    # login, presumably from the Trainer's default reporting — confirm.
    training_args = TrainingArguments(
        output_dir=f"./{model_name}_spider",
        evaluation_strategy="epoch",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=3e-5,
        weight_decay=0.01,
        num_train_epochs=3,
        logging_dir=f"./logs/{model_name}",
        save_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Start training
    trainer.train()

    # Save the model and its tokenizer to Google Drive
    save_path = f"/content/drive/My Drive/spider_models/{model_name}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"✅ Modelo {model_name} guardado en {save_path}")



🔵 Entrenando T5-SQL...



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33moms301082[0m ([33moms301082-citi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.4064,0.378555
2,0.3429,0.360241
3,0.3301,0.355897


✅ Modelo T5-SQL guardado en /content/drive/My Drive/spider_models/T5-SQL
