# TPC-DS Text-to-SQL Execution Benchmark

This notebook evaluates several text-to-SQL models on a TPC-DS question set stored in DuckDB and reports, per model, the valid-SQL rate, execution success rate, execution accuracy, and exact-match rate.


In [2]:
# Fetch the project checkout; the cells below read requirements.txt and research_pipeline/ from it.
!git clone https://github.com/VuThanhLam124/Capstone-NLUS-VDD.git

Cloning into 'Capstone-NLUS-VDD'...
remote: Enumerating objects: 93, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 93 (delta 35), reused 78 (delta 23), pack-reused 0 (from 0)[K
Receiving objects: 100% (93/93), 341.28 KiB | 11.01 MiB/s, done.
Resolving deltas: 100% (35/35), done.


In [3]:
cd Capstone-NLUS-VDD

/kaggle/working/Capstone-NLUS-VDD


In [4]:
!pip install -r requirements.txt
!pip -q install sqlglot

Collecting duckdb==1.1.3 (from -r requirements.txt (line 1))
  Downloading duckdb-1.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Collecting openai-whisper (from -r requirements.txt (line 2))
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jiwer (from -r requirements.txt (line 3))
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting bitsandbytes (from -r requirements.txt (line 4))
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting edge-tts (from -r requirements.txt (line 15))
  Downloading edge_tts-7.2.7-py3-none-any.whl.metadata (5.5 kB)
Collecting rapidfuzz>=3.9.7 

In [5]:
from pathlib import Path
import json
import os
import random
import time
import re
import gc
import math
from decimal import Decimal
from datetime import date, datetime

import duckdb
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoConfig, BitsAndBytesConfig


def find_repo_root(start: Path) -> Path:
    """Return the nearest directory (``start`` or an ancestor) that contains ``research_pipeline``.

    Falls back to ``start`` itself when no such directory is found.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / "research_pipeline").exists()),
        start,
    )

# --- Paths: everything is derived from the detected repository root. ---
REPO_ROOT = find_repo_root(Path.cwd())
DB_PATH = REPO_ROOT / "research_pipeline" / "data" / "ecommerce_dw.duckdb"
# The primary benchmark is used when present; otherwise the loader falls back to the second file.
PRIMARY_BENCHMARK = REPO_ROOT / "research_pipeline" / "data" / "test_queries_vi_200.json"
FALLBACK_BENCHMARK = REPO_ROOT / "research_pipeline" / "test_queries.json"
OUTPUT_DIR = REPO_ROOT / "research_pipeline"
RUN_ID = None  # set like "run1" or time.strftime("%Y%m%d_%H%M%S")

# --- Model registry: each entry is a spec consumed by load_model_and_tokenizer. ---
# "type" selects the loading path ("seq2seq", "lora_causal", anything else = plain causal LM).
MODEL_CHOICES = {
    "qwen_3_4b_text_to_sql": {
        "type": "lora_causal",
        "adapter_id": "Ellbendls/Qwen-3-4b-Text_to_SQL",
        "base_id": "Qwen/Qwen3-4B-Instruct-2507",
        "tokenizer_id": "Ellbendls/Qwen-3-4b-Text_to_SQL",
        "allow_vocab_shrink": True,
    },
    "t5_small_awesome": {
        "type": "seq2seq",
        "id": "cssupport/t5-small-awesome-text-to-sql",
        "tokenizer_id": "cssupport/t5-small-awesome-text-to-sql",
    },
    "llama3_1_8b_lora": {
        "type": "lora_causal",
        "adapter_id": "philschmid/code-llama-3-1-8b-text-to-sql-lora",
        "base_id": "meta-llama/Meta-Llama-3.1-8B",
        "tokenizer_id": "meta-llama/Meta-Llama-3.1-8B",
    },
}
# Models are benchmarked in this order when RUN_ALL_MODELS is True; otherwise only MODEL_CHOICE runs.
MODEL_ORDER = ["qwen_3_4b_text_to_sql", "t5_small_awesome", "llama3_1_8b_lora"]
RUN_ALL_MODELS = True
MODEL_CHOICE = "qwen_3_4b_text_to_sql"
# When True, a model that raises during its run is reported and skipped instead of aborting.
CONTINUE_ON_ERROR = True

# --- Benchmark / generation knobs. ---
MAX_SAMPLES = 50  # set None to run full benchmark
SAMPLE_SEED = 42
DEFAULT_LIMIT = None  # set to an int to force LIMIT on both GT and generated SQL
MAX_TABLES = None  # set to an int to shorten schema prompt
MAX_NEW_TOKENS = 256
NUM_BEAMS = 1

# --- Hardware: 4-bit quantization is requested whenever CUDA is available. ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
USE_4BIT = torch.cuda.is_available()
print(f"Using device: {DEVICE}")

def make_output_path(stem: str) -> Path:
    """Return the CSV path for ``stem`` inside OUTPUT_DIR, tagged with RUN_ID when one is set."""
    if RUN_ID:
        filename = f"{stem}_{RUN_ID}.csv"
    else:
        filename = f"{stem}.csv"
    return OUTPUT_DIR / filename

# Destination of the combined (all models) results table.
OUTPUT_CSV_ALL = make_output_path("benchmark_text_to_sql_all")


Using device: cuda


In [6]:
# When the DuckDB file is missing: generate it (True) or raise FileNotFoundError (False).
AUTO_SETUP_DB = True
# Passed to setup_tpcds_db as the dsdgen scale factor.
SETUP_SCALE_FACTOR = 1
# Passed to setup_tpcds_db as force_recreate (drop existing tables before regenerating).
FORCE_RECREATE_DB = False

def setup_tpcds_db(db_path: Path, scale_factor: int = 1, force_recreate: bool = False) -> None:
    """Create the TPC-DS tables in the DuckDB file at ``db_path``.

    Installs and loads DuckDB's ``tpcds`` extension, then runs ``dsdgen`` with the
    given scale factor. If tables already exist they are kept (and generation is
    skipped) unless ``force_recreate`` is set, in which case they are dropped first.
    The connection is always closed on exit.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)
    conn = duckdb.connect(str(db_path))
    try:
        for statement in ("INSTALL tpcds;", "LOAD tpcds;"):
            conn.execute(statement)

        existing = [row[0] for row in conn.execute("SHOW TABLES").fetchall()]
        if existing:
            if not force_recreate:
                print(f"Found {len(existing)} tables. Skip generation.")
                return
            # Rebuild requested: clear out whatever is there.
            for table in existing:
                conn.execute(f"DROP TABLE {table}")

        print(f"Generating TPC-DS (sf={scale_factor})...")
        started = time.time()
        conn.execute(f"CALL dsdgen(sf={scale_factor});")
        print(f"Data generation completed in {time.time() - started:.2f}s")
    finally:
        conn.close()

# Build the database when it is missing, or when a rebuild is explicitly requested.
# Checking FORCE_RECREATE_DB here matters: gating only on "file missing" means the
# flag could never take effect once the file exists.
if AUTO_SETUP_DB and (FORCE_RECREATE_DB or not DB_PATH.exists()):
    setup_tpcds_db(DB_PATH, scale_factor=SETUP_SCALE_FACTOR, force_recreate=FORCE_RECREATE_DB)
elif not DB_PATH.exists():
    raise FileNotFoundError(f"TPC-DS DuckDB not found: {DB_PATH}")


Generating TPC-DS (sf=1)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Data generation completed in 29.41s


In [7]:
# Read-only connection shared by the schema, ground-truth and benchmark cells below.
con = duckdb.connect(str(DB_PATH), read_only=True)

def duckdb_schema_prompt(con, *, table_schema: str = "main", max_tables: int | None = None) -> str:
    rows = con.execute(
        """
        SELECT table_name, column_name, data_type, ordinal_position
        FROM information_schema.columns
        WHERE table_schema = ?
        ORDER BY table_name, ordinal_position
        """,
        [table_schema],
    ).fetchall()

    tables: dict[str, list[tuple[str, str]]] = {}
    for table_name, column_name, data_type, _ in rows:
        tables.setdefault(str(table_name), []).append((str(column_name), str(data_type)))

    table_names = sorted(tables.keys())
    if max_tables is not None:
        table_names = table_names[:max_tables]

    lines: list[str] = []
    for t in table_names:
        lines.append(f"TABLE {t} (")
        for col, typ in tables[t]:
            lines.append(f"  {col} {typ}")
        lines.append(")")
        lines.append("")
    return "".join(lines).strip()

# Build the schema description once; it is reused in every prompt.
schema_text = duckdb_schema_prompt(con, max_tables=MAX_TABLES)
# Each table block starts with "TABLE ", so counting that marker gives the table count.
print(f"Schema tables: {schema_text.count('TABLE ')}")


Schema tables: 24


In [8]:
# Pick the benchmark file: primary if present, otherwise the fallback.
if PRIMARY_BENCHMARK.exists():
    benchmark_path = PRIMARY_BENCHMARK
elif FALLBACK_BENCHMARK.exists():
    benchmark_path = FALLBACK_BENCHMARK
else:
    raise FileNotFoundError("No benchmark JSON found.")

# Decode explicitly as UTF-8 so non-ASCII questions load the same on every platform.
raw_items = json.loads(benchmark_path.read_text(encoding="utf-8"))
items = []
for item in raw_items:
    # The question may be stored under either "text" or "question".
    question = item.get("text") or item.get("question")
    sql = item.get("sql")
    if not question or not sql:
        continue  # skip incomplete entries
    items.append({
        "id": item.get("id", f"q{len(items)+1}"),
        "text": question,
        "sql": sql,
    })

if MAX_SAMPLES:
    # A private, seeded generator yields the same sample as seeding the global
    # RNG would, without disturbing random state used elsewhere.
    sampler = random.Random(SAMPLE_SEED)
    items = sampler.sample(items, min(MAX_SAMPLES, len(items)))

print(f"Benchmark items: {len(items)} from {benchmark_path}")


Benchmark items: 5 from /kaggle/working/Capstone-NLUS-VDD/research_pipeline/test_queries.json


In [9]:
# sqlglot is optional: normalize_sql checks `sqlglot is not None` and otherwise
# falls back to whitespace/lowercase normalization.
try:
    import sqlglot
except Exception:
    sqlglot = None
    print("sqlglot not installed: SQL normalization will be simple.")


In [10]:
def load_model_and_tokenizer(spec: dict):
    """Load the tokenizer and model described by a MODEL_CHOICES entry.

    Args:
        spec: Model spec. ``spec["type"]`` selects the path:
            ``"seq2seq"`` loads ``spec["id"]`` as a seq2seq LM;
            ``"lora_causal"`` loads a causal base model and applies the PEFT
            adapter ``spec["adapter_id"]``; any other value loads ``spec["id"]``
            as a plain causal LM. Optional keys: ``tokenizer_id``, ``base_id``,
            ``allow_vocab_shrink``.

    Returns:
        ``(tokenizer, model, model_kind, model_id)`` where ``model_kind`` is
        ``"seq2seq"`` or ``"causal"`` and ``model_id`` is a display string.

    Raises:
        RuntimeError: for a ``meta-llama`` base model when ``HF_TOKEN`` is unset.
    """
    model_type = spec["type"]
    quant_config = BitsAndBytesConfig(load_in_4bit=True) if USE_4BIT else None
    # None means "use the model's own tokenizer"; resolved inside the helper.
    tokenizer_id = spec.get("tokenizer_id")
    allow_shrink = bool(spec.get("allow_vocab_shrink", False))

    def resolve_tokenizer_and_config(model_id: str, tokenizer_id: str | None = None, allow_shrink: bool = False):
        """Load tokenizer + config and align config.vocab_size with the tokenizer size."""
        tokenizer_id = tokenizer_id or model_id
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, use_fast=True, trust_remote_code=True)
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
        n_tokens = len(tokenizer)
        # Growing is always applied; shrinking only when the spec opts in.
        if n_tokens != config.vocab_size and (n_tokens > config.vocab_size or allow_shrink):
            action = "Extending" if n_tokens > config.vocab_size else "Shrinking"
            print(f"{action} vocab_size for {model_id}: {config.vocab_size} -> {n_tokens}")
            config.vocab_size = n_tokens
        return tokenizer, config

    def common_kwargs(config) -> dict:
        """Keyword arguments shared by every from_pretrained call below."""
        return dict(
            config=config,
            device_map="auto" if DEVICE == "cuda" else None,
            dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
            trust_remote_code=True,
        )

    def causal_kwargs(config) -> dict:
        """Extra options for causal LMs: tolerate resized vocab, optional 4-bit quantization."""
        kwargs = common_kwargs(config)
        # NOTE(review): with a changed vocab_size this lets transformers re-initialise
        # the mismatched embedding weights; presumably the adapter supplies them — confirm.
        kwargs["ignore_mismatched_sizes"] = True
        if quant_config is not None:
            kwargs["quantization_config"] = quant_config
        return kwargs

    if model_type == "seq2seq":
        tokenizer, config = resolve_tokenizer_and_config(spec["id"], tokenizer_id, allow_shrink)
        model = AutoModelForSeq2SeqLM.from_pretrained(spec["id"], **common_kwargs(config))
        model_kind = "seq2seq"
        model_id = spec["id"]
    elif model_type == "lora_causal":
        from peft import PeftConfig, PeftModel
        adapter_id = spec["adapter_id"]
        peft_config = PeftConfig.from_pretrained(adapter_id)
        base_id = spec.get("base_id") or peft_config.base_model_name_or_path
        if base_id and "meta-llama" in base_id and not os.environ.get("HF_TOKEN"):
            raise RuntimeError(f"HF_TOKEN not set for gated base model: {base_id}. Set HF_TOKEN or disable this model.")
        tokenizer, config = resolve_tokenizer_and_config(base_id, tokenizer_id, allow_shrink)
        model = AutoModelForCausalLM.from_pretrained(base_id, **causal_kwargs(config))
        model = PeftModel.from_pretrained(model, adapter_id)
        model_kind = "causal"
        model_id = f"{base_id} + {adapter_id}"
    else:
        tokenizer, config = resolve_tokenizer_and_config(spec["id"], tokenizer_id, allow_shrink)
        model = AutoModelForCausalLM.from_pretrained(spec["id"], **causal_kwargs(config))
        model_kind = "causal"
        model_id = spec["id"]

    # generate() needs a pad token; reuse EOS when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.eval()
    return tokenizer, model, model_kind, model_id


In [11]:
# Keywords that disqualify a generated statement in is_safe_select
# (matched case-insensitively as whole words anywhere in the SQL).
_FORBIDDEN_SQL = re.compile(
    r"\b(INSERT|UPDATE|DELETE|DROP|ALTER|CREATE|COPY|PRAGMA|ATTACH|DETACH|EXPORT|IMPORT|CALL)\b",
    re.IGNORECASE,
)

# Instruction text for causal models (sent as the system message when a chat template exists).
SYSTEM_PROMPT = (
    "You translate user questions into SQL for DuckDB (TPC-DS). "
    "Return only SQL, no markdown, no explanations. "
    "Use only tables and columns from the schema."
)
# Single-string prompt for seq2seq models; filled with {question} and {schema}.
SEQ2SEQ_PROMPT_TEMPLATE = "translate to SQL:\n{question}\n\nSCHEMA:\n{schema}\n\nSQL:"

def extract_sql(text: str) -> str:
    """Pull a single SQL statement out of raw model output.

    Unwraps the first fenced code block if present, drops a leading ``sql:``
    label, and keeps only the text before the first semicolon.
    """
    candidate = text.strip()

    fenced = re.search(r"```(?:sql)?\s*(.*?)```", candidate, flags=re.IGNORECASE | re.DOTALL)
    if fenced is not None:
        candidate = fenced.group(1).strip()

    if candidate.lower().startswith("sql:"):
        candidate = candidate[4:].strip()

    # Keep only the first statement; partition leaves the text intact when there is no ';'.
    first_statement, _, _ = candidate.partition(";")
    return first_statement.strip()

def is_safe_select(sql: str) -> bool:
    """Return True only for statements that start with SELECT/WITH and contain no forbidden keyword.

    ``--`` line comments are removed before checking.
    """
    stripped = re.sub(r"--.*?$", "", sql, flags=re.MULTILINE).strip()
    if not stripped or _FORBIDDEN_SQL.search(stripped):
        return False
    leading_word = re.split(r"\s+", stripped, maxsplit=1)[0].upper()
    return leading_word in {"SELECT", "WITH"}

def ensure_limit(sql: str, limit: int | None) -> str:
    if limit is None:
        return sql
    s = sql.strip().rstrip(";").strip()
    if re.search(r"\bLIMIT\b", s, flags=re.IGNORECASE):
        return s
    return f"{s}\nLIMIT {limit}"

def has_order_by(sql: str) -> bool:
    """Tell whether the SQL text contains an ``ORDER BY`` clause (case-insensitive)."""
    found = re.search(r"\border\s+by\b", sql, flags=re.IGNORECASE)
    return bool(found)

def normalize_sql(sql: str) -> str:
    """Canonicalize SQL text for exact-match comparison.

    Uses sqlglot's DuckDB round-trip when the library is available and the
    statement parses; otherwise collapses whitespace and lowercases.
    """
    if sqlglot is not None:
        try:
            parsed = sqlglot.parse_one(sql, read="duckdb")
            return parsed.sql(dialect="duckdb", pretty=False)
        except Exception:
            # Unparseable SQL: fall through to the simple normalization.
            pass
    collapsed = re.sub(r"\s+", " ", sql.strip())
    return collapsed.lower()

def build_prompt(question: str, schema_text: str, tokenizer, model_kind: str) -> str:
    """Assemble the prompt string for one question.

    Seq2seq models get the flat template. Causal models get the schema and
    question as a user message, rendered through the tokenizer's chat template
    when it has one, or prefixed with the system prompt otherwise.
    """
    if model_kind == "seq2seq":
        return SEQ2SEQ_PROMPT_TEMPLATE.format(question=question, schema=schema_text)

    user_message = f"SCHEMA:\n{schema_text}\n\nQUESTION:\n{question}\n\nSQL:"
    if not getattr(tokenizer, "chat_template", None):
        return f"{SYSTEM_PROMPT}\n\n{user_message}"

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_message},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

def generate_sql(question: str, schema_text: str, tokenizer, model, model_kind: str) -> str:
    """Generate SQL for ``question`` with deterministic decoding and return the cleaned statement.

    The decoded text is passed through extract_sql and ensure_limit(DEFAULT_LIMIT).
    """
    prompt = build_prompt(question, schema_text, tokenizer, model_kind)
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)

    # Pad with EOS when available, otherwise with the tokenizer's pad token.
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if eos_id is None else eos_id

    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            num_beams=NUM_BEAMS,
            pad_token_id=pad_id,
        )

    sequence = generated[0]
    if model_kind != "seq2seq":
        # Causal output echoes the prompt; keep only the newly generated tokens.
        prompt_length = encoded["input_ids"].shape[1]
        sequence = sequence[prompt_length:]

    decoded = tokenizer.decode(sequence, skip_special_tokens=True)
    return ensure_limit(extract_sql(decoded), DEFAULT_LIMIT)


In [12]:
def normalize_value(v):
    """Map a result cell to a comparable canonical form.

    Floats and Decimals are rounded to 6 places (Decimals become floats), NaN
    becomes the string ``"nan"`` so it compares equal to itself, and dates and
    datetimes become ISO strings. Anything else is returned unchanged.
    """
    if isinstance(v, float):
        if math.isnan(v):
            return "nan"
        return round(v, 6)
    if isinstance(v, Decimal):
        if v.is_nan():
            # Same canonical marker as float NaN; a NaN float would never equal itself.
            return "nan"
        if not v.is_finite():
            # Infinities cannot be rounded with Decimal arithmetic.
            return float(v)
        return float(round(v, 6))
    if isinstance(v, (datetime, date)):
        return v.isoformat()
    return v


def _value_sort_key(value):
    """Total-order key for one cell: NULLs first, then numbers, strings, everything else."""
    if value is None:
        return (0, 0, "")
    if isinstance(value, (int, float)):
        return (1, value, "")
    if isinstance(value, str):
        return (2, 0, value)
    return (3, 0, repr(value))


def _row_sort_key(row):
    """Sort key for a whole row, built cell by cell."""
    return tuple(_value_sort_key(cell) for cell in row)


def normalize_rows(rows, keep_order: bool):
    """Normalize every cell of ``rows`` and optionally canonicalize the row order.

    Args:
        rows: Iterable of row tuples, or None.
        keep_order: When False, rows are sorted so comparison ignores row order.

    Returns:
        A list of tuples, or None when ``rows`` is None.
    """
    if rows is None:
        return None
    norm = [tuple(normalize_value(x) for x in row) for row in rows]
    if keep_order:
        return norm
    # Plain sorted() raises TypeError as soon as a column mixes None (SQL NULL)
    # or the "nan" marker with other values; the key gives a total order.
    return sorted(norm, key=_row_sort_key)

def run_sql(con, sql: str):
    """Execute ``sql`` and return ``(rows, None)`` on success or ``(None, error_message)`` on failure."""
    try:
        rows = con.execute(sql).fetchall()
    except Exception as exc:
        return None, str(exc)
    return rows, None


In [13]:
# Run every ground-truth query once and keep its rows in both comparison forms,
# so each model is scored against the same cached results.
gt_cache = {}
for item in items:
    qid = item["id"]
    gt_sql = ensure_limit(item["sql"], DEFAULT_LIMIT)
    gt_res, gt_err = run_sql(con, gt_sql)
    gt_ok = gt_err is None
    gt_cache[qid] = dict(
        sql=gt_sql,
        res=gt_res,
        err=gt_err,
        has_order=has_order_by(gt_sql),
        norm_sorted=normalize_rows(gt_res, keep_order=False) if gt_ok else None,
        norm_ordered=normalize_rows(gt_res, keep_order=True) if gt_ok else None,
    )
print(f"Ground-truth cached: {len(gt_cache)}")


Ground-truth cached: 5


In [14]:
def run_benchmark_for_model(model_choice: str):
    """Run every benchmark item through one model and score the generated SQL.

    Args:
        model_choice: Key into MODEL_CHOICES.

    Returns:
        A DataFrame with one row per item: the question, ground-truth and
        generated SQL, validity / exact-match / execution-match flags, error
        messages, and generation / execution timings.

    The model and tokenizer are released (and the CUDA cache cleared) even when
    an item raises, so a failed run does not keep GPU memory away from the next model.
    """
    spec = MODEL_CHOICES[model_choice]
    print(f"Loading model choice: {model_choice}")
    tokenizer, model, model_kind, model_id = load_model_and_tokenizer(spec)

    results = []
    try:
        for idx, item in enumerate(items, 1):
            qid = item["id"]
            question = item["text"]
            gt = gt_cache[qid]

            start = time.time()
            gen_sql = generate_sql(question, schema_text, tokenizer, model, model_kind)
            gen_time = time.time() - start

            # Only statements that pass the read-only check are executed.
            valid_sql = is_safe_select(gen_sql)
            if valid_sql:
                exec_start = time.time()
                gen_res, gen_err = run_sql(con, gen_sql)
                exec_time = time.time() - exec_start
            else:
                gen_res, gen_err, exec_time = None, "INVALID_SQL", None

            executed_ok = valid_sql and gen_err is None

            exact_match = False
            if executed_ok:
                exact_match = normalize_sql(gen_sql) == normalize_sql(gt["sql"])

            exec_match = False
            if executed_ok and gt["err"] is None:
                # Row order is only meaningful when the ground truth asks for one.
                # An ORDER BY that appears only in the generated query must not
                # force a positional comparison against arbitrarily ordered rows.
                keep_order = gt["has_order"]
                gt_norm = gt["norm_ordered"] if keep_order else gt["norm_sorted"]
                gen_norm = normalize_rows(gen_res, keep_order=keep_order)
                exec_match = gt_norm == gen_norm

            results.append({
                "id": qid,
                "question": question,
                "gt_sql": gt["sql"],
                "gen_sql": gen_sql,
                "valid_sql": valid_sql,
                "exact_match": exact_match,
                "exec_match": exec_match,
                "gen_error": gen_err,
                "gt_error": gt["err"],
                "gen_time_sec": gen_time,
                "exec_time_sec": exec_time,
                "model_choice": model_choice,
                "model_id": model_id,
            })

            if idx % 10 == 0:
                print(f"Processed {idx}/{len(items)}")
    finally:
        # Drop our references before collecting so the weights can actually be freed.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return pd.DataFrame(results)


In [15]:
# Benchmark each selected model, save its per-item results, and collect one summary row per model.
model_choices = MODEL_ORDER if RUN_ALL_MODELS else [MODEL_CHOICE]
all_results = []
summary_rows = []

for choice in model_choices:
    try:
        results_df = run_benchmark_for_model(choice)
    except Exception as e:
        print(f"Model {choice} failed: {e}")
        if CONTINUE_ON_ERROR:
            continue
        raise

    if results_df.empty:
        # An empty frame has no columns, so the metric lookups below would raise KeyError.
        print(f"Model {choice} produced no result rows; skipping.")
        continue

    out_path = make_output_path(f"benchmark_text_to_sql_{choice}")
    results_df.to_csv(out_path, index=False)
    all_results.append(results_df)

    valid_mask = results_df["valid_sql"]
    exec_success = results_df["gen_error"].isna()
    # "valid" accuracy: restricted to items whose SQL passed the safety check and
    # whose ground-truth query itself executed.
    valid_exec_mask = valid_mask & results_df["gt_error"].isna()
    exec_acc_all = float(results_df["exec_match"].mean())
    exec_acc_valid = float(results_df.loc[valid_exec_mask, "exec_match"].mean()) if valid_exec_mask.any() else 0.0
    exact_match_rate = float(results_df.loc[valid_mask, "exact_match"].mean()) if valid_mask.any() else 0.0
    exec_times = results_df["exec_time_sec"].dropna()

    summary_rows.append({
        "model_choice": choice,
        "model_id": results_df["model_id"].iloc[0],
        "total": len(results_df),
        "valid_sql_rate": float(valid_mask.mean()),
        "exec_success_rate": float(exec_success.mean()),
        "exec_acc_all": exec_acc_all,
        "exec_acc_valid": exec_acc_valid,
        "exact_match_rate": exact_match_rate,
        "avg_gen_time_sec": float(results_df["gen_time_sec"].mean()),
        "avg_exec_time_sec": float(exec_times.mean()) if not exec_times.empty else 0.0,
        "invalid_sql": int((results_df["gen_error"] == "INVALID_SQL").sum()),
        "gen_exec_errors": int(results_df["gen_error"].notna().sum()),
        "gt_exec_errors": int(results_df["gt_error"].notna().sum()),
        "output_csv": str(out_path),
    })

if not all_results:
    raise RuntimeError("No model results produced.")

combined_df = pd.concat(all_results, ignore_index=True)
combined_df.to_csv(OUTPUT_CSV_ALL, index=False)

summary_df = pd.DataFrame(summary_rows)
print("Summary")
print(summary_df)
print(f"Combined results saved to: {OUTPUT_CSV_ALL}")


Loading model choice: qwen_3_4b_text_to_sql


2025-12-27 07:05:26.724598: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766819126.889665      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766819126.943130      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766819127.345128      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766819127.345171      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766819127.345174      55 computation_placer.cc:177] computation placer alr

adapter_config.json:   0%|          | 0.00/939 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/419 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/196 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Extending vocab_size for Qwen/Qwen3-4B-Instruct-2507: 151936 -> 151669


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Qwen3ForCausalLM were not initialized from the model checkpoint at Qwen/Qwen3-4B-Instruct-2507 and are newly initialized because the shapes did not match:
- model.embed_tokens.weight: found shape torch.Size([151936, 2560]) in the checkpoint and torch.Size([151669, 2560]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading model choice: t5_small_awesome


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Loading model choice: llama3_1_8b_lora


adapter_config.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

Model llama3_1_8b_lora failed: HF_TOKEN not set for gated base model: meta-llama/Meta-Llama-3.1-8B. Set HF_TOKEN or disable this model.
Summary
            model_choice                                           model_id  \
0  qwen_3_4b_text_to_sql  Qwen/Qwen3-4B-Instruct-2507 + Ellbendls/Qwen-3...   
1       t5_small_awesome             cssupport/t5-small-awesome-text-to-sql   

   total  valid_sql_rate  exec_success_rate  exec_acc_all  exec_acc_valid  \
0      5             1.0                0.6           0.6             0.6   
1      5             1.0                0.2           0.0             0.0   

   exact_match_rate  avg_gen_time_sec  avg_exec_time_sec  invalid_sql  \
0               0.0         12.599303           0.012015            0   
1               0.0          1.889598           0.000993            0   

   gen_exec_errors  gt_exec_errors  \
0                2               0   
1                4               0   

                                          output_cs

In [18]:
import pandas as pd

# Read back the file that was just written: OUTPUT_CSV_ALL follows REPO_ROOT and
# RUN_ID, whereas a hard-coded absolute path breaks as soon as either changes.
data = pd.read_csv(OUTPUT_CSV_ALL)
data.head(20)

Unnamed: 0,id,question,gt_sql,gen_sql,valid_sql,exact_match,exec_match,gen_error,gt_error,gen_time_sec,exec_time_sec,model_choice,model_id
0,q1,Find the top 10 electronics items with a price...,"SELECT i_item_id, i_item_desc, i_current_price...","SELECT i_item_sk, i_item_name, SUM(cs_ext_sale...",True,False,False,"Binder Error: Referenced column ""i_item_name"" ...",,15.591726,0.00112,qwen_3_4b_text_to_sql,Qwen/Qwen3-4B-Instruct-2507 + Ellbendls/Qwen-3...
1,q5,Who are the top 3 customers who spent the most...,"SELECT c_last_name, c_first_name, sum(ss_net_p...","SELECT c_customer_sk, SUM(ss_net_paid) as tota...",True,False,True,,,14.406594,0.0065,qwen_3_4b_text_to_sql,Qwen/Qwen3-4B-Instruct-2507 + Ellbendls/Qwen-3...
2,q3,Calculate the total quantity sold and total ne...,"SELECT sum(ss_quantity) as total_quantity, sum...","SELECT SUM(ss_quantity), SUM(ss_net_profit) FR...",True,False,True,,,10.953556,0.002932,qwen_3_4b_text_to_sql,Qwen/Qwen3-4B-Instruct-2507 + Ellbendls/Qwen-3...
3,q2,Which are the top 5 states with the highest nu...,"SELECT ca_state, COUNT(*) as customer_count FR...","SELECT s_state, COUNT(*) as num_customers FROM...",True,False,False,"Binder Error: Referenced column ""s_state"" not ...",,10.569335,0.000695,qwen_3_4b_text_to_sql,Qwen/Qwen3-4B-Instruct-2507 + Ellbendls/Qwen-3...
4,q4,Show me the average sales price for each categ...,"SELECT i_category, AVG(ss_sales_price) as avg_...","SELECT i.i_category, AVG(ss_sales_price) as av...",True,False,True,,,11.475305,0.048827,qwen_3_4b_text_to_sql,Qwen/Qwen3-4B-Instruct-2507 + Ellbendls/Qwen-3...
5,q1,Find the top 10 electronics items with a price...,"SELECT i_item_id, i_item_desc, i_current_price...","SELECT cc_name, MAX(cc_tax_percentage) FROM ca...",True,False,False,"Parser Error: syntax error at or near ""FROM""",,2.734055,0.000275,t5_small_awesome,cssupport/t5-small-awesome-text-to-sql
6,q5,Who are the top 3 customers who spent the most...,"SELECT c_last_name, c_first_name, sum(ss_net_p...","SELECT cc_call_center_name, MAX(cc_start_date_...",True,False,False,Parser Error: syntax error at end of input,,3.076737,0.000239,t5_small_awesome,cssupport/t5-small-awesome-text-to-sql
7,q3,Calculate the total quantity sold and total ne...,"SELECT sum(ss_quantity) as total_quantity, sum...","SELECT cc_county, SUM(null) FROM catalog_retur...",True,False,False,Catalog Error: Table with name cc_call_center_...,,2.733557,0.000778,t5_small_awesome,cssupport/t5-small-awesome-text-to-sql
8,q2,Which are the top 5 states with the highest nu...,"SELECT ca_state, COUNT(*) as customer_count FR...","SELECT cc_county, MAX(cc_county) FROM call_cen...",True,False,False,,,0.532614,0.003034,t5_small_awesome,cssupport/t5-small-awesome-text-to-sql
9,q4,Show me the average sales price for each categ...,"SELECT i_category, AVG(ss_sales_price) as avg_...","SELECT cc_county, AVG(sales) FROM catalog_retu...",True,False,False,"Binder Error: Referenced column ""cc_county"" no...",,0.371027,0.000637,t5_small_awesome,cssupport/t5-small-awesome-text-to-sql
