# Qwen Text-to-SQL Fine-tune (QLoRA)

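This notebook continues fine-tuning the LoRA adapter `Ellbendls/Qwen-3-4b-Text_to_SQL` (on top of the base model `Qwen/Qwen3-4B-Instruct-2507`) on the local TPC-DS text-to-SQL dataset `research_pipeline/data/data_finetune.csv`.
It loads the base model in 4-bit (QLoRA) and saves the updated adapter locally, under `research_pipeline/qwen_text_to_sql_lora_v1`, for evaluation.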


In [None]:
!git clone https://github.com/VuThanhLam124/Capstone-NLUS-VDD.git

In [None]:
cd Capstone-NLUS-VDD

In [None]:
!pip install -r requirements.txt

In [None]:
# Optional: install or upgrade the fine-tuning dependencies if they are missing (e.g. on Kaggle)
!pip -q install -U "transformers>=4.43" "peft>=0.10" "bitsandbytes>=0.43" "accelerate>=0.30" "trl>=0.12" "datasets>=2.19"


In [None]:
from pathlib import Path
import os
import random
import re
import time
import unicodedata

import duckdb
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
)
from peft import PeftModel, prepare_model_for_kbit_training

# TRL is imported defensively so that the notebook can still be opened when it
# is missing or broken; the trainer cell raises a clear error when SFTTrainer
# is unavailable.
#
# The two names are imported independently: the completion-only collator is an
# optional extra (the trainer cell already copes with it being None), so its
# absence must not also disable SFTTrainer.
# NOTE(review): newer TRL releases may no longer export
# DataCollatorForCompletionOnlyLM - confirm against the installed version.
try:
    from trl import SFTTrainer
except Exception:  # best-effort: any import-time failure means "not available"
    SFTTrainer = None

try:
    from trl import DataCollatorForCompletionOnlyLM
except Exception:  # best-effort: any import-time failure means "not available"
    DataCollatorForCompletionOnlyLM = None

print("torch", torch.__version__)


In [None]:
def find_repo_root(start: Path) -> Path:
    """Return the nearest directory that contains ``research_pipeline``.

    The search begins at ``start`` and walks up through its parents. When no
    candidate contains a ``research_pipeline`` entry, ``start`` itself is
    returned unchanged.
    """
    candidates = (start, *start.parents)
    hit = next(
        (folder for folder in candidates if (folder / "research_pipeline").exists()),
        None,
    )
    return start if hit is None else hit


# Notebook-wide configuration. Every name below is read by later cells.

# Repository root, located by walking up from the current working directory.
REPO_ROOT = find_repo_root(Path.cwd())
# Candidate dataset locations; the first one that exists is used.
DATA_PATHS = [
    REPO_ROOT / "research_pipeline" / "data" / "data_finetune.csv",
    REPO_ROOT / "data" / "data_finetune.csv",
]
# DuckDB database file that the schema (table and column names) is read from.
DB_PATH = REPO_ROOT / "research_pipeline" / "data" / "ecommerce_dw.duckdb"
# Directory the fine-tuned adapter and tokenizer are written to.
OUTPUT_DIR = REPO_ROOT / "research_pipeline" / "qwen_text_to_sql_lora_v1"

# Seed shared by set_seed, the optional subsampling and the dataset shuffle.
SEED = 42
MAX_SAMPLES = None  # set to int for quick debug
# Fraction of the shuffled rows used for training; the remainder is the eval set.
TRAIN_SPLIT = 0.9
# Upper bound on the number of tables included in each prompt's schema snippet.
MAX_TABLES = 8
# Passed to the trainer as max_seq_length.
MAX_SEQ_LEN = 768

# Base model (loaded 4-bit) and the existing adapter that training continues from.
# The tokenizer is also loaded from ADAPTER_ID.
BASE_ID = "Qwen/Qwen3-4B-Instruct-2507"
ADAPTER_ID = "Ellbendls/Qwen-3-4b-Text_to_SQL"

# Optimisation hyper-parameters forwarded to TrainingArguments.
BATCH_SIZE = 1  # per-device batch size for both train and eval
GRAD_ACCUM = 8  # gradient accumulation steps
NUM_EPOCHS = 2
LEARNING_RATE = 2e-4
WARMUP_RATIO = 0.05

set_seed(SEED)


def resolve_data_path(paths):
    """Return the first path in ``paths`` that exists on disk.

    Args:
        paths: Candidate paths, checked in the order given.

    Raises:
        FileNotFoundError: If none of the candidates exists.
    """
    existing = next((candidate for candidate in paths if candidate.exists()), None)
    if existing is None:
        raise FileNotFoundError(f"No data_finetune.csv found in: {paths}")
    return existing


# Pick the dataset file to use and echo the resolved locations for the log.
DATA_PATH = resolve_data_path(DATA_PATHS)
print(f"Using data: {DATA_PATH}")
print(f"DB path: {DB_PATH}")


In [None]:
# Load the question/SQL pairs, drop rows missing either field, and trim
# surrounding whitespace from both text columns.
df = pd.read_csv(DATA_PATH).dropna(subset=["Transcription", "SQL Ground Truth"]).copy()
for column in ("Transcription", "SQL Ground Truth"):
    df[column] = df[column].astype(str).str.strip()

def normalize_sql(sql: str) -> str:
    """Trim surrounding whitespace and guarantee a trailing semicolon.

    Statements that already end with ``;`` are returned as-is (after
    trimming); otherwise a single ``;`` is appended.
    """
    trimmed = sql.strip()
    return trimmed if trimmed.endswith(";") else f"{trimmed};"


# Make every target statement end with a semicolon.
df["SQL Ground Truth"] = df["SQL Ground Truth"].map(normalize_sql)

# Keep one row per distinct SQL statement to reduce overfitting.
df = df.drop_duplicates(subset=["SQL Ground Truth"]).reset_index(drop=True)

# Optional seeded subsample for quick debugging runs.
if MAX_SAMPLES is not None:
    sample_count = min(MAX_SAMPLES, len(df))
    df = df.sample(n=sample_count, random_state=SEED).reset_index(drop=True)

print(f"Rows: {len(df)}")
df.head()


In [None]:
# Read the column names of every table in the DuckDB database into
# schema_map ({table name: [column name, ...]}).
# The connection is read-only and is closed in a `finally` block so it is
# released even when one of the metadata queries fails.
con = duckdb.connect(str(DB_PATH), read_only=True)
try:
    schema_map = {
        table_name: [row[0] for row in con.execute(f"DESCRIBE {table_name}").fetchall()]
        for (table_name,) in con.execute("SHOW TABLES").fetchall()
    }
finally:
    con.close()


def strip_accents(text: str) -> str:
    """Return ``text`` with diacritics removed.

    The text is NFD-decomposed and every combining mark (Unicode category
    ``Mn``) is dropped. The Vietnamese letter d-with-stroke has no canonical
    decomposition, so it is mapped to a plain ``d``/``D`` explicitly;
    otherwise it would survive and break ASCII-only tokenisation of words
    that contain it.
    """
    decomposed = unicodedata.normalize("NFD", text)
    without_marks = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    # U+0111 / U+0110: LATIN SMALL / CAPITAL LETTER D WITH STROKE.
    return without_marks.replace("\u0111", "d").replace("\u0110", "D")


def tokenize(text: str) -> list[str]:
    """Split ``text`` into lowercase, accent-free word tokens.

    Runs of ``[a-z0-9_]`` are extracted from the lowercased, accent-stripped
    text and further split on underscores. Pieces of length one or zero are
    discarded. Token order follows the text.
    """
    normalized = strip_accents(text.lower())
    return [
        piece
        for word in re.findall(r"[a-z0-9_]+", normalized)
        for piece in word.split("_")
        if len(piece) > 1
    ]


# Accent-free Vietnamese keywords mapped to the English term used in the
# schema. Multi-word expressions are stored with their words concatenated
# (e.g. "khachhang" for the two tokens "khach" + "hang").
SYNONYMS = {
    "khach": "customer",
    "khachhang": "customer",
    "sanpham": "item",
    "hang": "item",
    "danhmuc": "category",
    "bang": "state",
    "tinh": "state",
    "cuahang": "store",
    "doanhthu": "revenue",
    "soluong": "quantity",
    "gia": "price",
    "thang": "month",
    "nam": "year",
    "quy": "quarter",
}


def expand_tokens(tokens: list[str]) -> set[str]:
    """Return ``tokens`` as a set, enriched with their SYNONYMS translations.

    Each token is looked up on its own, and every pair of adjacent tokens is
    looked up in concatenated form. The pair lookup is needed because the
    tokenizer emits one token per word, so compound keys such as
    ``"doanhthu"`` would otherwise never match a question containing the
    two separate words.

    Args:
        tokens: Tokens in the order they appeared in the question.

    Returns:
        The original tokens plus every synonym that matched. The result is
        always a superset of the input tokens.
    """
    ordered = list(tokens)
    expanded = set(ordered)

    # Single-word synonyms.
    for tok in ordered:
        mapped = SYNONYMS.get(tok)
        if mapped:
            expanded.add(mapped)

    # Two-word synonyms stored as one concatenated key.
    for left, right in zip(ordered, ordered[1:]):
        mapped = SYNONYMS.get(left + right)
        if mapped:
            expanded.add(mapped)

    return expanded


# For every table, the set of tokens found in its name and its column names.
# select_tables_for_question scores question/table overlap against this index.
table_tokens = {
    table_name: set(tokenize(table_name)).union(*map(tokenize, column_names))
    for table_name, column_names in schema_map.items()
}


def select_tables_for_question(question: str, max_tables: int = 8) -> list[str]:
    """Choose the tables most relevant to ``question``.

    Tables are ranked by how many tokens they share with the expanded
    question tokens; ties are ordered by table name, descending, because
    ``(score, name)`` pairs are sorted in reverse. Tables with no overlap
    are ignored. Keyword rules then append related tables that exist in
    ``schema_map`` and are not selected yet.

    NOTE(review): rule-based tables are appended after the ranked ones, so
    the final cap can cut them off again when the ranked list is already
    ``max_tables`` long - confirm this is intended.

    Args:
        question: Natural-language question.
        max_tables: Maximum number of tables returned. A falsy value means
            no cap on the final list.

    Returns:
        Table names, ranked tables first, then rule-based additions.
    """
    q_tokens = expand_tokens(tokenize(question))

    ranked = sorted(
        ((len(q_tokens & name_tokens), name) for name, name_tokens in table_tokens.items()),
        reverse=True,
    )
    selected = [name for overlap, name in ranked if overlap > 0][:max_tables]

    # (trigger tokens, tables to add) - evaluated top to bottom, so the
    # order of this table determines the order of the appended tables.
    keyword_rules = (
        ({"year", "month", "quarter", "date"}, ("date_dim",)),
        ({"customer"}, ("customer", "customer_address")),
        ({"state"}, ("customer_address", "store")),
        ({"store"}, ("store_sales", "store")),
        ({"web"}, ("web_sales", "web_site")),
        ({"catalog"}, ("catalog_sales", "call_center")),
        ({"call"}, ("call_center",)),
        ({"inventory"}, ("inventory",)),
        ({"item", "product", "category"}, ("item",)),
        ({"sales", "revenue", "quantity", "price"}, ("store_sales",)),
    )
    for triggers, related_tables in keyword_rules:
        if q_tokens.isdisjoint(triggers):
            continue
        for name in related_tables:
            if name in schema_map and name not in selected:
                selected.append(name)

    limit = max_tables if max_tables else len(selected)
    return selected[:limit]


def build_schema_snippet(question: str, max_tables: int = 8) -> str:
    """Render the schema of the tables relevant to ``question`` as text.

    Each table is written as::

        TABLE name (
          column_a
          column_b
        )

    with a blank line between tables. When no table is selected for the
    question, every table in ``schema_map`` is rendered.

    Args:
        question: Natural-language question used to select tables.
        max_tables: Forwarded to ``select_tables_for_question``.

    Returns:
        The multi-line schema description, without leading or trailing
        whitespace.
    """
    tables = select_tables_for_question(question, max_tables=max_tables)
    if not tables:
        tables = list(schema_map.keys())

    lines = []
    for table in tables:
        lines.append(f"TABLE {table} (")
        lines.extend(f"  {col}" for col in schema_map[table])
        lines.append(")")
        lines.append("")  # blank separator line between tables
    # Join with newlines: joining on the empty string would glue all tables
    # and columns onto a single line and make the separators above no-ops.
    return "\n".join(lines).strip()


print(f"Loaded schema for {len(schema_map)} tables")


In [None]:
# System instruction placed at the start of every training example
# (as the "system" chat message, or as the leading text when the tokenizer
# has no chat template).
SYSTEM_PROMPT = (
    "You translate user questions into SQL for DuckDB (TPC-DS). "
    "Return only SQL, no markdown, no explanations. "
    "Use only tables and columns from the schema."
)


def format_record(record, tokenizer) -> str:
    """Turn one dataset row into a single training text.

    The user turn contains the schema snippet for the question, the question
    itself and a trailing ``SQL:`` cue. With a chat template available on
    ``tokenizer`` the system/user/assistant messages are rendered through
    it (no generation prompt appended); otherwise a plain-text layout is
    used with the SQL following the cue.

    Args:
        record: Mapping with the keys ``"Transcription"`` and
            ``"SQL Ground Truth"``.
        tokenizer: Tokenizer whose ``chat_template`` (if any) is applied.

    Returns:
        The formatted example as a string.
    """
    question = record["Transcription"]
    sql = record["SQL Ground Truth"]
    schema_text = build_schema_snippet(question, max_tables=MAX_TABLES)
    user = f"""SCHEMA:
{schema_text}

QUESTION:
{question}

SQL:"""

    has_chat_template = bool(getattr(tokenizer, "chat_template", None))
    if not has_chat_template:
        # Plain-text fallback: system prompt, blank line, prompt, then the SQL.
        return f"""{SYSTEM_PROMPT}

{user} {sql}"""

    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user},
        {"role": "assistant", "content": sql},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )


In [None]:
# The tokenizer is taken from the adapter repository, not from the base model.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID, use_fast=True, trust_remote_code=True)
# Fall back to the EOS token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

dataset = Dataset.from_pandas(df)

# Render each row into a single "text" field; the original columns are dropped.
def map_fn(record):
    return {"text": format_record(record, tokenizer)}

dataset = dataset.map(map_fn, remove_columns=dataset.column_names)
# Seeded shuffle so the train/eval split below is reproducible.
dataset = dataset.shuffle(seed=SEED)

# First TRAIN_SPLIT fraction of the shuffled rows is train, the rest is eval.
train_size = int(len(dataset) * TRAIN_SPLIT)
train_dataset = dataset.select(range(train_size))
eval_dataset = dataset.select(range(train_size, len(dataset)))

print("Train size:", len(train_dataset))
print("Eval size:", len(eval_dataset))


In [None]:
# 4-bit NF4 quantisation with double quantisation; matmuls run in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

config = AutoConfig.from_pretrained(BASE_ID, trust_remote_code=True)

# Load the checkpoint with its own vocabulary size. Changing
# config.vocab_size before loading (together with ignore_mismatched_sizes)
# makes the embedding shapes disagree with the checkpoint, in which case the
# pretrained embedding weights are skipped and freshly initialised instead.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_ID,
    config=config,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
)

# Align the embedding matrix with the tokenizer after loading, which keeps
# the pretrained rows for all existing token ids.
embedding_rows = base_model.get_input_embeddings().weight.shape[0]
if len(tokenizer) != embedding_rows:
    print(f"Adjust vocab_size {embedding_rows} -> {len(tokenizer)}")
    base_model.resize_token_embeddings(len(tokenizer))

# Prepare the quantised model for training, then attach the existing adapter
# in trainable mode so that fine-tuning continues from it.
base_model = prepare_model_for_kbit_training(base_model)
model = PeftModel.from_pretrained(base_model, ADAPTER_ID, is_trainable=True)

model.print_trainable_parameters()
# The KV cache is not needed while training.
model.config.use_cache = False


In [None]:
import inspect

# Fail fast, before any other setup, when TRL could not be imported.
if SFTTrainer is None:
    raise RuntimeError("TRL is required for this notebook. Install trl and restart kernel.")
trainer_cls = SFTTrainer

output_dir = str(OUTPUT_DIR)

# transformers renamed `evaluation_strategy` to `eval_strategy`; use the
# keyword that the installed version actually accepts.
_eval_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    fp16=True,
    logging_steps=20,
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=False,
    report_to="none",
    optim="paged_adamw_8bit",
    **{_eval_kwarg: "steps"},
)

# Marker that precedes the assistant answer in the formatted text; the
# completion-only collator uses it to locate the response.
response_template = "<|im_start|>assistant\n" if getattr(tokenizer, "chat_template", None) else "SQL:"

data_collator = None
if DataCollatorForCompletionOnlyLM is not None:
    data_collator = DataCollatorForCompletionOnlyLM(
        response_template=response_template,
        tokenizer=tokenizer,
    )
else:
    # Make the fallback visible instead of silently changing the setup.
    print("DataCollatorForCompletionOnlyLM is unavailable; using the trainer's default collator.")

# TRL renamed the trainer's `tokenizer` argument to `processing_class`; use
# whichever name the installed SFTTrainer exposes.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(trainer_cls.__init__).parameters
    else "tokenizer"
)

# NOTE(review): newer TRL releases expect dataset_text_field, max_seq_length
# and packing on SFTConfig rather than on SFTTrainer - confirm against the
# installed TRL version.
trainer = trainer_cls(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LEN,
    packing=False,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)


In [None]:
# Run fine-tuning and print the result object returned by the trainer.
train_result = trainer.train()
print(train_result)


In [None]:
# Save the trained adapter together with the tokenizer into OUTPUT_DIR,
# creating the directory first if it does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved adapter to", OUTPUT_DIR)


## Next steps
- Use the saved adapter in `notebooks/text_to_sql_tpcds_exec_benchmark.ipynb` by setting `adapter_id` to the output dir.
- Re-run the benchmark on `test_queries_vi_200_v2.json` to compare execution accuracy.
