# SQL Ground Truth Validation

This notebook validates that every SQL in `data/data_finetune.csv` executes on the TPC-DS DuckDB and returns a result.


In [24]:
from pathlib import Path
import time
import re

import duckdb
import pandas as pd


def find_repo_root(start: Path) -> Path:
    for p in [start] + list(start.parents):
        if (p / "research_pipeline").exists():
            return p
    return start


REPO_ROOT = find_repo_root(Path.cwd())
CSV_PATH = REPO_ROOT / "research_pipeline" / "data" / "data_finetune.csv"
DB_PATH = REPO_ROOT / "research_pipeline" / "data" / "ecommerce_dw.duckdb"
OUTPUT_PATH = REPO_ROOT / "research_pipeline" / "sql_ground_truth_validation.csv"

AUTO_SETUP_DB = True
SETUP_SCALE_FACTOR = 1
FORCE_RECREATE_DB = False

FETCH_LIMIT = 50
ALLOW_MUTATION = False

print(f"Repo root: {REPO_ROOT}")
print(f"CSV path: {CSV_PATH}")
print(f"DB path: {DB_PATH}")


Repo root: /home/ubuntu/DataScience/Capstone-NLUS-VDD
CSV path: /home/ubuntu/DataScience/Capstone-NLUS-VDD/research_pipeline/data/data_finetune.csv
DB path: /home/ubuntu/DataScience/Capstone-NLUS-VDD/research_pipeline/data/ecommerce_dw.duckdb


In [25]:
def setup_tpcds_db(db_path: Path, scale_factor: int = 1, force_recreate: bool = False) -> None:
    db_path.parent.mkdir(parents=True, exist_ok=True)
    con = duckdb.connect(str(db_path))
    try:
        con.execute("INSTALL tpcds;")
        con.execute("LOAD tpcds;")

        tables = [r[0] for r in con.execute("SHOW TABLES").fetchall()]
        if tables and not force_recreate:
            print(f"Found {len(tables)} tables. Skip generation.")
            return

        if force_recreate and tables:
            for t in tables:
                con.execute(f"DROP TABLE {t}")

        print(f"Generating TPC-DS (sf={scale_factor})...")
        start = time.time()
        con.execute(f"CALL dsdgen(sf={scale_factor});")
        print(f"Data generation completed in {time.time() - start:.2f}s")
    finally:
        con.close()


if not DB_PATH.exists():
    if AUTO_SETUP_DB:
        setup_tpcds_db(DB_PATH, scale_factor=SETUP_SCALE_FACTOR, force_recreate=FORCE_RECREATE_DB)
    else:
        raise FileNotFoundError(f"TPC-DS DuckDB not found: {DB_PATH}")


In [26]:
df = pd.read_csv(CSV_PATH)
required_cols = {"ID", "Transcription", "SQL Ground Truth"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing columns: {sorted(missing)}")

print(f"Loaded {len(df)} rows.")
df.head()


Loaded 900 rows.


Unnamed: 0,ID,Transcription,SQL Ground Truth
0,1,Lấy thông tin định danh và họ tên khách hàng,"SELECT c_customer_id, c_first_name, c_last_nam..."
1,2,Tìm tất cả khách hàng đến từ Việt Nam,SELECT * FROM customer \nWHERE c_birth_country...
2,3,Đếm tổng số lượng khách hàng trong bảng,SELECT COUNT(*) AS total_customers \nFROM cust...
3,4,Lọc danh sách khách hàng thân thiết,"SELECT c_first_name, c_last_name, c_email_addr..."
4,5,Lấy ra danh sách các khách hàng hiện tại không...,"SELECT c_first_name, c_last_name, c_email_addr..."


In [27]:
DML_PATTERN = re.compile(r"\b(create|insert|update|delete|drop|alter|copy|export)\b", re.IGNORECASE)


def normalize_sql(value) -> str | None:
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return None
    sql = str(value).strip()
    if not sql:
        return None
    while sql.endswith(";"):
        sql = sql[:-1].strip()
    return sql


def is_read_only(sql: str) -> bool:
    return DML_PATTERN.search(sql) is None


def execute_sql(con: duckdb.DuckDBPyConnection, sql: str) -> dict:
    start = time.time()
    try:
        cur = con.execute(sql)
        rows = cur.fetchmany(FETCH_LIMIT)
        cols = [d[0] for d in cur.description] if cur.description else []
        return {
            "exec_ok": True,
            "has_rows": len(rows) > 0,
            "row_count_sample": len(rows),
            "col_count": len(cols),
            "error_type": None,
            "error_message": None,
            "exec_time_sec": time.time() - start,
        }
    except Exception as exc:
        message = str(exc)
        error_type = message.split(":", 1)[0] if ":" in message else type(exc).__name__
        return {
            "exec_ok": False,
            "has_rows": False,
            "row_count_sample": 0,
            "col_count": 0,
            "error_type": error_type,
            "error_message": message,
            "exec_time_sec": time.time() - start,
        }


results = []
con = duckdb.connect(str(DB_PATH), read_only=True)
try:
    iterator = df.iterrows()
    try:
        from tqdm.auto import tqdm
        iterator = tqdm(iterator, total=len(df))
    except Exception:
        pass

    for _, row in iterator:
        sql_raw = row["SQL Ground Truth"]
        sql_clean = normalize_sql(sql_raw)

        record = {
            "id": row["ID"],
            "question": row["Transcription"],
            "sql_raw": sql_raw,
            "sql_clean": sql_clean,
        }

        if not sql_clean:
            record.update({
                "exec_ok": False,
                "has_rows": False,
                "row_count_sample": 0,
                "col_count": 0,
                "error_type": "EMPTY_SQL",
                "error_message": "Missing SQL Ground Truth",
                "exec_time_sec": 0.0,
            })
            results.append(record)
            continue

        if not ALLOW_MUTATION and not is_read_only(sql_clean):
            record.update({
                "exec_ok": False,
                "has_rows": False,
                "row_count_sample": 0,
                "col_count": 0,
                "error_type": "NON_READ_ONLY",
                "error_message": "Mutation statements are not allowed",
                "exec_time_sec": 0.0,
            })
            results.append(record)
            continue

        record.update(execute_sql(con, sql_clean))
        results.append(record)
finally:
    con.close()

results_df = pd.DataFrame(results)
results_df.head()


100%|██████████| 900/900 [00:12<00:00, 74.27it/s] 


Unnamed: 0,id,question,sql_raw,sql_clean,exec_ok,has_rows,row_count_sample,col_count,error_type,error_message,exec_time_sec
0,1,Lấy thông tin định danh và họ tên khách hàng,"SELECT c_customer_id, c_first_name, c_last_nam...","SELECT c_customer_id, c_first_name, c_last_nam...",True,True,50,3,,,0.002391
1,2,Tìm tất cả khách hàng đến từ Việt Nam,SELECT * FROM customer \nWHERE c_birth_country...,SELECT * FROM customer \nWHERE c_birth_country...,True,False,0,18,,,0.004563
2,3,Đếm tổng số lượng khách hàng trong bảng,SELECT COUNT(*) AS total_customers \nFROM cust...,SELECT COUNT(*) AS total_customers \nFROM cust...,True,True,1,1,,,0.001747
3,4,Lọc danh sách khách hàng thân thiết,"SELECT c_first_name, c_last_name, c_email_addr...","SELECT c_first_name, c_last_name, c_email_addr...",True,True,50,3,,,0.007997
4,5,Lấy ra danh sách các khách hàng hiện tại không...,"SELECT c_first_name, c_last_name, c_email_addr...","SELECT c_first_name, c_last_name, c_email_addr...",True,True,50,3,,,0.004736


In [28]:
total = len(results_df)
exec_ok = results_df["exec_ok"].sum()
non_empty = results_df["has_rows"].sum()
empty_sql = (results_df["error_type"] == "EMPTY_SQL").sum()

print(f"Total queries: {total}")
print(f"Exec OK: {exec_ok} ({exec_ok / total:.2%})")
print(f"Has rows (sample): {non_empty} ({non_empty / total:.2%})")
print(f"Empty SQL: {empty_sql}")

error_counts = results_df.loc[~results_df["exec_ok"], "error_type"].value_counts()
error_counts


Total queries: 900
Exec OK: 898 (99.78%)
Has rows (sample): 787 (87.44%)
Empty SQL: 0


error_type
Parser Error    1
Binder Error    1
Name: count, dtype: int64

## Analysis

Review error rows, empty results, SQL patterns, and table coverage.


In [29]:
errors_df = results_df.loc[~results_df["exec_ok"], ["id", "question", "sql_clean", "error_type", "error_message"]]
errors_df


Unnamed: 0,id,question,sql_clean,error_type,error_message
11,12,Cho tôi xem 10 khách hàng trẻ tuổi nhất,SELECT TOP 10 * FROM customer ORDER BY c_birth...,Parser Error,"Parser Error: syntax error at or near ""10""\n\n..."
69,70,Những khách hàng nào sống ở cùng thành phố với...,"SELECT DISTINCT c.c_first_name, ca.ca_city\r\n...",Binder Error,"Binder Error: Table ""ss"" does not have a colum..."


In [30]:
empty_df = results_df.loc[(results_df["exec_ok"]) & (~results_df["has_rows"]), ["id", "question", "sql_clean"]]
empty_df.head(20)


Unnamed: 0,id,question,sql_clean
1,2,Tìm tất cả khách hàng đến từ Việt Nam,SELECT * FROM customer \nWHERE c_birth_country...
10,11,Tìm những người sinh vào năm 2002,SELECT * FROM customer WHERE c_birth_year = 2002
18,19,Tôi cần tìm các khách hàng nam có danh xưng là...,SELECT * FROM customer WHERE c_salutation = 'M...
19,20,Bạn có thể giúp tôi lọc ra và hiển thị toàn bộ...,SELECT * FROM customer WHERE c_first_name = 'N...
21,22,Liệt kê các khách hàng có danh xưng là tiến sĩ...,SELECT * FROM customer WHERE c_salutation = 'D...
22,23,Lấy địa chỉ email của những khách hàng nào đan...,SELECT c_email_address FROM customer WHERE c_l...
27,28,Hiển thị chi tiết thông tin của người dùng hiệ...,SELECT * FROM customer WHERE c_login = 'user15'
28,29,Tìm tên các sản phẩm thuộc danh mục Fashion có...,"SELECT i_product_name, i_current_price \nFROM ..."
30,31,Hiển thị tên khách hàng ở New York đã từng mua...,"SELECT DISTINCT c_first_name, c_last_name \nFR..."
31,32,Liệt kê các khách hàng ưu tiên từ nước Anh đã ...,"SELECT DISTINCT c_first_name, c_last_name \nFR..."


In [31]:
sql_series = results_df["sql_clean"].fillna("")
features = pd.DataFrame({
    "sql_len_chars": sql_series.str.len(),
    "sql_len_tokens": sql_series.str.split().str.len(),
    "has_join": sql_series.str.contains(r"\bjoin\b", case=False, regex=True),
    "has_group_by": sql_series.str.contains(r"\bgroup\s+by\b", case=False, regex=True),
    "has_order_by": sql_series.str.contains(r"\border\s+by\b", case=False, regex=True),
    "has_limit": sql_series.str.contains(r"\blimit\b", case=False, regex=True),
})
features.describe()


Unnamed: 0,sql_len_chars,sql_len_tokens
count,900.0,900.0
mean,496.085556,59.45
std,330.347867,40.987786
min,44.0,5.0
25%,290.5,36.0
50%,417.5,53.0
75%,531.0,61.0
max,1586.0,189.0


In [32]:
TABLE_PATTERN = re.compile(r"(?:from|join)\s+([a-zA-Z_][\w]*)", re.IGNORECASE)

def extract_tables(sql: str) -> list[str]:
    if not sql:
        return []
    return [t.lower() for t in TABLE_PATTERN.findall(sql)]

table_counts = sql_series.apply(extract_tables).explode().value_counts()
table_counts


sql_clean
item                      848
customer                  729
store_sales               688
customer_address          593
date_dim                  509
customer_demographics     201
web_sales                 151
store_returns             127
catalog_sales             115
inventory                  57
store                      50
warehouse                  46
current_year               36
catalog_channel            31
web_channel                31
store_channel              31
web_profit                 28
store_profit               28
catalog_profit             28
household_demographics     26
category_state_returns     24
ship_mode                  23
web_rev                    18
store_rev                  18
catalog_rev                18
web_returns                13
reason                     12
call_center                11
web_site                   11
time_dim                   10
web_page                    8
income_band                 6
catalog_returns             6


In [33]:
dup_mask = results_df["sql_clean"].duplicated(keep=False)
duplicates = results_df.loc[dup_mask, ["id", "question", "sql_clean"]].sort_values("sql_clean")
duplicates.head(20)


Unnamed: 0,id,question,sql_clean
341,342,"Năm 1999, doanh thu từ kênh Store cao hơn hay ...",WITH store_rev AS (\n SELECT 'Store' as cha...
525,526,"Năm 1999, doanh thu từ kênh Web cao hơn hay th...",WITH store_rev AS (\n SELECT 'Store' as cha...
394,395,"Năm 2002, doanh thu từ kênh Web cao hơn hay th...",WITH store_rev AS (\n SELECT 'Store' as cha...
529,530,"Năm 2002, doanh thu từ kênh Catalog cao hơn ha...",WITH store_rev AS (\n SELECT 'Store' as cha...
573,574,"Năm 2002, doanh thu từ kênh Store cao hơn hay ...",WITH store_rev AS (\n SELECT 'Store' as cha...
661,662,"Năm 1999, doanh thu từ kênh Store cao hơn hay ...",WITH store_rev AS (\n SELECT 'Store' as chann...
762,763,"Năm 1999, doanh thu từ kênh Catalog cao hơn ha...",WITH store_rev AS (\n SELECT 'Store' as chann...
785,786,"Năm 1999, doanh thu từ kênh Store cao hơn hay ...",WITH store_rev AS (\n SELECT 'Store' as chann...
889,890,"Năm 1999, doanh thu từ kênh Catalog cao hơn ha...",WITH store_rev AS (\n SELECT 'Store' as chann...
672,673,"Năm 2000, doanh thu từ kênh Store cao hơn hay ...",WITH store_rev AS (\n SELECT 'Store' as chann...


In [34]:
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(OUTPUT_PATH, index=False)
print(f"Saved validation results to: {OUTPUT_PATH}")


Saved validation results to: /home/ubuntu/DataScience/Capstone-NLUS-VDD/research_pipeline/sql_ground_truth_validation.csv
