# Text-to-SQL V6-SQL - Direct SQL Output (Kaggle Split)

**Dataset:** WikiSQL from Salesforce GitHub
- **56k train / 8k validation / 15k test**
- Schema (table headers) in every example
- Downloads directly — no HuggingFace script issues

**Key Difference from V6-Structured:**
- **V6-Structured**: Model outputs structured indices (sel, agg, conds) → converted to SQL
- **V6-SQL**: Model outputs **executable SQL directly**
  - Uses `convert_wikisql.py` logic to pre-generate SQL for training
  - End-to-end: NL question → SQL query

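**Expected:** 45-65% execution accuracy (note: the evaluation cells below report normalized exact match on the generated SQL string; they do not execute the queries)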

---

In [None]:
# Cell 1: Install
!pip install -q transformers>=4.35.0 datasets>=2.14.0 accelerate>=0.24.0
!pip install -q torch sentencepiece

In [None]:
# Cell 2: Setup
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import numpy as np
import json
import re
import warnings
warnings.filterwarnings('ignore')

# Banner plus a quick report of whether a CUDA device is visible to torch.
print("=" * 60)
print("TEXT-TO-SQL V6-SQL - DIRECT SQL OUTPUT (KAGGLE)")
print("=" * 60)
print(f"CUDA: {torch.cuda.is_available()}")

# Pick the checkpoint by hardware: t5-base when CUDA is available,
# otherwise fall back to t5-small.
MODEL_NAME = "google-t5/t5-base" if torch.cuda.is_available() else "google-t5/t5-small"
print(f"Model: {MODEL_NAME}")

In [None]:
# Cell 3: Download WikiSQL directly from source
import urllib.request
import tarfile

# Source archive and the local directory the extracted "data" folder is
# renamed to. The presence of DATA_DIR is used as the "already downloaded" flag.
DATA_URL = "https://github.com/salesforce/WikiSQL/raw/master/data.tar.bz2"
DATA_DIR = "./wikisql_data"

if not os.path.exists(DATA_DIR):
    print("Downloading WikiSQL from Salesforce GitHub...")
    try:
        urllib.request.urlretrieve(DATA_URL, "data.tar.bz2")
        print("Extracting...")
        with tarfile.open("data.tar.bz2", "r:bz2") as tar:
            # Prefer the "data" extraction filter when this Python provides
            # it: it rejects absolute paths, path traversal and special
            # files coming from the downloaded archive.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(".", filter="data")
            else:
                tar.extractall(".")
        os.rename("data", DATA_DIR)
    finally:
        # Remove the archive even when extraction fails so that a re-run
        # starts from a clean download.
        if os.path.exists("data.tar.bz2"):
            os.remove("data.tar.bz2")
    print("Done!")
else:
    print("Already downloaded.")

print(f"\nFiles: {os.listdir(DATA_DIR)}")

In [None]:
# Cell 4: SQL Generation Utilities (from convert_wikisql.py)
# Aggregation operators, indexed by the `agg` field of a query; index 0
# (empty string) means "no aggregation".
AGG_OPS = ["", "MAX", "MIN", "COUNT", "SUM", "AVG"]
# Comparison operators, indexed by the operator field of a condition triple.
# NOTE(review): upstream WikiSQL is believed to define only '=', '>', '<'
# (plus a placeholder) — confirm whether indices 3-5 ever occur in the data.
OPS = ['=', '>', '<', '>=', '<=', '!=']

# Lower-case identifiers that clean_column_name wraps in double quotes
# instead of emitting bare, because they collide with SQL keywords or
# common type/function names.
SQL_RESERVED = {
    'order', 'group', 'table', 'index', 'select', 'from', 'where', 'join',
    'left', 'right', 'inner', 'outer', 'on', 'as', 'and', 'or', 'not',
    'limit', 'offset', 'union', 'all', 'distinct', 'null', 'is', 'like',
    'between', 'in', 'exists', 'case', 'when', 'then', 'else', 'end',
    'count', 'sum', 'avg', 'min', 'max', 'having', 'by', 'asc', 'desc',
    'primary', 'key', 'foreign', 'references', 'constraint', 'unique',
    'check', 'default', 'create', 'alter', 'drop', 'insert', 'update', 'delete',
    'to', 'with', 'into', 'values', 'set', 'call', 'return', 'returning',
    'current', 'timestamp', 'user', 'session', 'system', 'date', 'time',
    'datetime', 'year', 'month', 'day', 'hour', 'minute', 'second',
}

def clean_column_name(col, used_names=None):
    """Turn a raw table header into a usable SQL column identifier.

    Every character outside ``[a-zA-Z0-9_]`` becomes an underscore, runs of
    underscores are collapsed, and leading/trailing underscores are removed.
    Names starting with a digit get a ``col_`` prefix, empty names become
    ``col``, and the result is lower-cased. Names found in ``SQL_RESERVED``
    are wrapped in double quotes.

    Args:
        col: Raw header text.
        used_names: Optional set of identifiers already handed out. When
            given, a numeric suffix (``_1``, ``_2``, ...) is appended until
            the name is unique, and the final name is added to the set.

    Returns:
        The cleaned (and, if requested, de-duplicated) identifier.
    """
    ident = re.sub(r'_+', '_', re.sub(r'[^a-zA-Z0-9_]', '_', col)).strip('_')
    if not ident:
        ident = 'col'
    elif ident[0].isdigit():
        ident = 'col_' + ident
    ident = ident.lower()

    quoted = ident in SQL_RESERVED
    if quoted:
        ident = f'"{ident}"'

    if used_names is None:
        return ident

    # Probe suffixes until the candidate is free; for quoted identifiers the
    # suffix goes inside the quotes.
    candidate = ident
    counter = 0
    while candidate in used_names:
        counter += 1
        if quoted:
            candidate = f'"{ident[1:-1]}_{counter}"'
        else:
            candidate = f'{ident}_{counter}'
    used_names.add(candidate)
    return candidate

def get_column_names(headers):
    """Return cleaned, mutually unique column names for ``headers``.

    Names are produced in header order; later duplicates receive numeric
    suffixes via ``clean_column_name``.
    """
    seen = set()
    names = []
    for header in headers:
        names.append(clean_column_name(header, seen))
    return names

def value_to_sql(value):
    """Render a Python value as a SQL literal.

    ``None`` becomes ``NULL``, strings are single-quoted with embedded
    quotes doubled, booleans become ``1``/``0``, and anything else is
    rendered with ``str()``.
    """
    if value is None:
        return 'NULL'
    if isinstance(value, str):
        return "'" + value.replace("'", "''") + "'"
    if isinstance(value, bool):
        return str(int(value))
    return str(value)

def build_where_clause(conds, col_map):
    """Build the ``WHERE`` clause for a list of WikiSQL-style conditions.

    Args:
        conds: Iterable of ``(column_index, operator_index, value)`` triples.
        col_map: Mapping from column index to SQL column name.

    Returns:
        A string such as ``" WHERE a = 1 AND b > 2"`` (note the leading
        space), or ``""`` when there are no conditions or none of them
        refers to a column present in ``col_map``.
    """
    if not conds:
        return ''
    clauses = []
    for col_idx, op, val in conds:
        if col_idx not in col_map:
            # Best effort: conditions on unknown columns are dropped.
            continue
        # Check both bounds: a negative index would otherwise wrap around
        # and silently select an operator from the end of OPS instead of
        # using the '=' fallback.
        op_str = OPS[op] if 0 <= op < len(OPS) else '='
        clauses.append(f"{col_map[col_idx]} {op_str} {value_to_sql(val)}")
    return ' WHERE ' + ' AND '.join(clauses) if clauses else ''

def build_select_clause(sql_info, col_map, table_name):
    """Build the ``SELECT ... FROM ...`` part of a query.

    Args:
        sql_info: Dict with optional ``'sel'`` (selected column index) and
            ``'agg'`` (index into ``AGG_OPS``); both default to 0.
        col_map: Mapping from column index to SQL column name. An unknown
            ``sel`` falls back to ``*``.
        table_name: Name placed after ``FROM``.

    Returns:
        ``"SELECT col FROM table"`` when ``agg`` is 0, otherwise
        ``"SELECT AGG(col) FROM table"``. An out-of-range ``agg`` yields an
        empty aggregate name, i.e. ``"SELECT (col) FROM table"``.
    """
    sel = sql_info.get('sel', 0)
    agg = sql_info.get('agg', 0)
    col_name = col_map.get(sel, '*')
    if agg == 0:
        return f"SELECT {col_name} FROM {table_name}"
    # Check both bounds: a negative index would otherwise wrap around and
    # silently pick an aggregate from the end of AGG_OPS.
    agg_name = AGG_OPS[agg] if 0 <= agg < len(AGG_OPS) else ""
    return f"SELECT {agg_name}({col_name}) FROM {table_name}"

def convert_to_sql(query, table, table_name="t1"):
    """Render a WikiSQL query record as a SQL string.

    Args:
        query: Dict whose ``'sql'`` entry holds ``sel``, ``agg`` and ``conds``.
        table: Dict whose ``'header'`` entry lists the raw column headers.
        table_name: Table name used in the ``FROM`` clause.

    Returns:
        The SELECT statement, followed by a WHERE clause when conditions apply.
    """
    spec = query.get('sql', {})

    # Position in the header list -> cleaned column identifier.
    index_to_name = dict(enumerate(get_column_names(table.get('header', []))))

    select_part = build_select_clause(spec, index_to_name, table_name)
    where_part = build_where_clause(spec.get('conds', []), index_to_name)
    return f"{select_part}{where_part}".strip()

# Test with example from convert_wikisql.py
# Selects column 5 ("Notes") with no aggregation, filtered on column 3
# ("Current slogan"); with the helpers above this should print
#   SELECT notes FROM t1 WHERE current_slogan = 'SOUTH AUSTRALIA'
test_query = {
    'sql': {'sel': 5, 'agg': 0, 'conds': [[3, 0, 'SOUTH AUSTRALIA']]}
}
test_table = {
    'header': ['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes'],
    'id': '1-1000181-1'
}
test_sql = convert_to_sql(test_query, test_table, "t1")
print(f"Test SQL: {test_sql}")

In [None]:
# Cell 5: Load and process WikiSQL - DIRECT SQL OUTPUT
from datasets import Dataset

def load_wikisql_sql_split(split_name):
    """Load one WikiSQL split as (prompt, SQL) text pairs.

    Reads ``{DATA_DIR}/{split_name}.jsonl`` and the matching
    ``.tables.jsonl`` file, and for every question builds:

    * ``input_text``: ``"translate to SQL: <question> | schema: <table>(<headers>)"``
    * ``target_text``: the SQL produced by ``convert_to_sql``.

    Args:
        split_name: Split file stem, e.g. ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list of dicts with ``input_text`` and ``target_text`` keys.

    Raises:
        KeyError: If a question references a table id that is missing from
            the tables file.
    """
    main_file = f"{DATA_DIR}/{split_name}.jsonl"
    tables_file = f"{DATA_DIR}/{split_name}.tables.jsonl"

    # Read explicitly as UTF-8: the platform default encoding is
    # locale-dependent and can fail on non-ASCII content.
    tables = {}
    with open(tables_file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            table = json.loads(line)
            tables[table["id"]] = table

    examples = []
    with open(main_file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            row = json.loads(line)
            table = tables[row["table_id"]]

            question = row["question"]
            header = table["header"]
            table_name = table.get("name", "t1")

            # The schema string shows the raw headers; the target SQL uses
            # the cleaned column names produced inside convert_to_sql.
            schema = f"{table_name}({', '.join(header)})"
            input_text = f"translate to SQL: {question} | schema: {schema}"
            target_sql = convert_to_sql(row, table, table_name)

            examples.append({
                "input_text": input_text,
                "target_text": target_sql
            })

    return examples

# Load all three splits; the validation split is stored in the "dev" files.
print("Loading WikiSQL with SQL targets...")
train_data = load_wikisql_sql_split("train")
val_data = load_wikisql_sql_split("dev")
test_data = load_wikisql_sql_split("test")

print(f"Train: {len(train_data)} | Val: {len(val_data)} | Test: {len(test_data)}")

# Convert to HuggingFace datasets
# Only train and validation are wrapped here; test_data stays a plain list.
train_ds = Dataset.from_list(train_data)
val_ds = Dataset.from_list(val_data)

# Show one example as a quick visual check of the prompt/target format.
print(f"\nSample (SQL output):")
print(f"Input: {train_ds[0]['input_text'][:100]}...")
print(f"Target SQL: {train_ds[0]['target_text']}")

In [None]:
# Cell 6: Tokenize
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint selected in MODEL_NAME; used as a global
# by tokenize() and compute_metrics().
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenize a batch of examples for seq2seq training.

    Prompts are truncated to 256 tokens and SQL targets to 128 tokens; the
    target token ids are stored under ``labels``.
    """
    model_inputs = tokenizer(batch["input_text"], truncation=True, max_length=256)
    label_encoding = tokenizer(
        text_target=batch["target_text"], truncation=True, max_length=128
    )
    model_inputs["labels"] = label_encoding["input_ids"]
    return model_inputs

# Tokenize both splits in batches and drop the raw text columns so only the
# fields returned by tokenize() remain.
print("Tokenizing...")
train_tok = train_ds.map(tokenize, batched=True, remove_columns=train_ds.column_names)
val_tok = val_ds.map(tokenize, batched=True, remove_columns=val_ds.column_names)

print(f"Train: {len(train_tok)} | Val: {len(val_tok)}")

In [None]:
# Cell 7: Model
from transformers import AutoModelForSeq2SeqLM

# Load the seq2seq checkpoint and turn on gradient checkpointing on the model
# itself (the training arguments also set gradient_checkpointing=True).
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.gradient_checkpointing_enable()
print(f"Loaded {MODEL_NAME} ({model.num_parameters():,} params)")

In [None]:
# Cell 8: Training Config
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Pads each batch dynamically; label padding uses -100, which compute_metrics
# later swaps back to the pad token before decoding.
collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

args = Seq2SeqTrainingArguments(
    output_dir="./t2sql_v6_sql",
    
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=1e-4,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,
    
    # 16 examples x 2 accumulation steps = 32 examples per optimizer step
    # on each device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    
    # Mixed precision is only requested when CUDA is available.
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    
    # Evaluate and checkpoint on the same 1000-step cadence; the best
    # checkpoint is chosen by lowest eval_loss and reloaded at the end.
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Evaluation decodes with generate() (beam search, 4 beams) so that
    # compute_metrics receives generated token ids.
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=4,
    
    logging_steps=100,
    report_to="none",
    dataloader_num_workers=2,
    seed=42,
)

print(f"Epochs: {args.num_train_epochs} | LR: {args.learning_rate}")

In [None]:
# Cell 9: Metrics - SQL Exact Match (normalized)
import sqlite3

def normalize_sql(sql):
    """Lower-case ``sql`` and collapse all whitespace runs to single spaces."""
    tokens = sql.lower().strip().split()
    return ' '.join(tokens)

def compute_metrics(pred):
    """Compute normalized exact-match accuracy of generated SQL.

    Args:
        pred: A ``(predictions, labels)`` pair of token-id arrays as passed
            by the trainer when ``predict_with_generate`` is enabled.

    Returns:
        ``{"exact_match": fraction}`` where ``fraction`` is the share of
        predictions whose normalized text equals the normalized label text
        (``0.0`` for an empty batch).
    """
    preds, labels = pred
    # Some models return a tuple of outputs; the token ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    max_id = len(tokenizer) - 1

    # -100 marks ignored/padded positions. Map it to the pad token for both
    # predictions and labels (rather than relying on clipping to id 0), then
    # clip so every id is decodable.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.clip(np.where(preds != -100, preds, pad_id), 0, max_id)
    labels = np.clip(np.where(labels != -100, labels, pad_id), 0, max_id)

    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    if not pred_str:
        return {"exact_match": 0.0}

    # Exact match on normalized SQL
    exact = sum(normalize_sql(p) == normalize_sql(l) for p, l in zip(pred_str, label_str))
    return {"exact_match": exact / len(pred_str)}

print("Metrics: SQL exact match (normalized)")

In [None]:
# Cell 10: Trainer
# Wire the model, arguments, tokenized splits, collator and metric together.
# NOTE(review): newer transformers releases are believed to deprecate the
# `tokenizer=` argument in favour of `processing_class=` — confirm against
# the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)
print("Trainer ready")

In [None]:
# Cell 11: Quick sanity check
# Collate the first four training examples and run one forward pass without
# gradients to confirm the pipeline produces a finite, plausible loss.
batch = collator([train_tok[i] for i in range(4)])
model.eval()
with torch.no_grad():
    # Move every tensor in the batch to the model's device before the call.
    loss = model(**{k: v.to(model.device) for k, v in batch.items()}).loss.item()
print(f"Initial loss: {loss:.2f}")
# Nothing is printed when the loss is 10 or higher; this is a soft check only.
if loss < 10:
    print("✓ Ready to train")
# Restore training mode for the trainer.
model.train()

In [None]:
# Cell 12: TRAIN
print("=" * 50)
print("TRAINING V6-SQL - DIRECT SQL OUTPUT")
# Report the configured epoch count instead of a hard-coded number so the
# banner cannot drift from the training arguments.
print(f"{len(train_tok)} examples | {args.num_train_epochs} epochs")
print("=" * 50)

# Release cached GPU memory (e.g. from the sanity check) before training.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
result = trainer.train()

print(f"\nDone! Loss: {result.training_loss:.4f}")

In [None]:
# Cell 13: Evaluate
# Run evaluation on the validation set given to the trainer; metric keys are
# read back with an "eval_" prefix.
ev = trainer.evaluate()

print("=" * 50)
print("V6-SQL DIRECT SQL OUTPUT RESULTS")
print("=" * 50)
print(f"Eval Loss: {ev['eval_loss']:.4f}")
# exact_match is a fraction; shown here as a percentage.
print(f"Exact Match: {ev['eval_exact_match']*100:.1f}%")

In [None]:
# Cell 14: Inference Demo - Direct SQL Generation
from transformers import pipeline

# Save the model and tokenizer to one directory, then reload them from disk
# through a text2text-generation pipeline.
trainer.save_model("./t2sql_v6_sql_final")
tokenizer.save_pretrained("./t2sql_v6_sql_final")

# device=0 selects the first GPU; -1 runs the pipeline on CPU.
gen = pipeline("text2text-generation", model="./t2sql_v6_sql_final",
               device=0 if torch.cuda.is_available() else -1)

def text_to_sql(question, schema):
    """Generate SQL text for ``question`` against ``schema``.

    The prompt uses the same ``translate to SQL: ... | schema: ...`` layout
    as the training inputs, and decoding uses beam search with 4 beams and a
    maximum length of 128.
    """
    prompt = f"translate to SQL: {question} | schema: {schema}"
    outputs = gen(prompt, max_length=128, num_beams=4)
    return outputs[0]['generated_text']

# Test examples
# Hand-written (question, schema) pairs for a qualitative look at the model;
# these are not taken from the WikiSQL files loaded above.
tests = [
    ("How many players are there?", "players(id, name, age, team)"),
    ("What is the total population?", "countries(name, population, area)"),
    ("Show all products under $50", "products(id, name, price, category)"),
    ("What is the average age?", "employees(id, name, age, department)"),
]

print("\nDirect SQL predictions:")
print("-" * 50)
for q, schema in tests:
    sql = text_to_sql(q, schema)
    print(f"Q: {q}")
    print(f"SQL: {sql}\n")

In [None]:
# Cell 15: Execute Generated SQL (Optional Demo)
def execute_sql(sql, schema_data):
    """Run ``sql`` on a fresh in-memory SQLite database.

    ``schema_data`` is currently unused: table creation and data loading are
    left as a placeholder, so only queries that need no tables succeed.

    Returns:
        The fetched rows as a list of tuples, or the string
        ``"Error: <message>"`` if execution fails.
    """
    db = sqlite3.connect(':memory:')

    # Create table and insert data
    # ... (implementation depends on your data)

    try:
        return db.cursor().execute(sql).fetchall()
    except Exception as exc:
        return f"Error: {exc}"
    finally:
        # Always release the connection, on success and on failure.
        db.close()

# Example: Execute a generated query
# Generate SQL for a sample question; execution is left commented out because
# execute_sql does not yet create or populate any tables.
demo_sql = text_to_sql("Count all employees", "employees(id, name, dept)")
print(f"Generated: {demo_sql}")
# result = execute_sql(demo_sql, employees_data)
# print(f"Result: {result}")

In [None]:
# Cell 16: Save & Report
import shutil

# Summary of the run; exact_match is stored as a percentage.
report = {
    "version": "v6_sql_direct_output",
    "dataset": "WikiSQL (direct download)",
    "output_type": "executable_sql",
    "train_examples": len(train_tok),
    "train_loss": result.training_loss,
    "eval_loss": ev['eval_loss'],
    "exact_match": ev['eval_exact_match']*100,
}

# Write through a context manager so the file is flushed and closed
# deterministically instead of leaking the handle.
with open("report_v6_sql.json", "w", encoding="utf-8") as report_file:
    json.dump(report, report_file, indent=2)

# Zip the saved model directory into t2sql_v6_sql.zip for download.
shutil.make_archive("t2sql_v6_sql", "zip", ".", "t2sql_v6_sql_final")

print("=" * 50)
print("SAVED")
print("=" * 50)
print(json.dumps(report, indent=2))
print("\nDownload: t2sql_v6_sql.zip")