# Text-to-SQL V5 - WikiSQL (Direct Download)

**Dataset:** WikiSQL from Salesforce GitHub
- **56k train / 8k validation / 15k test**
- Schema (table headers) in every example
- Downloaded directly from the GitHub archive — avoids HuggingFace dataset loading-script issues

**Expected:** 50-70% exact-match accuracy on the validation split

---

In [None]:
# Cell 1: Install
!pip install -q transformers>=4.35.0 datasets>=2.14.0 accelerate>=0.24.0
!pip install -q torch sentencepiece

In [None]:
# Cell 2: Setup
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

# Banner, then pick the checkpoint: the larger T5 when a GPU is present,
# the small one for CPU-only runs.
_rule = "=" * 60
print(_rule)
print("TEXT-TO-SQL V5 - WIKISQL")
print(_rule)

_has_cuda = torch.cuda.is_available()
print(f"CUDA: {_has_cuda}")

if _has_cuda:
    MODEL_NAME = "google-t5/t5-base"
else:
    MODEL_NAME = "google-t5/t5-small"
print(f"Model: {MODEL_NAME}")

In [None]:
# Cell 3: Download WikiSQL directly from source
import urllib.request
import tarfile
import json
import os

DATA_URL = "https://github.com/salesforce/WikiSQL/raw/master/data.tar.bz2"
DATA_DIR = "./wikisql_data"
ARCHIVE_PATH = "data.tar.bz2"

# The archive unpacks into a top-level "data" directory, which is renamed to
# DATA_DIR. The existence of DATA_DIR is what marks the download as complete.
if not os.path.exists(DATA_DIR):
    print("Downloading WikiSQL from Salesforce GitHub...")
    try:
        urllib.request.urlretrieve(DATA_URL, ARCHIVE_PATH)
        print("Extracting...")
        with tarfile.open(ARCHIVE_PATH, "r:bz2") as tar:
            # Use the "data" extraction filter when this Python provides it,
            # so members with absolute paths or paths escaping the target
            # directory are rejected instead of being written.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(".", filter="data")
            else:
                tar.extractall(".")
        os.rename("data", DATA_DIR)
    finally:
        # Remove the archive even when a step above failed, so a partial
        # download is not left behind.
        if os.path.exists(ARCHIVE_PATH):
            os.remove(ARCHIVE_PATH)
    print("Done!")
else:
    print("Already downloaded.")

print(f"\nFiles: {os.listdir(DATA_DIR)}")

In [None]:
# Cell 4: Load and process WikiSQL
from datasets import Dataset

# Lookup tables: the integer codes passed to make_sql index into these lists.
AGG_OPS = ["", "MAX", "MIN", "COUNT", "SUM", "AVG"]
COND_OPS = ["=", ">", "<", "OP"]

def make_sql(sel, agg, columns, conds, table_name="table"):
    """Render a structured query as a human-readable SQL string.

    The result is meant for reading/training, not execution: column names
    and condition values are inserted verbatim (no quoting), and runs of
    whitespace anywhere in the string are collapsed to single spaces.

    Args:
        sel: Index into ``columns`` of the selected column.
        agg: Index into ``AGG_OPS`` (0 means no aggregation).
        columns: Sequence of column names.
        conds: Iterable of conditions, each indexable as
            ``(column_index, operator_index, value)``; may be empty.
        table_name: Name placed after ``FROM``. Defaults to ``"table"``,
            which reproduces the previous hard-coded behavior.

    Returns:
        The SQL text, e.g. ``"SELECT COUNT age FROM table WHERE name = bob"``.

    Raises:
        IndexError: If ``sel``, ``agg`` or a condition's column/operator
            index is out of range.
    """
    sql = f"SELECT {AGG_OPS[agg]} {columns[sel]} FROM {table_name}"
    if conds:
        clauses = [
            f"{columns[cond[0]]} {COND_OPS[cond[1]]} {cond[2]}"
            for cond in conds
        ]
        sql += " WHERE " + " AND ".join(clauses)
    # An empty aggregation leaves a double space after SELECT; normalise it.
    return " ".join(sql.split())

def load_wikisql_split(split_name):
    """Load one WikiSQL split and convert it to seq2seq text pairs.

    Reads ``{DATA_DIR}/{split_name}.jsonl`` (one example per line) and
    ``{DATA_DIR}/{split_name}.tables.jsonl`` (one table per line), joins each
    example to its table through ``table_id``, and builds a prompt plus a
    lower-cased SQL target via ``make_sql``.

    Args:
        split_name: File stem of the split, e.g. "train", "dev" or "test".

    Returns:
        A list of dicts with two string keys: ``"input_text"`` (question and
        schema prompt) and ``"target_text"`` (lower-cased SQL).

    Raises:
        FileNotFoundError: If either split file is missing.
        KeyError: If an example references a table id that is not present in
            the tables file.
    """
    main_file = f"{DATA_DIR}/{split_name}.jsonl"
    tables_file = f"{DATA_DIR}/{split_name}.tables.jsonl"

    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # default encoding; blank lines are skipped rather than fed to the parser.
    tables = {}
    with open(tables_file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            table = json.loads(line)
            tables[table["id"]] = table

    examples = []
    with open(main_file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            row = json.loads(line)
            table = tables[row["table_id"]]

            question = row["question"]
            header = table["header"]
            # Not every table record carries a "name"; fall back to "table".
            table_name = table.get("name", "table")

            query = row["sql"]
            sql = make_sql(query["sel"], query["agg"], header, query["conds"])

            # The schema shown to the model is just the table's column headers.
            schema = f"{table_name}({', '.join(header)})"

            examples.append({
                "input_text": f"translate to SQL: {question} | schema: {schema}",
                "target_text": sql.lower(),
            })

    return examples

print("Loading WikiSQL...")
# Splits are loaded in train, dev, test order; "dev" is the validation split.
train_data, val_data, test_data = (
    load_wikisql_split(split) for split in ("train", "dev", "test")
)

print(f"Train: {len(train_data)} | Val: {len(val_data)} | Test: {len(test_data)}")

# Wrap the train/validation examples as HuggingFace datasets
train_ds, val_ds = Dataset.from_list(train_data), Dataset.from_list(val_data)

# Show the first training pair as a quick visual check.
_sample = train_ds[0]
print(f"\nSample:")
print(f"Input: {_sample['input_text'][:100]}...")
print(f"Target: {_sample['target_text']}")

In [None]:
# Cell 5: Tokenize
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): T5's SentencePiece vocabulary is reported to have no piece for
# "<", which would map `<` conditions to the unknown token — confirm with
# tokenizer.tokenize("a < 5") before trusting metrics on such queries.
def tokenize(batch):
    """Tokenize a batch of examples for seq2seq training.

    Prompts from ``batch["input_text"]`` are truncated to 256 tokens and SQL
    targets from ``batch["target_text"]`` to 128 tokens. No padding is applied
    here. The target token ids are returned under ``"labels"`` alongside the
    encoded prompt fields.
    """
    encoded = tokenizer(batch["input_text"], truncation=True, max_length=256)
    encoded_targets = tokenizer(
        text_target=batch["target_text"],
        truncation=True,
        max_length=128,
    )
    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

print("Tokenizing...")
# Tokenize train first, then validation, dropping the raw text columns so only
# the tokenized fields remain.
train_tok, val_tok = (
    ds.map(tokenize, batched=True, remove_columns=ds.column_names)
    for ds in (train_ds, val_ds)
)

print(f"Train: {len(train_tok)} | Val: {len(val_tok)}")

In [None]:
# Cell 6: Model
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq checkpoint selected in the setup cell.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Gradient checkpointing is switched on directly on the model here.
model.gradient_checkpointing_enable()
_param_count = model.num_parameters()
print(f"Loaded {MODEL_NAME} ({_param_count:,} params)")

In [None]:
# Cell 7: Training Config
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Pads each batch dynamically; label padding uses -100, the value that
# compute_metrics later swaps back to the pad token before decoding.
collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

# Mixed-precision selection. T5 checkpoints are widely reported to overflow
# under fp16 (loss turning NaN), so prefer bf16 on GPUs with compute
# capability >= 8 and fall back to fp16 only on older GPUs. CPU runs use
# full precision.
_cuda_available = torch.cuda.is_available()
_use_bf16 = _cuda_available and torch.cuda.get_device_capability()[0] >= 8
_use_fp16 = _cuda_available and not _use_bf16

# WikiSQL is bigger, so fewer epochs needed
args = Seq2SeqTrainingArguments(
    output_dir="./t2sql_wikisql",

    num_train_epochs=5,            # Fewer epochs (more data)
    learning_rate=1e-4,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,

    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,  # Effective batch 32

    fp16=_use_fp16,
    bf16=_use_bf16,
    gradient_checkpointing=True,

    # Evaluate and checkpoint on the same step interval so the best
    # checkpoint (lowest eval_loss) can be reloaded at the end.
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Generate during evaluation so compute_metrics receives decoded-able
    # token ids rather than logits.
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=4,

    logging_steps=100,
    report_to="none",
    dataloader_num_workers=2,
    seed=42,
)

print(f"Epochs: {args.num_train_epochs} | LR: {args.learning_rate}")
print(f"Train examples: {len(train_tok)} → ~2-3 hours")

In [None]:
# Cell 8: Metrics
# Size reported by len(tokenizer); compute_metrics clips token ids into
# [0, VOCAB - 1] before decoding.
VOCAB = len(tokenizer)

def normalize(sql):
    """Return ``sql`` as a lower-cased string with whitespace runs collapsed.

    The argument is passed through ``str()`` first, so non-string values are
    accepted. Leading and trailing whitespace is removed.
    """
    tokens = str(sql).lower().split()
    return " ".join(tokens)

def compute_metrics(pred):
    """Compute string-match accuracy between generated and reference SQL.

    Args:
        pred: A ``(predictions, labels)`` pair of token-id arrays as passed
            by the trainer. Either array may contain -100 as padding.

    Returns:
        A dict with ``"exact_match"`` (equality after stripping outer
        whitespace) and ``"normalized_match"`` (equality after
        ``normalize``), each a fraction in [0, 1]. Both are 0.0 when there
        are no examples.
    """
    preds, labels = pred
    # Some models return a tuple of outputs; the generated ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # Replace the -100 padding sentinel with the real pad token explicitly
    # instead of relying on clipping to land on id 0. The clip that follows
    # only guards against ids outside the tokenizer's range.
    preds = np.clip(np.where(preds != -100, preds, pad_id), 0, VOCAB - 1)
    labels = np.clip(np.where(labels != -100, labels, pad_id), 0, VOCAB - 1)

    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    n = len(pred_str)
    if n == 0:
        # Avoid ZeroDivisionError on an empty evaluation set.
        return {"exact_match": 0.0, "normalized_match": 0.0}

    exact = 0
    norm = 0
    for predicted, reference in zip(pred_str, label_str):
        exact += predicted.strip() == reference.strip()
        norm += normalize(predicted) == normalize(reference)

    return {"exact_match": exact / n, "normalized_match": norm / n}

In [None]:
# Cell 9: Trainer
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pick whichever keyword the installed
# Seq2SeqTrainer constructor declares, so the cell works on both.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)
print("Trainer ready")

In [None]:
# Cell 10: Quick sanity check
# Run one forward pass on the first four training examples and report the
# loss before any training, as a cheap check of the data/model wiring.
batch = collator([train_tok[i] for i in range(4)])
model.eval()
try:
    with torch.no_grad():
        batch_on_device = {k: v.to(model.device) for k, v in batch.items()}
        loss = model(**batch_on_device).loss.item()
finally:
    # Always restore training mode, even if the forward pass raised.
    model.train()

print(f"Initial loss: {loss:.2f}")
if loss < 10:
    print("✓ Ready to train")
else:
    # Also reached for NaN/inf, since those comparisons are False.
    print("✗ Initial loss is unexpectedly high or not finite; "
          "check tokenization and labels before training")

In [None]:
# Cell 11: TRAIN
_rule = "=" * 50
print(_rule)
print("TRAINING V5 WikiSQL")
print(f"{len(train_tok)} examples | 5 epochs | ~2-3 hours")
print(_rule)

# Release cached GPU memory (when a GPU is in use) before training starts.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
result = trainer.train()

print(f"\nDone! Loss: {result.training_loss:.4f}")

In [None]:
# Cell 12: Evaluate
# Final evaluation on the validation set; match rates are shown as percentages.
ev = trainer.evaluate()

_rule = "=" * 50
print(_rule)
print("WikiSQL RESULTS")
print(_rule)
print(f"Eval Loss: {ev['eval_loss']:.4f}")
print(f"Exact Match: {ev['eval_exact_match']*100:.1f}%")
print(f"Normalized Match: {ev['eval_normalized_match']*100:.1f}%")

In [None]:
# Cell 13: Test predictions
from transformers import pipeline

# Persist model and tokenizer side by side so the directory can be loaded
# as a self-contained pipeline below.
trainer.save_model("./t2sql_wikisql_final")
tokenizer.save_pretrained("./t2sql_wikisql_final")

_device = 0 if torch.cuda.is_available() else -1
gen = pipeline(
    "text2text-generation",
    model="./t2sql_wikisql_final",
    device=_device,
)

# (question, schema) pairs for a quick qualitative check.
tests = [
    ("How many players are there?", "players(id, name, age, team)"),
    ("What is the total population?", "countries(name, population, area)"),
    ("Show all products under $50", "products(id, name, price, category)"),
]

print("\nTest predictions:")
for question, table_schema in tests:
    # Same prompt template as the one used to build the training inputs.
    prompt = f"translate to SQL: {question} | schema: {table_schema}"
    generated = gen(prompt, max_length=128, num_beams=4)
    prediction = generated[0]['generated_text']
    print(f"Q: {question}")
    print(f"SQL: {prediction}\n")

In [None]:
# Cell 14: Save & Zip
import shutil

# Summary of the run; match rates are stored as percentages.
report = {
    "version": "v5_wikisql",
    "dataset": "WikiSQL (direct download)",
    "train_examples": len(train_tok),
    "train_loss": result.training_loss,
    "eval_loss": ev['eval_loss'],
    "exact_match": ev['eval_exact_match']*100,
    "normalized_match": ev['eval_normalized_match']*100,
}

# Write through a context manager so the file is flushed and closed
# deterministically instead of relying on garbage collection.
with open("report_v5.json", "w") as report_file:
    json.dump(report, report_file, indent=2)

# Zip the saved model directory (relative to the current directory).
shutil.make_archive("t2sql_v5_wikisql", "zip", ".", "t2sql_wikisql_final")

print("=" * 50)
print("SAVED")
print("=" * 50)
print(json.dumps(report, indent=2))
print("\nDownload: t2sql_v5_wikisql.zip")