# Text-to-SQL V5 Spider - FAST & FIXED

**Why V4 crashed:** schema serialization produced malformed training data (loss = 112).

**V5 Fixes:**
- Debug cell to inspect schema BEFORE training
- Validate all data (no empty strings)
- Keep fast LR (1e-4)
- FP16 OFF (stability)
- Compact schema format

**SETUP:** Add the Kaggle dataset `jeromeblanchet/yale-universitys-spider-10-nlp-dataset` to this notebook's inputs before running any cell.

---

In [None]:
# Cell 1: Install
!pip install -q transformers>=4.35.0 datasets>=2.14.0 accelerate>=0.24.0
!pip install -q torch sentencepiece

In [None]:
# Cell 2: Setup
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
import torch
import numpy as np
import glob
import warnings
from collections import defaultdict

warnings.filterwarnings('ignore')

banner = "=" * 60
print(banner)
print("TEXT-TO-SQL V5 - SPIDER FAST")
print(banner)

KAGGLE_INPUT = "/kaggle/input"

# First entry under the Kaggle input directory whose name contains "spider"
# (case-insensitive); None when no entry matches.
SPIDER_PATH = next(
    (
        os.path.join(KAGGLE_INPUT, entry)
        for entry in os.listdir(KAGGLE_INPUT)
        if "spider" in entry.lower()
    ),
    None,
)

if SPIDER_PATH is None:
    raise FileNotFoundError("Add: jeromeblanchet/yale-universitys-spider-10-nlp-dataset")

print(f"Spider: {SPIDER_PATH}")
print(f"CUDA: {torch.cuda.is_available()}")

# Use the larger checkpoint only when a GPU is available.
if torch.cuda.is_available():
    MODEL_NAME = "google-t5/t5-base"
else:
    MODEL_NAME = "google-t5/t5-small"
print(f"Model: {MODEL_NAME}")

In [None]:
# Cell 3: Load tables.json
# Locate tables.json anywhere under the dataset directory. Matches are sorted
# so the same file is chosen on every run if more than one copy exists
# (glob returns matches in arbitrary order).
tables_file = sorted(glob.glob(f"{SPIDER_PATH}/**/tables.json", recursive=True))
if not tables_file:
    # Nothing matched: fall back to the top-level path so that the open()
    # below fails with an error naming the expected file.
    tables_file = [f"{SPIDER_PATH}/tables.json"]

# Decode explicitly as UTF-8 rather than the platform's default encoding.
with open(tables_file[0], encoding="utf-8") as f:
    tables_data = json.load(f)

# Index the schema entries by their "db_id" so examples can look them up.
SCHEMA_LOOKUP = {db["db_id"]: db for db in tables_data}
print(f"Loaded {len(SCHEMA_LOOKUP)} schemas")

In [None]:
# Cell 4: Load train/dev
def _pick_split_file(pattern, preferred_name):
    """Choose the JSON file to load for one data split.

    Searches recursively under SPIDER_PATH for ``pattern`` and ignores any
    path containing "table". Candidates are sorted so the choice does not
    depend on directory listing order. A candidate whose base name equals
    ``preferred_name`` wins; otherwise the first candidate is returned.

    Raises:
        FileNotFoundError: if no candidate exists.
    """
    candidates = sorted(
        path
        for path in glob.glob(f"{SPIDER_PATH}/**/{pattern}", recursive=True)
        if "table" not in path
    )
    if not candidates:
        raise FileNotFoundError(
            f"No file matching {pattern!r} found under {SPIDER_PATH}"
        )
    for path in candidates:
        if os.path.basename(path) == preferred_name:
            return path
    return candidates[0]


# NOTE(review): "train_spider.json" and "dev.json" are assumed to be the main
# split files of the attached Spider release -- confirm against the dataset.
# If they are absent, the first sorted match is used instead.
train_files = [_pick_split_file("train*.json", "train_spider.json")]
dev_files = [_pick_split_file("dev*.json", "dev.json")]

# Decode explicitly as UTF-8 rather than the platform's default encoding.
with open(train_files[0], encoding="utf-8") as f:
    train_data = json.load(f)
with open(dev_files[0], encoding="utf-8") as f:
    dev_data = json.load(f)

print(f"Train: {len(train_data)} | Dev: {len(dev_data)}")

In [None]:
# Cell 5: SIMPLE Schema - just table names and columns
def get_schema(db_id, max_tables=5, max_cols=6, max_chars=250, schema_lookup=None):
    """Serialize a database schema as ``table1.col1,col2 | table2.col3,col4``.

    Args:
        db_id: Key of the database in the schema lookup.
        max_tables: Maximum number of tables to include (positive integer).
        max_cols: Maximum number of columns per table (positive integer).
        max_chars: Upper bound on the length of the returned string.
        schema_lookup: Mapping from db_id to a schema dict with
            "table_names" and "column_names" entries. Defaults to the
            module-level SCHEMA_LOOKUP.

    Returns:
        The compact schema string. ``db_id`` itself is returned when the
        database is unknown or has no usable tables/columns.

    When the full serialization would exceed ``max_chars``, output stops at
    the last identifier that fits completely, so the result never ends in a
    half-written table or column name. The only exception is a first
    ``table.column`` pair that is itself too long: it is hard-cut so the
    result is not empty.
    """
    lookup = SCHEMA_LOOKUP if schema_lookup is None else schema_lookup
    if db_id not in lookup:
        return db_id

    db = lookup[db_id]
    tables = db.get("table_names", [])
    columns = db.get("column_names", [])
    # NOTE(review): only the "table_names"/"column_names" keys are read. If the
    # SQL targets spell identifiers differently from these entries, the model
    # never sees the spelling it must produce -- confirm against tables.json.

    if not tables or not columns:
        return db_id

    # Group column names under their table, keeping first-seen table order.
    table_cols = defaultdict(list)
    for col in columns:
        if isinstance(col, list) and len(col) >= 2:
            tidx, cname = col[0], col[1]
            # Entries whose table index is out of range (e.g. negative) are skipped.
            if 0 <= tidx < len(tables):
                table_cols[tables[tidx]].append(cname)

    if not table_cols:
        return db_id

    # Emit the string one identifier at a time, each piece carrying its own
    # leading separator, and stop at the first piece that would overflow.
    pieces = []
    used = 0
    selected = list(table_cols.items())[:max_tables]
    for t_pos, (table, cols) in enumerate(selected):
        for c_pos, cname in enumerate(cols[:max_cols]):
            if c_pos > 0:
                piece = f",{cname}"
            elif t_pos > 0:
                piece = f" | {table}.{cname}"
            else:
                piece = f"{table}.{cname}"

            if used + len(piece) > max_chars:
                if not pieces:
                    # Nothing fits at all: fall back to a hard cut.
                    return piece[:max_chars]
                return "".join(pieces)

            pieces.append(piece)
            used += len(piece)

    return "".join(pieces)

# DEBUG: preview the serialized schema of the first three training examples
print("Schema samples:")
for sample_idx in (0, 1, 2):
    sample_db = train_data[sample_idx]["db_id"]
    preview = get_schema(sample_db)[:70]
    print(f"  {sample_db}: {preview}...")

In [None]:
# Cell 6: Process data with VALIDATION
from datasets import Dataset

def normalize_sql(sql):
    """Return a canonical form of ``sql`` for string comparison.

    The input is converted with ``str()``, lower-cased, and every run of
    whitespace is collapsed to a single space. Trailing semicolons are then
    removed together with any space left in front of them, so ``"select 1 ;"``
    and ``"select 1;"`` both become ``"select 1"``.
    """
    collapsed = " ".join(str(sql).lower().split())
    # Strip ";" and " " together: removing only ";" would leave the space that
    # preceded it dangling at the end.
    return collapsed.rstrip("; ")

def process(data_list):
    """Turn raw examples into validated input/target text pairs.

    Each item is read through its "question", "query" and "db_id" entries.
    An item is skipped when its stripped question or query is shorter than
    5 characters, or when the built input exceeds 600 characters or the
    normalized target exceeds 300 characters.

    Prints the number of kept and skipped items and returns a list of dicts
    with "input_text" and "target_text" keys.
    """
    examples = []
    skipped = 0

    for record in data_list:
        question = str(record.get("question", "")).strip()
        query = str(record.get("query", "")).strip()
        schema_text = get_schema(record.get("db_id", ""))

        # VALIDATE: both texts need at least 5 characters (this also rejects
        # empty strings).
        if min(len(question), len(query)) < 5:
            skipped += 1
            continue

        source = f"translate to SQL: {question} schema: {schema_text}"
        target = normalize_sql(query)

        # Length check (in characters)
        too_long = len(source) > 600 or len(target) > 300
        if too_long:
            skipped += 1
        else:
            examples.append({"input_text": source, "target_text": target})

    print(f"  Valid: {len(examples)}, Skipped: {skipped}")
    return examples

print("Processing train...")
train_proc = process(train_data)
print("Processing dev...")
dev_proc = process(dev_data)

# Wrap the validated example lists as datasets (train first, then dev).
train_ds, dev_ds = (Dataset.from_list(rows) for rows in (train_proc, dev_proc))

# Show the first two training examples
print("\n--- Samples ---")
for row_idx in range(2):
    example = train_ds[row_idx]
    print(f"IN: {example['input_text'][:80]}...")
    print(f"OUT: {example['target_text']}\n")

In [None]:
# Cell 7: Tokenize
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Tokenize a batch of examples for seq2seq training.

    Inputs come from ``batch["input_text"]`` (truncated to 384 tokens) and
    targets from ``batch["target_text"]`` (truncated to 128 tokens). The
    target token ids are stored under "labels" in the returned encoding.
    """
    encoded = tokenizer(batch["input_text"], truncation=True, max_length=384)
    target_enc = tokenizer(
        text_target=batch["target_text"], truncation=True, max_length=128
    )
    encoded["labels"] = target_enc["input_ids"]
    return encoded


# Tokenize both splits the same way (train first, then dev), dropping the raw
# text columns.
train_tok, dev_tok = (
    split.map(tokenize, batched=True, remove_columns=split.column_names)
    for split in (train_ds, dev_ds)
)

print(f"Tokenized: {len(train_tok)} train, {len(dev_tok)} dev")

In [None]:
# Cell 8: Model
from transformers import AutoModelForSeq2SeqLM

# Load the seq2seq checkpoint selected by MODEL_NAME.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Turn on gradient checkpointing on the model itself; the training arguments
# defined later in this notebook request it as well.
model.gradient_checkpointing_enable()
print(f"Loaded {MODEL_NAME}")

In [None]:
# Cell 9: Training Config - FAST but STABLE
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Batch collator built from the tokenizer and model; labels are padded with -100.
collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

# Optimisation schedule.
schedule_cfg = {
    "num_train_epochs": 20,
    "learning_rate": 1e-4,  # Fast LR
    "warmup_ratio": 0.06,
    "lr_scheduler_type": "cosine",
    "max_grad_norm": 1.0,  # Gradient clipping
}

# Batch sizes and gradient accumulation.
batch_cfg = {
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 16,
    "gradient_accumulation_steps": 4,
}

# Precision and memory.
memory_cfg = {
    "fp16": False,  # OFF for stability
    "gradient_checkpointing": True,
}

# Evaluate and checkpoint every 200 steps; keep two checkpoints and reload the
# one with the lowest eval loss at the end.
checkpoint_cfg = {
    "eval_strategy": "steps",
    "eval_steps": 200,
    "save_strategy": "steps",
    "save_steps": 200,
    "save_total_limit": 2,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
}

# Generation settings used during evaluation.
generation_cfg = {
    "predict_with_generate": True,
    "generation_max_length": 128,
    "generation_num_beams": 4,
}

# Logging, data loading and seeding.
runtime_cfg = {
    "logging_steps": 50,
    "report_to": "none",
    "dataloader_num_workers": 0,
    "seed": 42,
}

args = Seq2SeqTrainingArguments(
    output_dir="./t2sql_v5",
    **schedule_cfg,
    **batch_cfg,
    **memory_cfg,
    **checkpoint_cfg,
    **generation_cfg,
    **runtime_cfg,
)

print(f"Config: LR={args.learning_rate}, Epochs={args.num_train_epochs}, FP16={args.fp16}")

In [None]:
# Cell 10: Metrics
VOCAB_SIZE = len(tokenizer)


def compute_metrics(eval_pred):
    """Compute string-match accuracy between decoded predictions and labels.

    Args:
        eval_pred: A ``(preds, labels)`` pair of token-id arrays.

    Returns:
        A dict with "exact_match" (equal after ``strip()``) and
        "normalized_match" (equal after ``normalize_sql``), each as a
        fraction of the evaluated examples. Both are 0.0 when there are no
        examples.
    """
    preds, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    def _decode(token_ids):
        # -100 is the label padding value used by this notebook's collator and
        # is not a real token id. Map it to the pad token explicitly instead
        # of relying on clipping, which only gives the right id when the pad
        # token id is 0.
        # NOTE(review): predictions are presumed to use the same -100 padding
        # -- confirm against the trainer.
        ids = np.where(token_ids != -100, token_ids, pad_id)
        # Keep any remaining out-of-range ids inside the vocabulary so
        # decoding cannot fail on them.
        ids = np.clip(ids, 0, VOCAB_SIZE - 1)
        return tokenizer.batch_decode(ids, skip_special_tokens=True)

    pred_str = _decode(preds)
    label_str = _decode(labels)

    n = len(pred_str)
    if n == 0:
        # Avoid dividing by zero on an empty evaluation set.
        return {"exact_match": 0.0, "normalized_match": 0.0}

    exact = sum(p.strip() == l.strip() for p, l in zip(pred_str, label_str))
    norm = sum(normalize_sql(p) == normalize_sql(l) for p, l in zip(pred_str, label_str))

    return {"exact_match": exact/n, "normalized_match": norm/n}

In [None]:
# Cell 11: Trainer
import inspect

# Newer transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pass whichever name the installed
# Seq2SeqTrainer constructor declares so this cell works with both.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=dev_tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)
print("Trainer ready")

In [None]:
# Cell 12: SANITY CHECK - if loss > 15, something is wrong
import math

print("Checking initial loss...")

# Collate the first four tokenized training examples into one batch and run a
# single forward pass without gradients.
batch = collator([train_tok[i] for i in range(4)])
model.eval()
with torch.no_grad():
    out = model(**{k: v.to(model.device) for k, v in batch.items()})
    loss = out.loss.item()

print(f"Initial loss: {loss:.2f}")

# A NaN loss fails every ">" comparison, so check for non-finite values
# explicitly; otherwise a NaN would be reported as "Loss normal".
if not math.isfinite(loss) or loss > 15:
    print("\n⚠️ WARNING: Loss too high! Checking data...")
    # Decode the start of two inputs and their labels for manual inspection.
    for i in range(2):
        print(f"Input: {tokenizer.decode(batch['input_ids'][i][:30])}...")
        # Drop the -100 padding before decoding the label ids.
        lbls = [t for t in batch['labels'][i].tolist() if t != -100]
        print(f"Label: {tokenizer.decode(lbls[:20])}...")
else:
    print("✓ Loss normal. Ready to train.")

# Switch back to training mode after the eval-mode forward pass.
model.train()

In [None]:
# Cell 13: TRAIN
rule = "=" * 50
print(rule)
print("TRAINING V5 SPIDER")
print("LR: 1e-4 | Epochs: 20 | ~2-3 hours")
print(rule)

# Release cached GPU memory before training when CUDA is present.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
result = trainer.train()

print(f"\nDone! Loss: {result.training_loss:.4f}")

In [None]:
# Cell 14: Evaluate
ev = trainer.evaluate()
print(f"Eval Loss: {ev['eval_loss']:.4f}")
# The two match metrics are fractions; report them as percentages.
for metric_label, metric_key in (
    ("Exact Match", "eval_exact_match"),
    ("Normalized", "eval_normalized_match"),
):
    print(f"{metric_label}: {ev[metric_key]*100:.1f}%")

In [None]:
# Cell 15: Save & Zip
import shutil

# Save the model and tokenizer side by side in one directory.
trainer.save_model("./t2sql_v5_final")
tokenizer.save_pretrained("./t2sql_v5_final")

# Summary of the run; the match metrics are stored as percentages.
report = {
    "version": "v5_spider",
    "train_loss": result.training_loss,
    "eval_loss": ev['eval_loss'],
    "exact_match": ev['eval_exact_match']*100,
    "normalized_match": ev['eval_normalized_match']*100,
}
# Use a context manager so the report file is flushed and closed before the
# cell finishes.
with open("report_v5.json", "w", encoding="utf-8") as report_file:
    json.dump(report, report_file, indent=2)

# Zip the saved model directory into t2sql_v5_spider.zip.
shutil.make_archive("t2sql_v5_spider", "zip", ".", "t2sql_v5_final")
print("Saved: t2sql_v5_spider.zip")
print(json.dumps(report, indent=2))