# Text-to-SQL V4 - Spider with REAL Schema

**SETUP REQUIRED:**
1. Click **+ Add Input** (right sidebar)
2. Search: `yale-universitys-spider-10-nlp-dataset`
3. Add the dataset by `jeromeblanchet`

**What's Fixed:**
- Uses `tables.json` for real schema (table names, column names, types)
- Merges schema into each training example by `db_id`
- Model finally sees actual columns like `students(id, name, gpa)`

**Expected:** 35-50% normalized-match accuracy on the dev set (vs 11% without schema)

---

In [None]:
# Cell 1: Install
!pip install -q transformers>=4.35.0 datasets>=2.14.0 accelerate>=0.24.0
!pip install -q torch sentencepiece pandas numpy tqdm

In [None]:
# Cell 2: Setup & Verify Kaggle Dataset
import os
import json
import torch
import numpy as np
import warnings
from collections import defaultdict

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

_rule = "=" * 60
print(_rule)
print("TEXT-TO-SQL V4 - SPIDER WITH SCHEMA")
print(_rule)

# Locate the dataset: the first entry under /kaggle/input whose name contains
# "spider" (case-insensitive). SPIDER_PATH stays None when nothing matches or
# the input directory does not exist.
KAGGLE_INPUT = "/kaggle/input"
SPIDER_PATH = None

if os.path.exists(KAGGLE_INPUT):
    SPIDER_PATH = next(
        (
            os.path.join(KAGGLE_INPUT, entry)
            for entry in os.listdir(KAGGLE_INPUT)
            if "spider" in entry.lower()
        ),
        None,
    )

# Fail early with instructions instead of crashing later on a missing path.
if SPIDER_PATH is None:
    print("ERROR: Spider dataset not found!")
    print("Please add dataset: jeromeblanchet/yale-universitys-spider-10-nlp-dataset")
    raise FileNotFoundError("Add Spider dataset first")

print(f"Spider dataset found: {SPIDER_PATH}")
print(f"\nContents:")
# Only the first ten directory entries are listed.
for entry in os.listdir(SPIDER_PATH)[:10]:
    print(f"  {entry}")

print(f"\nPyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

# Pick the checkpoint by hardware: t5-base with a GPU, t5-small without.
MODEL_NAME = "google-t5/t5-small"
if torch.cuda.is_available():
    GPU_NAME = torch.cuda.get_device_name(0)
    GPU_MEM = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {GPU_NAME} ({GPU_MEM:.1f} GB)")
    MODEL_NAME = "google-t5/t5-base"

print(f"Model: {MODEL_NAME}")
print(_rule)

In [None]:
# Cell 3: Load tables.json (THE KEY FIX)
import glob

# Find tables.json anywhere under the dataset folder. The matches are sorted
# so the same copy is chosen on every run when several exist (glob order is
# arbitrary). The direct path is kept as a fallback for when the glob finds
# nothing.
tables_files = sorted(
    glob.glob(f"{SPIDER_PATH}/**/tables.json", recursive=True)
) or [f"{SPIDER_PATH}/tables.json"]

TABLES_JSON = tables_files[0]

if not os.path.exists(TABLES_JSON):
    print("ERROR: tables.json not found!")
    print(f"Searched in: {SPIDER_PATH}")
    raise FileNotFoundError("tables.json missing")

print(f"Found: {TABLES_JSON}")
# Explicit encoding so parsing does not depend on the platform default.
with open(TABLES_JSON, encoding="utf-8") as f:
    tables_data = json.load(f)
print(f"Loaded {len(tables_data)} database schemas")

# Create lookup: db_id -> schema
SCHEMA_LOOKUP = {db["db_id"]: db for db in tables_data}

# Show one example (skipped when the file contained no schemas).
if SCHEMA_LOOKUP:
    example_db = next(iter(SCHEMA_LOOKUP))
    example_schema = SCHEMA_LOOKUP[example_db]
    print(f"\nExample schema for '{example_db}':")
    print(f"  Tables: {example_schema.get('table_names', [])[:5]}")
    print(f"  Columns: {example_schema.get('column_names', [])[:5]}")

In [None]:
# Cell 4: Load Train/Dev Data

def _find_split_files(pattern):
    """Return sorted JSON paths under SPIDER_PATH matching *pattern*.

    Files whose own name contains "tables" are excluded. Sorting makes the
    result independent of the arbitrary order glob returns.
    """
    matches = glob.glob(f"{SPIDER_PATH}/**/{pattern}", recursive=True)
    return sorted(
        path for path in matches
        if "tables" not in os.path.basename(path).lower()
    )


def _choose_split_file(candidates, preferred_name, split):
    """Pick the file to load for one split.

    Returns the candidate named *preferred_name* when present, otherwise the
    first candidate in sorted order.

    Raises:
        FileNotFoundError: if *candidates* is empty.
    """
    if not candidates:
        raise FileNotFoundError(f"No {split} JSON file found under {SPIDER_PATH}")
    for path in candidates:
        if os.path.basename(path) == preferred_name:
            return path
    return candidates[0]


# Find train and dev json files
train_files = _find_split_files("train*.json")
dev_files = _find_split_files("dev*.json")

print(f"Train files: {train_files}")
print(f"Dev files: {dev_files}")

# The train glob can match more than one file.
# NOTE(review): Spider is commonly distributed with train_spider.json and
# train_others.json -- confirm against the attached dataset. The main split
# is preferred explicitly rather than taking whichever file glob lists first.
train_path = _choose_split_file(train_files, "train_spider.json", "train")
dev_path = _choose_split_file(dev_files, "dev.json", "dev")

# Load data
with open(train_path, encoding="utf-8") as f:
    train_data = json.load(f)
with open(dev_path, encoding="utf-8") as f:
    dev_data = json.load(f)

print(f"\nLoaded: {len(train_data)} train, {len(dev_data)} dev examples")

# Show example
print(f"\nExample train item keys: {list(train_data[0].keys())}")
print(f"Question: {train_data[0].get('question', '')}")
print(f"Query: {train_data[0].get('query', '')}")
print(f"DB ID: {train_data[0].get('db_id', '')}")

In [None]:
# Cell 5: Schema Serialization Function

def serialize_schema(db_id):
    """
    Convert a schema to string format: table1(col1, col2) | table2(col3, col4)

    Table and column names are taken from the ``table_names_original`` /
    ``column_names_original`` entries when the schema has them, falling back
    to ``table_names`` / ``column_names`` otherwise. Names are lowercased and
    spaces are replaced by underscores.
    NOTE(review): in Spider's tables.json the ``*_original`` fields are
    understood to hold the identifiers used in the SQL queries, while the
    plain fields hold human-readable variants -- confirm against the dataset.

    Args:
        db_id: Key into SCHEMA_LOOKUP.

    Returns:
        The serialized schema, or "database: <db_id>" when the id is unknown
        or the schema has no usable columns.
    """
    schema = SCHEMA_LOOKUP.get(db_id)
    if schema is None:
        return f"database: {db_id}"

    table_names = schema.get("table_names_original") or schema.get("table_names", [])
    # Each entry is [table_idx, col_name].
    column_names = schema.get("column_names_original") or schema.get("column_names", [])

    def _identifier(name):
        return str(name).lower().replace(" ", "_")

    # Group columns by table, keeping first-seen table order.
    table_cols = defaultdict(list)
    for col_info in column_names:
        if not isinstance(col_info, (list, tuple)) or len(col_info) < 2:
            continue
        table_idx, col_name = col_info[0], col_info[1]

        # Skip the "*" column (table_idx = -1) and any out-of-range index.
        if not 0 <= table_idx < len(table_names):
            continue

        table_cols[_identifier(table_names[table_idx])].append(_identifier(col_name))

    if not table_cols:
        return f"database: {db_id}"

    return " | ".join(
        f"{tbl}({', '.join(cols)})" for tbl, cols in table_cols.items()
    )

# Smoke test: print the serialized schema for the first three training
# examples, truncating long schemas to 100 characters.
print("Schema serialization test:")
for idx in range(3):
    sample_db = train_data[idx]["db_id"]
    serialized = serialize_schema(sample_db)
    if len(serialized) > 100:
        preview = f"  {serialized[:100]}..."
    else:
        preview = f"  {serialized}"
    print(f"\n{sample_db}:")
    print(preview)

In [None]:
# Cell 6: SQL Normalization

import re

# Multi-character operators come first in the alternation so ">=" is matched
# as one token rather than as ">" followed by "=".
_SQL_OPERATOR_RE = re.compile(r"\s*(>=|<=|!=|<>|=|>|<)\s*")


def normalize_sql(sql):
    """Normalize SQL for fair comparison.

    Lowercases the text, collapses whitespace, surrounds each comparison
    operator with exactly one space, puts a space after every comma and
    drops trailing semicolons.

    Args:
        sql: The SQL text; any falsy value (None, "") yields "".

    Returns:
        The normalized SQL string.
    """
    if not sql:
        return ""

    sql = " ".join(str(sql).strip().lower().split())  # Normalize whitespace

    # One regex pass keeps two-character operators intact. Replacing the
    # operators one by one would split ">=" into "> =".
    sql = _SQL_OPERATOR_RE.sub(r" \1 ", sql)

    sql = sql.replace(",", ", ")
    sql = " ".join(sql.split())  # Clean up extra spaces
    return sql.rstrip(";").strip()


print("SQL normalization ready.")

In [None]:
# Cell 7: Create HuggingFace Dataset with Schema
from datasets import Dataset

def process_examples(data_list):
    """Turn raw examples into {"input_text", "target_text"} records.

    The input text combines the question with the serialized schema of the
    example's database; the target text is the normalized SQL query.
    """
    def _to_record(item):
        question = item.get("question", "").strip()
        query = item.get("query", "").strip()
        schema = serialize_schema(item.get("db_id", ""))
        return {
            "input_text": f"translate to SQL: {question} | schema: {schema}",
            "target_text": normalize_sql(query),
        }

    return [_to_record(item) for item in data_list]

# Convert both splits to the prompt/target format.
print("Processing train data...")
train_processed = process_examples(train_data)
print("Processing dev data...")
dev_processed = process_examples(dev_data)

# Wrap the record lists as HuggingFace datasets.
train_dataset = Dataset.from_list(train_processed)
dev_dataset = Dataset.from_list(dev_processed)

print(f"\nTrain: {len(train_dataset)} | Dev: {len(dev_dataset)}")

# Print the first two training records to confirm the schema is in the input.
_rule = "=" * 60
print("\n" + _rule)
print("VERIFICATION - Schema is now included!")
print(_rule)
for idx in range(2):
    record = train_dataset[idx]
    print(f"\nExample {idx}:")
    print(f"Input: {record['input_text'][:150]}...")
    print(f"Target: {record['target_text']}")

In [None]:
# Cell 8: Tokenization
from transformers import AutoTokenizer

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Token limits for the prompt and for the SQL target; longer sequences are
# truncated.
MAX_INPUT = 512
MAX_TARGET = 256


def tokenize(examples):
    """Tokenize a batch of records and attach the target ids as "labels".

    No padding is applied here; sequences are only truncated to the limits.
    """
    shared_options = {"truncation": True, "padding": False}
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=MAX_INPUT,
        **shared_options,
    )
    label_encoding = tokenizer(
        text_target=examples["target_text"],
        max_length=MAX_TARGET,
        **shared_options,
    )
    model_inputs["labels"] = label_encoding["input_ids"]
    return model_inputs


def _tokenize_split(dataset):
    """Map `tokenize` over a dataset in batches, dropping its original columns."""
    return dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)


print("Tokenizing...")
train_tokenized = _tokenize_split(train_dataset)
dev_tokenized = _tokenize_split(dev_dataset)

print(f"Done. Columns: {train_tokenized.column_names}")

In [None]:
# Cell 9: Load Model
from transformers import AutoModelForSeq2SeqLM

# Load the seq2seq checkpoint selected earlier and enable gradient
# checkpointing on it before training.
print(f"Loading model: {MODEL_NAME}")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.gradient_checkpointing_enable()

param_count = model.num_parameters()
print(f"Parameters: {param_count:,}")

In [None]:
# Cell 10: Training Config
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

# Pads inputs per batch; label padding uses -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    padding=True
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./t2sql_v4",

    # Optimisation schedule
    num_train_epochs=25,
    learning_rate=2e-4,
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",

    # Batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,

    # Regularisation / memory
    weight_decay=0.01,
    # NOTE(review): T5 checkpoints are reported to be numerically fragile
    # under fp16 -- watch for NaN/zero losses and confirm this setting.
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    label_smoothing_factor=0.1,

    # Evaluate and checkpoint every 200 steps; keep the 3 latest checkpoints
    # and reload the one with the lowest eval loss at the end.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Evaluation runs generation (beam search) so text metrics can be computed.
    predict_with_generate=True,
    generation_max_length=MAX_TARGET,
    generation_num_beams=4,

    logging_steps=50,
    report_to="none",
    dataloader_num_workers=2,
    seed=42,
)

# Derived from the arguments rather than hard-coded, so the printed value
# stays correct when the batch settings or the number of GPUs change.
effective_batch = (
    training_args.train_batch_size * training_args.gradient_accumulation_steps
)

print("Training config:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  LR: {training_args.learning_rate}")
print(f"  Effective batch: {effective_batch}")

In [None]:
# Cell 11: Metrics
VOCAB_SIZE = len(tokenizer)


def _decode_ids(token_ids):
    """Decode a batch of token ids to text.

    The -100 padding marker is replaced by the tokenizer's pad id, and ids
    are then clamped into [0, VOCAB_SIZE - 1] before decoding.
    """
    token_ids = np.asarray(token_ids)
    token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
    token_ids = np.clip(token_ids, 0, VOCAB_SIZE - 1)
    return tokenizer.batch_decode(token_ids, skip_special_tokens=True)


def compute_metrics(eval_pred):
    """Compute exact-match and normalized-match rates for an evaluation run.

    Args:
        eval_pred: A (predictions, labels) pair of token-id arrays.

    Returns:
        A dict with "exact_match" (stripped strings are equal) and
        "normalized_match" (strings are equal after normalize_sql), both as
        fractions in [0, 1]. Both are 0.0 when decoding fails.
    """
    predictions, labels = eval_pred

    # NOTE(review): predictions may arrive as a tuple whose first element
    # holds the token ids -- verify for the installed transformers version.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    try:
        pred_texts = _decode_ids(predictions)
        label_texts = _decode_ids(labels)
    except Exception as exc:
        # Best effort: report the failure and score the run as zero instead
        # of aborting training. Unlike a bare `except`, this does not
        # swallow KeyboardInterrupt.
        print(f"WARNING: could not decode evaluation output: {exc!r}")
        return {"exact_match": 0.0, "normalized_match": 0.0}

    total = len(pred_texts)
    if total == 0:
        return {"exact_match": 0.0, "normalized_match": 0.0}

    pairs = list(zip(pred_texts, label_texts))
    exact = sum(pred.strip() == label.strip() for pred, label in pairs)
    normalized = sum(
        normalize_sql(pred) == normalize_sql(label) for pred, label in pairs
    )

    return {
        "exact_match": exact / total,
        "normalized_match": normalized / total,
    }


print("Metrics ready.")

In [None]:
# Cell 12: Initialize Trainer
# Collect the trainer's inputs in one place, then build it.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_tokenized,
    "eval_dataset": dev_tokenized,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

print(f"Trainer ready.")
print(f"Train: {len(train_tokenized)} | Eval: {len(dev_tokenized)}")

In [None]:
# Cell 13: Verify Setup
print("Final verification...")

# Step 1: collate the first two tokenized training examples into one batch.
sample_batch = data_collator([train_tokenized[0], train_tokenized[1]])
print("✓ Collator OK")

# Step 2: one forward pass without gradients, on the model's own device.
model.eval()
with torch.no_grad():
    device_batch = {
        name: tensor.to(model.device) for name, tensor in sample_batch.items()
    }
    forward_out = model(**device_batch)
print(f"✓ Forward OK (loss: {forward_out.loss.item():.2f})")

_rule = "=" * 60
print("\n" + _rule)
print("V4 READY - NOW WITH REAL SCHEMA!")
print(_rule)

In [None]:
# Cell 14: TRAIN
_rule = "=" * 60

# Banner describing the run that is about to start.
print(_rule)
print("STARTING V4 TRAINING (WITH SCHEMA)")
print(_rule)
print(f"Model: {MODEL_NAME}")
print(f"Epochs: 25 | LR: 2e-4 | Batch: 32")
print(f"Schema: INCLUDED (from tables.json)")
print(_rule)
print("\nYou can close browser. Training continues.\n")

# Release cached GPU memory before training starts.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

result = trainer.train()

# Summary: final training loss and wall-clock time in hours.
train_hours = result.metrics['train_runtime'] / 3600
print("\n" + _rule)
print("V4 TRAINING COMPLETE!")
print(_rule)
print(f"Train loss: {result.training_loss:.4f}")
print(f"Time: {train_hours:.2f} hours")

In [None]:
# Cell 15: Evaluate
print("Evaluating...\n")

eval_results = trainer.evaluate()

_rule = "=" * 60
print(_rule)
print("V4 RESULTS")
print(_rule)
print(f"Eval Loss: {eval_results['eval_loss']:.4f}")
print(f"Exact Match: {eval_results['eval_exact_match']*100:.2f}%")
print(f"Normalized Match: {eval_results['eval_normalized_match']*100:.2f}%")
print(_rule)

# Grade by normalized-match percentage: the first threshold reached wins,
# and anything below the lowest threshold is "NEEDS WORK".
nm = eval_results['eval_normalized_match'] * 100
_grade_floors = [
    (40, "EXCELLENT"),
    (30, "GOOD"),
    (20, "ACCEPTABLE"),
]
grade = next(
    (label for floor, label in _grade_floors if nm >= floor),
    "NEEDS WORK",
)
print(f"\nGrade: {grade}")

In [None]:
# Cell 16: Save Model
OUTPUT = "./t2sql_final_v4"

# Write the model and the tokenizer into the same output directory.
print(f"Saving to {OUTPUT}...")
trainer.save_model(OUTPUT)
tokenizer.save_pretrained(OUTPUT)

# Run summary; the key order here is the order written to the JSON file.
report = dict(
    version="v4_with_schema",
    model=MODEL_NAME,
    epochs=25,
    learning_rate="2e-4",
    schema_source="tables.json",
    train_examples=len(train_data),
    train_loss=result.training_loss,
    eval_loss=eval_results['eval_loss'],
    exact_match_pct=eval_results['eval_exact_match'] * 100,
    normalized_match_pct=eval_results['eval_normalized_match'] * 100,
    training_hours=result.metrics['train_runtime'] / 3600,
)

with open("report_v4.json", "w") as f:
    f.write(json.dumps(report, indent=2))

print("Saved!")

In [None]:
# Cell 17: Test Predictions
from transformers import pipeline

print("Testing V4 model...\n")

# Reload the saved model through a generation pipeline (first GPU if
# available, CPU otherwise).
_device = 0 if torch.cuda.is_available() else -1
gen = pipeline("text2text-generation", model=OUTPUT, device=_device)


def predict(question, schema):
    """Generate SQL for a question using the prompt format used in training."""
    prompt = f"translate to SQL: {question} | schema: {schema}"
    generations = gen(prompt, max_length=256, num_beams=4)
    return generations[0]['generated_text']


# Test questions paired with the database whose schema each one needs.
_test_questions = [
    ("How many singers are there?", "concert_singer"),
    ("Show all stadium names", "concert_singer"),
    ("Find pets older than 3 years", "pets_1"),
    ("Count employees per department", "employee_hire_evaluation"),
]
tests = [
    (question, serialize_schema(db_name))
    for question, db_name in _test_questions
]

for question, schema_text in tests:
    predicted_sql = predict(question, schema_text)
    if len(schema_text) > 60:
        schema_line = f"Schema: {schema_text[:60]}..."
    else:
        schema_line = f"Schema: {schema_text}"
    print(f"Q: {question}")
    print(schema_line)
    print(f"SQL: {predicted_sql}")
    print()

In [None]:
# Cell 18: Zip Model
import shutil

# Bundle the saved model directory into a zip in the working directory.
print("Zipping model...")
shutil.make_archive(
    "t2sql_v4_model",
    "zip",
    root_dir=".",
    base_dir="t2sql_final_v4",
)
print("Created: t2sql_v4_model.zip")

# Print the run summary once more.
_rule = "=" * 60
_report_text = json.dumps(report, indent=2)
print("\n" + _rule)
print("V4 FINAL REPORT")
print(_rule)
print(_report_text)
print(_rule)
print("\nDownload: t2sql_v4_model.zip")
print("\nThis version includes REAL SCHEMA from tables.json!")