# Text-to-SQL Production System - Final Version

**Team:** Eba Adisu (UGR/2749/14), Mati Milkessa (UGR/0949/14), Nahom Garefo (UGR/6739/14)

**Target:** 35-50% Exact Match on Spider Dataset

---

In [None]:
# Cell 1: Install Dependencies
!pip install -q transformers>=4.35.0 datasets>=2.14.0 accelerate>=0.24.0
!pip install -q torch sentencepiece sqlparse pandas numpy tqdm scikit-learn

In [None]:
# Cell 2: Imports and System Check
import torch
import numpy as np
import pandas as pd
import json
import re
import warnings
from collections import defaultdict

# Silence all library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Banner plus basic runtime information.
print("=" * 60, "TEXT-TO-SQL PRODUCTION SYSTEM", "=" * 60, sep="\n")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

# Start from the small checkpoint; it is upgraded below only when a GPU with
# at least 15 GB (decimal) of memory is reported.
MODEL_NAME = "google-t5/t5-small"

if not torch.cuda.is_available():
    print("\nWARNING: No GPU detected!")
else:
    GPU_NAME = torch.cuda.get_device_name(0)
    # total_memory is in bytes; dividing by 1e9 gives decimal gigabytes.
    GPU_MEM = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {GPU_NAME} ({GPU_MEM:.1f} GB)")

    if GPU_MEM >= 15:
        MODEL_NAME = "google-t5/t5-base"
        print("\nUsing: T5-Base (expect 35-45% accuracy)")
    else:
        print("\nUsing: T5-Small (expect 20-30% accuracy)")

print("=" * 60)

In [None]:
# Cell 3: Load Spider Dataset
from datasets import load_dataset

print("Loading Spider dataset...")

# Hub identifiers are tried in order; the first one that loads is used.
dataset = None
for source in ["xlangai/spider", "spider"]:
    print(f"  Trying {source}...")
    try:
        dataset = load_dataset(source)
    except Exception as err:
        # Deliberate best-effort: any loader failure moves on to the next
        # source. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt/SystemExit still stop the cell, and the reason is
        # printed so a failed run can be diagnosed.
        print(f"  Failed: {type(err).__name__}: {err}")
    else:
        print("  Success!")
        break

if dataset is None:
    # RuntimeError is a subclass of Exception, so existing handlers still match.
    raise RuntimeError("Could not load Spider dataset. Download manually from https://yale-lily.github.io/spider")

print("\nDataset loaded:")
print(f"  Train: {len(dataset['train'])} examples")
print(f"  Validation: {len(dataset['validation'])} examples")

In [None]:
# Cell 4: Schema Serialization Function
def serialize_schema(example):
    """
    Convert the schema fields of one example to a single line of text.

    Reads three keys from ``example``:
      - ``db_table_names``: sequence of table names
      - ``db_column_names``: sequence of ``[table_index, column_name]`` pairs
      - ``db_column_types``: sequence of types, positionally aligned with
        ``db_column_names``

    Output format: "table1: col1 (type), col2 (type) | table2: col1 (type)"
    Column names are lowercased; the " (type)" suffix is added only when a
    non-empty type exists at the column's position. Tables are emitted in the
    order their first column is seen; tables with no columns are omitted.

    Returns:
        The schema string, or ``example.get('db_id', 'database')`` when no
        usable column is found or the schema fields are malformed.
    """
    fallback = example.get('db_id', 'database')

    def _field(key):
        # A key that is present but None is treated like a missing key.
        value = example.get(key)
        return [] if value is None else value

    try:
        table_names = _field('db_table_names')
        column_names = _field('db_column_names')
        column_types = _field('db_column_types')

        # Group column descriptions by table name.
        table_cols = defaultdict(list)

        for idx, col_info in enumerate(column_names):
            # Entries that are not [table_index, column_name] pairs are ignored.
            if not isinstance(col_info, (list, tuple)) or len(col_info) < 2:
                continue

            table_idx, col_name = col_info[0], col_info[1]

            # Skip the wildcard column (-1) and any other out-of-range index.
            # Negative indices must be rejected explicitly: Python would
            # otherwise wrap them around and file the column under the wrong
            # table.
            if not 0 <= table_idx < len(table_names):
                continue

            col_str = str(col_name).lower()

            # Add type if available
            if idx < len(column_types) and column_types[idx]:
                col_str += f" ({column_types[idx]})"

            table_cols[table_names[table_idx]].append(col_str)
    except (TypeError, ValueError, IndexError, KeyError):
        # Best-effort: a malformed schema degrades to the database id rather
        # than aborting preprocessing of the whole dataset.
        return fallback

    if not table_cols:
        return fallback

    return " | ".join(
        f"{tbl}: {', '.join(cols)}" for tbl, cols in table_cols.items()
    )

print("Schema serialization function defined.")

In [None]:
# Cell 5: Preprocessing Function
def preprocess_example(example):
    """
    Build the model's source and target strings for one example.

    The source is the stripped question followed by the serialized schema,
    wrapped in the "translate to SQL: ... | schema: ..." prompt. The target
    is the example's 'query' with every whitespace run collapsed to a single
    space. Missing 'question'/'query' keys are treated as empty strings.

    Returns:
        dict with keys "input_text" and "target_text".
    """
    question_text = str(example.get('question', '')).strip()
    raw_sql = str(example.get('query', '')).strip()
    schema_text = serialize_schema(example)

    return {
        "input_text": f"translate to SQL: {question_text} | schema: {schema_text}",
        # Collapse newlines/tabs/multiple spaces so targets are single-line.
        "target_text": re.sub(r'\s+', ' ', raw_sql).strip(),
    }

print("Preprocessing dataset...")

# Adds "input_text" / "target_text" to every example, using 4 worker processes.
processed = dataset.map(preprocess_example, num_proc=4, desc="Preprocessing")

# Show the first training example as a quick sanity check
# (the input is cut to its first 150 characters).
print("\nPreprocessing complete!", "\nSample input:", sep="\n")
print(f"  {processed['train'][0]['input_text'][:150]}...")
print("\nSample target:")
print(f"  {processed['train'][0]['target_text']}")

In [None]:
# Cell 6: Load Tokenizer
from transformers import AutoTokenizer

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Token budgets used for truncation (and for generation length in later cells).
MAX_INPUT_LEN, MAX_TARGET_LEN = 512, 256

print(
    f"  Vocab size: {len(tokenizer)}",
    f"  Max input: {MAX_INPUT_LEN} tokens",
    f"  Max target: {MAX_TARGET_LEN} tokens",
    sep="\n",
)

In [None]:
# Cell 7: Tokenization
def tokenize_fn(examples):
    """
    Tokenize a batch of "input_text" / "target_text" pairs.

    Inputs are truncated to MAX_INPUT_LEN tokens and targets to
    MAX_TARGET_LEN tokens. Nothing is padded here. The target token ids are
    attached to the returned encoding under the "labels" key.
    """
    # Options shared by the source-side and target-side tokenizer calls.
    common = {"truncation": True, "padding": False}

    model_inputs = tokenizer(
        examples["input_text"],
        max_length=MAX_INPUT_LEN,
        **common,
    )
    label_encoding = tokenizer(
        text_target=examples["target_text"],
        max_length=MAX_TARGET_LEN,
        **common,
    )

    model_inputs["labels"] = label_encoding["input_ids"]
    return model_inputs

print("Tokenizing dataset...")

# Every column present before tokenization is dropped, so only the columns
# returned by tokenize_fn remain.
cols_to_remove = processed['train'].column_names

tokenized = processed.map(
    tokenize_fn,
    desc="Tokenizing",
    remove_columns=cols_to_remove,
    batched=True,
    num_proc=4,
)

# Quick look at the first tokenized training example.
print("\nTokenization complete!")
print(f"  Final columns: {tokenized['train'].column_names}")
print(
    f"  Sample input_ids length: {len(tokenized['train'][0]['input_ids'])}",
    f"  Sample labels length: {len(tokenized['train'][0]['labels'])}",
    sep="\n",
)

In [None]:
# Cell 8: Load Model
from transformers import AutoModelForSeq2SeqLM

print(f"Loading model: {MODEL_NAME}")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Memory optimization: turn on gradient checkpointing for the model.
model.gradient_checkpointing_enable()

print(
    f"  Parameters: {model.num_parameters():,}",
    "  Gradient checkpointing: Enabled",
    sep="\n",
)

In [None]:
# Cell 9: Training Setup
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)

# Data collator
# Data collator: pads each batch on the fly; label padding uses -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    padding=True
)

# Mixed-precision selection.
# T5 checkpoints are prone to float16 overflow (NaN loss / garbage token ids),
# so bfloat16 is preferred when the GPU supports it natively (compute
# capability >= 8.0). Older GPUs keep the original float16 behavior, and CPU
# runs use full precision. Compute capability is checked rather than
# torch.cuda.is_bf16_supported() because the latter can report emulated
# support on older GPUs.
_HAS_CUDA = torch.cuda.is_available()
USE_BF16 = _HAS_CUDA and torch.cuda.get_device_capability(0)[0] >= 8
USE_FP16 = _HAS_CUDA and not USE_BF16

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./text2sql_model",

    # Epochs and batch
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,

    # Optimizer
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",

    # Training optimization
    fp16=USE_FP16,
    bf16=USE_BF16,
    gradient_checkpointing=True,
    label_smoothing_factor=0.1,

    # Evaluation: evaluate and checkpoint on the same step schedule so the
    # best checkpoint (by exact match) can be restored at the end.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="exact_match",
    greater_is_better=True,

    # Generation (used during evaluation)
    predict_with_generate=True,
    generation_max_length=MAX_TARGET_LEN,
    generation_num_beams=4,

    # Logging
    logging_steps=50,
    report_to="none",

    # System
    dataloader_num_workers=2,
    seed=42,
)

print("Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size} x {training_args.gradient_accumulation_steps} = {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  FP16: {training_args.fp16}")
print(f"  BF16: {training_args.bf16}")

In [None]:
# Cell 10: Metrics Function (Safe)
VOCAB_SIZE = len(tokenizer)


def _to_decodable_ids(token_ids):
    """Return token ids that are safe to pass to ``tokenizer.batch_decode``.

    The -100 ignore-index is replaced with the tokenizer's pad id, then every
    id is clamped into ``[0, VOCAB_SIZE - 1]`` so an out-of-range id cannot
    make decoding fail.
    """
    token_ids = np.asarray(token_ids)
    token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
    return np.clip(token_ids, 0, VOCAB_SIZE - 1)


def _normalize_sql(text):
    """Lowercase ``text`` and collapse every whitespace run to one space."""
    return re.sub(r'\s+', ' ', text.strip().lower())


def compute_metrics(eval_pred):
    """
    Compute exact-match accuracy between generated and reference SQL.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            token ids.

    Returns:
        ``{"exact_match": fraction}`` where ``fraction`` is the share of
        predictions equal to their label after lowercasing and whitespace
        normalization. If decoding raises, the error is printed and
        ``{"exact_match": 0.0}`` is returned.
    """
    predictions, labels = eval_pred

    # Some configurations return extra outputs alongside the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Padding in predictions is mapped to the pad token explicitly (as it is
    # for labels) instead of relying on clipping -100 up to id 0.
    pred_ids = _to_decodable_ids(predictions)
    label_ids = _to_decodable_ids(labels)

    try:
        pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    except Exception as e:
        # Best-effort: a decode failure must not abort a long training run.
        print(f"Decode error: {e}")
        return {"exact_match": 0.0}

    total = len(pred_texts)
    correct = sum(
        _normalize_sql(pred) == _normalize_sql(label)
        for pred, label in zip(pred_texts, label_texts)
    )

    accuracy = correct / total if total > 0 else 0.0

    return {"exact_match": accuracy}

print("Metrics function defined (with overflow protection).")

In [None]:
# Cell 11: Initialize Trainer
# Wire model, data, collator, metric and early stopping into one trainer.
# Early stopping uses a patience of 5.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

print("Trainer initialized!", "\nReady to train:", sep="\n")
print(
    f"  Train examples: {len(tokenized['train'])}",
    f"  Validation examples: {len(tokenized['validation'])}",
    sep="\n",
)

In [None]:
# Cell 12: Verify Before Training
# Smoke tests run before the long training job. Each check prints a
# pass/fail line and re-raises on failure so the cell stops immediately.
print("Running verification checks...\n")

# Test 1: Data collator
# Collate the first two tokenized training examples into one batch.
test_batch = [tokenized['train'][i] for i in range(2)]
try:
    collated = data_collator(test_batch)
    print(f"✓ Data collator works")
    print(f"  Keys: {list(collated.keys())}")
except Exception as e:
    print(f"✗ Data collator failed: {e}")
    raise

# Test 2: Model forward pass
# Reuses the batch collated in Test 1, moved to the model's device.
try:
    # NOTE(review): the model is left in eval mode after this check;
    # presumably the trainer switches it back when training starts — confirm.
    model.eval()
    with torch.no_grad():
        batch_gpu = {k: v.to(model.device) for k, v in collated.items()}
        outputs = model(**batch_gpu)
    print(f"✓ Forward pass works (loss: {outputs.loss.item():.4f})")
except Exception as e:
    print(f"✗ Forward pass failed: {e}")
    raise

# Test 3: Metrics function
# Random ids in [0, VOCAB_SIZE) with shape (4, 20) only prove the function
# runs end to end; the resulting score is not inspected.
try:
    fake_preds = np.random.randint(0, VOCAB_SIZE, (4, 20))
    fake_labels = np.random.randint(0, VOCAB_SIZE, (4, 20))
    metrics = compute_metrics((fake_preds, fake_labels))
    print(f"✓ Metrics function works")
except Exception as e:
    print(f"✗ Metrics function failed: {e}")
    raise

# Reached only when none of the checks above raised.
print("\n" + "=" * 60)
print("ALL CHECKS PASSED - READY TO TRAIN")
print("=" * 60)

In [None]:
# Cell 13: TRAIN
# Announce the run configuration.
print("=" * 60, "STARTING TRAINING", "=" * 60, sep="\n")
print(f"Model: {MODEL_NAME}")
print(f"Train: {len(tokenized['train'])} examples")
print(f"Epochs: {training_args.num_train_epochs}")
print("=" * 60)
print("\nThis will take 6-8 hours. You can close the browser.\n")

# Release cached GPU memory before the run starts.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Train!
train_result = trainer.train()

# Summary: final training loss and wall-clock time (seconds converted to hours).
print("\n" + "=" * 60, "TRAINING COMPLETE!", "=" * 60, sep="\n")
print(f"Train loss: {train_result.training_loss:.4f}")
print(f"Time: {train_result.metrics['train_runtime']/3600:.1f} hours")

In [None]:
# Cell 14: Evaluate
print("Running final evaluation...\n")

eval_results = trainer.evaluate()

# Exact match as a percentage; reused for the report line and the grade.
em = eval_results['eval_exact_match'] * 100

print("=" * 60, "FINAL RESULTS", "=" * 60, sep="\n")
print(f"Eval Loss: {eval_results['eval_loss']:.4f}")
print(f"Exact Match: {em:.2f}%")
print("=" * 60)

# Grade: the first threshold (highest first) that `em` reaches wins;
# anything below 20 falls through to the final message.
for threshold, grade_line in (
    (40, "\nGrade: EXCELLENT"),
    (30, "\nGrade: GOOD"),
    (20, "\nGrade: ACCEPTABLE"),
):
    if em >= threshold:
        break
else:
    grade_line = "\nGrade: NEEDS IMPROVEMENT"
print(grade_line)

In [None]:
# Cell 15: Save Model
OUTPUT_DIR = "./text2sql_final"

# Persist the trained model and its tokenizer side by side.
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Run summary written next to the notebook as report.json.
report = dict(
    team="Eba Adisu, Mati Milkessa, Nahom Garefo",
    model=MODEL_NAME,
    train_examples=len(dataset['train']),
    val_examples=len(dataset['validation']),
    epochs=training_args.num_train_epochs,
    exact_match_pct=eval_results['eval_exact_match'] * 100,
    eval_loss=eval_results['eval_loss'],
    training_hours=train_result.metrics['train_runtime'] / 3600,
)

with open("report.json", "w") as f:
    json.dump(report, f, indent=2)

print("\nSaved!")
print(
    "\nFiles to download:",
    "  1. text2sql_final/ (model)",
    "  2. report.json (metrics)",
    "  3. This notebook",
    sep="\n",
)

In [None]:
# Cell 16: Test Inference
from transformers import pipeline

print("Testing inference...\n")

# Device index 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    inference_device = 0
else:
    inference_device = -1

# Load the saved model from OUTPUT_DIR into a text2text-generation pipeline.
generator = pipeline(
    "text2text-generation",
    model=OUTPUT_DIR,
    device=inference_device,
)

def predict(question, schema):
    """Generate SQL text for one question with the module-level `generator`.

    The prompt uses the "translate to SQL: ... | schema: ..." layout.
    Generation uses beam search with 4 beams and max_length=256.

    Returns:
        The 'generated_text' of the first result returned by the pipeline.
    """
    prompt = f"translate to SQL: {question} | schema: {schema}"
    candidates = generator(prompt, max_length=256, num_beams=4)
    top_candidate = candidates[0]
    return top_candidate['generated_text']

# Test cases: (question, schema) pairs in the "table: col, col" schema layout.
tests = [
    ("Show all students", "students: id, name, age, gpa"),
    ("Find students with GPA above 3.5", "students: id, name, gpa"),
    ("Count students by major", "students: id, name, major"),
    ("What is the average salary?", "employees: id, name, salary"),
]

print("Sample predictions:\n")
for q, s in tests:
    sql = predict(q, s)
    # One question/answer pair per test, followed by a blank line.
    print(f"Q: {q}", f"SQL: {sql}\n", sep="\n")

In [None]:
# Cell 17: Final Report
# Echo the saved report dictionary as indented JSON between two rules.
print("=" * 60, "FINAL REPORT", "=" * 60, sep="\n")
print(json.dumps(report, indent=2))
print("=" * 60)
print("\nTraining complete. Download your files and submit!")