In [1]:
pip install wandb

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install peft==0.15.0 accelerate>=0.21.0

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
HF_TOKEN="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"

In [4]:
pip install evaluate

[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
pip install sqlparse

[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
pip install sqlalchemy

[0mNote: you may need to restart the kernel to use updated packages.


In [7]:
# Import necessary libraries
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
import pandas as pd
from torch.utils.data import DataLoader
import evaluate
import numpy as np
import sqlparse
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# For database execution
from sqlalchemy import create_engine, text
import sqlite3

  warn(


In [8]:
# Cell 3: Configuration and constants
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
DATASET_NAME = "xlangai/spider"
OUTPUT_DIR = "./llama3-text2sql"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# LoRA config
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

# Training args
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=500,
    logging_steps=100,
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    eval_steps=500,
    report_to="wandb"  # optional
)

In [9]:
pip install dataset

[0mNote: you may need to restart the kernel to use updated packages.


In [11]:
# Cell 4a: Load schemas separately
from datasets import load_dataset
dataset = load_dataset("spider")
from sqlalchemy import create_engine
import sqlite3


def get_schema(db_id):
    # You'll need to download the SQLite database files from Spider
    # and place them in a 'database' folder
    try:
        conn = sqlite3.connect(f'database/{db_id}/{db_id}.sqlite')
        cursor = conn.cursor()
        
        # Get tables and columns
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        
        schema_info = []
        for table in tables:
            table_name = table[0]
            cursor.execute(f"PRAGMA table_info({table_name});")
            columns = [col[1] for col in cursor.fetchall()]
            schema_info.append(f"Table {table_name}: {', '.join(columns)}")
        
        conn.close()
        return "\n".join(schema_info)
    except:
        return "Schema not available"

# Cell 5a: Modified with schema
def preprocess_with_schema(example):
    schema = get_schema(example['db_id'])
    
    prompt = f"""Translate this question to SQL using the database schema.
    
Database Schema:
{schema}

Question: {example['question']}
SQL Query:"""
    
    return {
        "prompt": prompt,
        "completion": example['query'],
        "db_id": example['db_id']
    }

# Apply preprocessing
train_dataset = dataset['train'].map(preprocess_with_schema)
eval_dataset = dataset['validation'].map(preprocess_with_schema)

In [12]:
pip install --upgrade bitsandbytes

[0mNote: you may need to restart the kernel to use updated packages.


In [13]:
# Cell 5: Load tokenizer and model with authentication
from huggingface_hub import login

# Authenticate with your Hugging Face token
login(token="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")  # Replace with your actual token

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token="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"  # Add token here as well for redundancy
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # for batch inference

# Load model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    token="your_hf_token_here"  # Add token here
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 54,525,952 || all params: 8,084,787,200 || trainable%: 0.6744


In [14]:
# Cell 6: Data collator and tokenization
def tokenize_function(examples):
    # Tokenize prompts and completions
    tokenized_prompts = tokenizer(examples["prompt"], truncation=True, padding="max_length", max_length=512)
    tokenized_completions = tokenizer(examples["completion"], truncation=True, padding="max_length", max_length=256)
    
    # Combine and create labels (ignore prompt tokens in loss calculation)
    input_ids = [p + c for p, c in zip(tokenized_prompts["input_ids"], tokenized_completions["input_ids"])]
    attention_mask = [p + c for p, c in zip(tokenized_prompts["attention_mask"], tokenized_completions["attention_mask"])]
    labels = [[-100] * len(p) + c for p, c in zip(tokenized_prompts["input_ids"], tokenized_completions["input_ids"])]
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

In [15]:
# Cell 7: Optimized Fine-tuning
from transformers import Trainer
import os
import torch

# Enable faster training optimizations
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Create optimized trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

# Start training with progress monitoring
try:
    print("Starting training...")
    train_result = trainer.train()
    
    # Save only the adapter weights to save time/space
    trainer.save_model(OUTPUT_DIR)
    print(f"Model saved to {OUTPUT_DIR}")
    
    # Save the final fine-tuned model in .pt file
    torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "model.pt"))
    print(f"Final model saved as model.pt in {OUTPUT_DIR}")
    
    # Save training metrics
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    
except KeyboardInterrupt:
    print("Training interrupted. Saving current progress...")
    trainer.save_model(OUTPUT_DIR + "_interrupted")
    print("Partial model saved.")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


[34m[1mwandb[0m: Currently logged in as: [33mpsindhu1905[0m ([33mpsindhu1905-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
500,0.0345,0.085089
1000,0.0145,0.106944


Model saved to ./llama3-text2sql
Final model saved as model.pt in ./llama3-text2sql
***** train metrics *****
  epoch                    =      2.9943
  total_flos               = 679976828GF
  train_loss               =      0.1222
  train_runtime            =  2:48:14.76
  train_samples_per_second =        2.08
  train_steps_per_second   =        0.13


In [16]:
# Cell 8: Evaluation metrics setup
# Load metrics
bleu = evaluate.load("bleu")
exact_match = evaluate.load("exact_match")

def compute_metrics(preds, labels):
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Compute BLEU score
    bleu_score = bleu.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels]
    )["bleu"]
    
    # Compute exact match
    em_score = exact_match.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )["exact_match"]
    
    # Compute SQL similarity (simplified)
    sql_similarity = []
    for pred, label in zip(decoded_preds, decoded_labels):
        try:
            # Parse and format SQL
            pred_sql = sqlparse.format(pred, reindent=True, keyword_case='upper')
            label_sql = sqlparse.format(label, reindent=True, keyword_case='upper')
            sql_similarity.append(pred_sql == label_sql)
        except:
            sql_similarity.append(False)
    
    sql_accuracy = np.mean(sql_similarity)
    
    return {
        "bleu": bleu_score,
        "exact_match": em_score,
        "sql_accuracy": sql_accuracy
    }

In [17]:
# Cell 9 (Final Corrected Version): Evaluation with fixed syntax
def evaluate_pretrained_model(sample_size=100):
    # Select evaluation samples
    eval_samples = eval_dataset.select(range(sample_size))
    
    predictions = []
    references = []
    
    model.eval()  # Set model to evaluation mode
    
    for example in tqdm(eval_samples, desc="Evaluating pretrained model"):
        try:
            prompt = example["prompt"]
            
            # Tokenize with proper formatting
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(model.device)
            
            # Generate output
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_new_tokens=256,
                    pad_token_id=tokenizer.eos_token_id
                )
            
            # Decode only the generated part
            generated_sql = tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], 
                skip_special_tokens=True
            ).strip()
            
            predictions.append(generated_sql)
            references.append(example["completion"])
            
        except Exception as e:
            print(f"Error processing example: {str(e)}")
            predictions.append("")
            references.append(example["completion"])
    
    # Compute metrics with proper syntax
    def safe_compute_metrics(preds, refs):
        # Convert all inputs to strings
        preds = [str(p) for p in preds]
        refs = [[str(r)] for r in refs]
        
        # Calculate BLEU
        try:
            bleu_score = bleu.compute(predictions=preds, references=refs)["bleu"]
        except:
            bleu_score = 0.0
            
        # Calculate Exact Match
        try:
            em_score = sum(1 for p, r in zip(preds, refs) if p == r[0]) / len(preds)
        except:
            em_score = 0.0
            
        # Calculate SQL Accuracy (fixed syntax)
        try:
            sql_acc = sum(1 for p, r in zip(preds, refs) 
                      if sqlparse.format(p) == sqlparse.format(r[0])) / len(preds)
        except:
            sql_acc = 0.0
            
        return {
            "bleu": bleu_score,
            "exact_match": em_score,
            "sql_accuracy": sql_acc
        }
    
    metrics = safe_compute_metrics(predictions, references)
    print("\nEvaluation Metrics:")
    print(f"BLEU: {metrics['bleu']:.4f}")
    print(f"Exact Match: {metrics['exact_match']:.4f}")
    print(f"SQL Accuracy: {metrics['sql_accuracy']:.4f}")
    
    return metrics

# Run evaluation
pretrained_metrics = evaluate_pretrained_model()

Evaluating pretrained model: 100%|██████████| 100/100 [02:22<00:00,  1.43s/it]


Evaluation Metrics:
BLEU: 0.3925
Exact Match: 0.0700
SQL Accuracy: 0.0700





In [20]:
import sqlparse
import numpy as np
from tqdm import tqdm
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from evaluate import load

def evaluate_finetuned_model(trainer, eval_dataset, tokenizer, sample_size=50):
    eval_samples = eval_dataset.select(range(min(sample_size, len(eval_dataset))))
    predictions = []
    references = []
    
    trainer.model.eval()
    
    for example in tqdm(eval_samples, desc="Evaluating fine-tuned model"):
        try:
            prompt = example["prompt"]
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(trainer.model.device)
            
            with torch.no_grad():
                outputs = trainer.model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_new_tokens=512,
                    pad_token_id=tokenizer.eos_token_id,
                    num_beams=5,
                    temperature=0.7
                )
            
            generated_sql = tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True
            ).strip()
            
            predictions.append(generated_sql)
            references.append(example["completion"])
            
        except Exception as e:
            print(f"Error processing example {prompt}: {str(e)}")
            predictions.append("SELECT * FROM table")  # Fallback query
            references.append(example["completion"])
    
    # Normalize SQL
    def normalize_sql(sql):
        try:
            sql = sqlparse.format(sql, reindent=True, keyword_case='upper', strip_comments=True)
            sql = ' '.join(sql.split())
            return sql.rstrip(';')
        except:
            return sql
    
    preds = [normalize_sql(p) for p in predictions]
    refs = [[normalize_sql(r)] for r in references]
    
    # Compute metrics
    exact_match = load("exact_match")
    smoothing = SmoothingFunction().method1
    
    # Compute BLEU score using NLTK
    bleu_scores = []
    for pred, ref in zip(preds, refs):
        pred_tokens = pred.split()
        ref_tokens = [r.split() for r in ref]  # refs is a list of lists
        bleu_scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoothing))
    bleu_score = np.mean(bleu_scores)
    
    em_score = exact_match.compute(predictions=preds, references=[r[0] for r in refs])["exact_match"]
    
    sql_similarity = []
    for pred, ref in zip(preds, [r[0] for r in refs]):
        sql_similarity.append(pred == ref)
    sql_accuracy = np.mean(sql_similarity)
    
    metrics = {
        "bleu": bleu_score,
        "exact_match": em_score,
        "sql_accuracy": sql_accuracy
    }
    
    print("\nFine-tuned Model Evaluation Metrics:")
    print(f"BLEU: {metrics['bleu']:.4f}")
    print(f"Exact Match: {metrics['exact_match']:.4f}")
    print(f"SQL Accuracy: {metrics['sql_accuracy']:.4f}")
    
    # Print sample predictions
    for i, (pred, ref) in enumerate(zip(preds[:3], [r[0] for r in refs][:3])):
        print(f"\nExample {i+1}:")
        print(f"Predicted: {pred}")
        print(f"Reference: {ref}")
    
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
    
    return metrics

# Run evaluation
finetuned_metrics = evaluate_finetuned_model(trainer, tokenized_eval, tokenizer)

Evaluating fine-tuned model: 100%|██████████| 50/50 [03:36<00:00,  4.33s/it]



Fine-tuned Model Evaluation Metrics:
BLEU: 0.5158
Exact Match: 0.2400
SQL Accuracy: 0.2400

Example 1:
Predicted: SELECT count(*) FROM singer
Reference: SELECT count(*) FROM singer

Example 2:
Predicted: SELECT count(*) FROM singer
Reference: SELECT count(*) FROM singer

Example 3:
Predicted: SELECT name, country, age FROM singer ORDER BY age DESC
Reference: SELECT name, country, age FROM singer ORDER BY age DESC
***** eval metrics *****
  bleu         = 0.5158
  exact_match  =   0.24
  sql_accuracy =   0.24


In [25]:
# Cell 11 (Interactive Inference Function with 3-question limit)
import sqlite3
import sqlparse

def interactive_text_to_sql(db_path=None):
    """
    Interactive interface for text-to-SQL conversion (3 questions max)
    Args:
        db_path: Optional path to SQLite database for execution
    """
    print("\n" + "="*50)
    print("Text-to-SQL Query Interface")
    print("="*50)
    print("You can enter up to 3 natural language questions about the database.")
    print("Type 'exit' to quit early.\n")
    
    question_count = 0
    
    while question_count < 3:
        # Get user input
        question = input(f"\nEnter question {question_count + 1} of 3: ").strip()
        
        if question.lower() in ['exit', 'quit']:
            print("\nExiting text-to-SQL interface early...")
            break
            
        if not question:
            print("Please enter a valid question.")
            continue
            
        # Generate SQL
        prompt = f"""Translate the following natural language question into SQL query.
        
Question: {question}
SQL Query:"""
        
        try:
            # Generate SQL
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    pad_token_id=tokenizer.eos_token_id
                )
            
            # Extract and clean SQL
            generated_sql = tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True
            ).strip()
            generated_sql = sqlparse.format(generated_sql.split(";")[0] + ";")
            
            print("\n" + "-"*50)
            print(f"Question: {question}")
            print(f"Generated SQL: {generated_sql}")
            
            # Execute if database path provided
            if db_path:
                try:
                    conn = sqlite3.connect(db_path)
                    cursor = conn.cursor()
                    cursor.execute(generated_sql)
                    results = cursor.fetchall()
                    columns = [desc[0] for desc in cursor.description] if cursor.description else []
                    conn.close()
                    
                    if results:
                        print("\nQuery Results:")
                        # Print column headers
                        print(" | ".join(columns))
                        print("-" * (sum(len(col) for col in columns) + 3*len(columns)))
                        # Print rows
                        for row in results:
                            print(" | ".join(str(x) for x in row))
                    else:
                        print("\nQuery executed successfully but returned no results.")
                        
                except Exception as e:
                    print(f"\nError executing query: {str(e)}")
            
            print("-"*50 + "\n")
            
            question_count += 1  # Only increment on successful question processing
            
        except Exception as e:
            print(f"\nError generating SQL: {str(e)}")
    
    print("\nThank you for using the Text-to-SQL interface!")

# Example usage:
if __name__ == "__main__":
    # Uncomment and provide your database path if available
    # interactive_text_to_sql(db_path="your_database.db")
    interactive_text_to_sql()  # Without database execution


Text-to-SQL Query Interface
You can enter up to 3 natural language questions about the database.
Type 'exit' to quit early.




Enter question 1 of 3:  Which statuses correspond to both cities that have a population over 1500 and cities that have a population lower than 500?



--------------------------------------------------
Question: Which statuses correspond to both cities that have a population over 1500 and cities that have a population lower than 500?
Generated SQL: SELECT Status FROM city WHERE Population  >  1500 INTERSECT SELECT Status FROM city WHERE Population  <  500;
--------------------------------------------------




Enter question 2 of 3:  What details do we have on the students who registered for courses most recently?



--------------------------------------------------
Question: What details do we have on the students who registered for courses most recently?
Generated SQL: SELECT T1.student_details FROM Students AS T1 JOIN Student_Course_Registration AS T2 ON T1.student_id  =  T2.student_id ORDER BY T2.registration_date DESC LIMIT 1;
--------------------------------------------------




Enter question 3 of 3:  What are the ids of the candidates that have an outcome code of Pass?



--------------------------------------------------
Question: What are the ids of the candidates that have an outcome code of Pass?
Generated SQL: SELECT candidate_id FROM candidate_outcomes WHERE outcome_code  =  'Pass';
--------------------------------------------------


Thank you for using the Text-to-SQL interface!
