# Evaluation: Base vs Finetuned (GPU)

## Comparison Goals

After training, we want to:
1. **Quantitative:** Compare perplexity (base vs finetuned)
2. **Qualitative:** Generate samples side-by-side
3. **Document:** Record hyperparameters, costs, latency

This notebook runs on GPU for speed, but you can adapt it for CPU if needed.


In [None]:
# === TODO (you code this) ===
# Load base model + attach LoRA adapters; run perplexity on validation slice.
# Hints:
#   - Load base model in 4-bit
#   - Use PeftModel.from_pretrained() to attach adapters
#   - Compute perplexity on validation set (similar to notebook 04, but on GPU)
# Acceptance:
#   - prints ppl_base vs ppl_finetuned

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch
from datasets import load_dataset

def _compute_perplexity(model, tokenizer, dataset, n_samples: int, device: str, label: str) -> float:
    """
    Return the token-level perplexity of `model` over the first `n_samples` texts.

    Args:
        model: Causal LM called as `model(input_ids, labels=input_ids)`; its
            output must expose a scalar `.loss`.
        tokenizer: Tokenizer called on each sample's 'text' field.
        dataset: Dataset supporting `.select(range(n))`, yielding dicts with a 'text' key.
        n_samples: Number of leading samples to evaluate.
        device: Device string the input ids are moved to ("cuda" or "cpu").
        label: Prefix used in progress messages (e.g. "Base", "Finetuned").

    Raises:
        ValueError: If no sample contributes at least one predicted token.
    """
    import math

    total_nll = 0.0
    total_tokens = 0

    with torch.no_grad():
        for i, sample in enumerate(dataset.select(range(n_samples))):
            encoded = tokenizer(sample['text'], return_tensors="pt", truncation=True, max_length=512)
            input_ids = encoded['input_ids'].to(device)

            # Causal-LM heads shift the labels internally, so a sequence of T
            # tokens yields T - 1 predictions and `loss` is the mean over those.
            # Weight by T - 1 (not T) to recover the summed NLL, and skip
            # sequences with nothing to predict (their loss would be NaN).
            n_predicted = input_ids.size(1) - 1
            if n_predicted >= 1:
                outputs = model(input_ids, labels=input_ids)
                total_nll += outputs.loss.item() * n_predicted
                total_tokens += n_predicted

            if (i + 1) % 5 == 0:
                print(f"  {label}: Processed {i + 1}/{n_samples} samples...")

    if total_tokens == 0:
        raise ValueError("No predictable tokens found: every evaluated sample has fewer than 2 tokens.")

    return math.exp(total_nll / total_tokens)


def eval_perplexity_with_adapters(base_model: str, adapter_repo: str, dataset, n_samples: int=25):
    """
    Evaluate perplexity with base and finetuned models.

    The base model is scored BEFORE the LoRA adapters are attached:
    `PeftModel.from_pretrained` injects the adapter layers into the base model
    object in place, so scoring the "base" model afterwards would silently
    measure the finetuned model twice.

    Args:
        base_model: Base model name
        adapter_repo: Hub repo with LoRA adapters
        dataset: Validation dataset (raw text, not tokenized)
        n_samples: Number of samples to evaluate (clamped to len(dataset))

    Returns:
        Dict with 'base_perplexity', 'finetuned_perplexity' and 'improvement'
        (base minus finetuned; positive means the finetuned model is better).

    Raises:
        ValueError: If there is nothing to evaluate (empty dataset,
            n_samples <= 0, or every sample shorter than 2 tokens).
    """
    import time

    # Validate up front so we fail fast instead of after loading a 7B model.
    n_samples = min(n_samples, len(dataset))
    if n_samples <= 0:
        raise ValueError("No samples to evaluate: dataset is empty or n_samples <= 0.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cpu":
        print("⚠️  WARNING: No GPU detected! This will be very slow.")
        print("   For faster evaluation, enable GPU in Colab: Runtime → Change runtime type → GPU")
        print("   Continuing on CPU (this may take 30-60 minutes)...\n")
    else:
        print(f"✅ GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"Loading base model {base_model}...")

    # Load tokenizer first
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.pad_token = tokenizer.eos_token

    # Check if bitsandbytes is available (required for 4-bit quantization)
    try:
        import bitsandbytes  # noqa: F401  (imported only to probe availability)
        use_quantization = device == "cuda"  # Only use quantization on GPU
    except ImportError:
        use_quantization = False
        if device == "cuda":
            print("⚠️  bitsandbytes not installed. Loading model without quantization (will use more memory).")
        else:
            print("ℹ️  bitsandbytes not available on CPU. Loading model without quantization.")

    # Load base model with or without quantization
    load_kwargs = {}

    if use_quantization:
        # Use 4-bit quantization (same as training) - GPU only
        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )
        load_kwargs["torch_dtype"] = torch.bfloat16
        load_kwargs["device_map"] = "auto"
        print("Loading model with 4-bit quantization...")
    else:
        # Load without quantization (CPU or if bitsandbytes unavailable)
        if device == "cuda":
            load_kwargs["torch_dtype"] = torch.bfloat16
            load_kwargs["device_map"] = "auto"
        else:
            load_kwargs["torch_dtype"] = torch.float32
        print("Loading model without quantization...")

    base_model_obj = AutoModelForCausalLM.from_pretrained(
        base_model,
        **load_kwargs
    )

    # Quantization is never enabled on CPU, so the device check alone suffices.
    if device == "cpu":
        base_model_obj = base_model_obj.to("cpu")
    base_model_obj.eval()

    print("✅ Base model loaded")

    print(f"\nComputing perplexity on {n_samples} samples...")
    print("⚠️  This may take 5-15 minutes. Be patient!\n")

    # Score the pristine base model while no adapter layers are attached.
    print("Computing BASE model perplexity...")
    phase_start = time.time()
    base_perplexity = _compute_perplexity(
        base_model_obj, tokenizer, dataset, n_samples, device, "Base"
    )
    elapsed_time = time.time() - phase_start

    print(f"\nLoading adapters from {adapter_repo}...")

    # Load finetuned model (base + adapters). From here on base_model_obj
    # itself carries the LoRA layers and must not be used as the baseline.
    finetuned_model = PeftModel.from_pretrained(base_model_obj, adapter_repo)
    finetuned_model.eval()

    print("✅ Finetuned model loaded")

    print("\nComputing FINETUNED model perplexity...")
    phase_start = time.time()
    finetuned_perplexity = _compute_perplexity(
        finetuned_model, tokenizer, dataset, n_samples, device, "Finetuned"
    )
    # Only evaluation time is reported; adapter download/loading is excluded.
    elapsed_time += time.time() - phase_start

    improvement = base_perplexity - finetuned_perplexity

    # Print results
    print("\n" + "="*60)
    print("PERPLEXITY COMPARISON")
    print("="*60)
    print(f"Base Model:        {base_perplexity:.2f}")
    print(f"Finetuned Model:   {finetuned_perplexity:.2f}")
    print(f"Improvement:       {improvement:.2f} points")
    print(f"Relative Change:   {((finetuned_perplexity - base_perplexity) / base_perplexity * 100):.1f}%")
    print(f"\nSamples evaluated: {n_samples}")
    print(f"Time taken: {elapsed_time/60:.1f} minutes")
    print("="*60)

    return {
        'base_perplexity': base_perplexity,
        'finetuned_perplexity': finetuned_perplexity,
        'improvement': improvement
    }

# Load dataset (raw text, not tokenized)
from datasets import load_dataset
from huggingface_hub import HfFolder
import os

# Resolve the Hugging Face token (same lookup as notebook 10): environment
# variables win, then fall back to the token cached by a previous login.
hf_token = (
    os.getenv("HF_TOKEN")
    or os.getenv("HUGGINGFACE_HUB_TOKEN")
    or HfFolder.get_token()
)

# Only the validation split is needed for evaluation.
ds_val = load_dataset(
    "Tuminha/frankenstein-fanfic-snippets",
    token=hf_token,
)['validation']

# Run the base-vs-finetuned perplexity comparison.
# Start with 25 samples; increase later for a more stable estimate.
results = eval_perplexity_with_adapters(
    base_model="mistralai/Mistral-7B-Instruct-v0.2",
    adapter_repo="Tuminha/mistral-frankenstein-qlora",
    dataset=ds_val,
    n_samples=25,
)


   For faster evaluation, enable GPU in Colab: Runtime → Change runtime type → GPU
   Continuing on CPU (this may take 30-60 minutes)...

Loading base model mistralai/Mistral-7B-Instruct-v0.2...


PackageNotFoundError: No package metadata was found for bitsandbytes

## Side-by-Side Generation

Generate text with both models using the same prompts. Compare style, coherence, and Frankenstein-like tone.


In [None]:
# === TODO (you code this) ===
# Generate 3-5 short continuations with both models for side-by-side comparison.
# Hints:
#   - Load base model and finetuned (base + adapters)
#   - Use same prompts for both
#   - Print outputs side-by-side or in a table
#   - Use reasonable generation parameters (temperature, top_p)
# Acceptance:
#   - prints paired outputs with fixed prompts

def compare_samples(base_model: str, adapter_repo: str, prompts: list, max_new_tokens: int=100):
    """
    Generate samples with base and finetuned models for comparison.

    Placeholder for the exercise described in the TODO above: the body is not
    implemented yet and calling this function always raises.

    Args:
        base_model: Base model name
        adapter_repo: Hub repo with LoRA adapters
        prompts: List of prompt strings
        max_new_tokens: Maximum tokens to generate

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    raise NotImplementedError

# Compare
# Fixed prompts so both models are compared on identical inputs.
prompts = [
    "It was on a dreary night of November that",
    "The monster gazed upon his creator with",
    "I beheld the wretch—the miserable monster",
    "Life and death appeared to me ideal bounds"
]
# The call below is left commented out because compare_samples currently
# raises NotImplementedError; uncomment it once the function is implemented
# and YOURUSER is replaced with the adapter repo owner.
# compare_samples(
#     "mistralai/Mistral-7B-Instruct-v0.2",
#     "YOURUSER/mistral-frankenstein-qlora",
#     prompts,
#     max_new_tokens=100
# )
