# Evaluation: Base vs Finetuned (GPU)

## Comparison Goals

After training, we want to:
1. **Quantitative:** Compare perplexity (base vs finetuned)
2. **Qualitative:** Generate samples side-by-side
3. **Document:** Record hyperparameters, costs, latency

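This notebook runs on GPU for speed. Note that the evaluation code below loads the base model in 4-bit and raises an error when no GPU is available, so adapting it for CPU means loading the model without quantization first.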


In [None]:
# === INSTALL DEPENDENCIES (Run this first!) ===
# Installs the packages needed to load and evaluate the finetuned model with
# QLoRA adapters. The package list is spelled out inline in the %pip line below
# (presumably mirroring requirements-gpu.txt - TODO confirm the two stay in sync).
#
# Cell flow:
#   1. Abort on Python >= 3.12.
#   2. Install/upgrade the packages with the IPython %pip magic.
#   3. Import every package, print its version and warn on violated pins.

import sys

# NOTE: this guard fires for 3.12 *and anything newer*, although the message
# only mentions 3.12. SystemExit stops the cell before anything is installed.
if sys.version_info >= (3, 12):
    raise SystemExit(
        "‚ùå Python 3.12 detected. bitsandbytes does not publish wheels for 3.12 yet.\n"
        "   Switch the runtime/kernel to Python 3.10 or 3.11, then rerun this cell."
    )

# IPython magic (not plain Python): this cell only runs inside a notebook kernel.
%pip install -U torch "transformers<4.45" datasets peft "accelerate>=0.27" "bitsandbytes>=0.42.0" trl huggingface_hub

# Verify installation
# Any exception below (failed import, missing metadata, unparsable version) is
# reported by the single handler at the bottom rather than raised.
try:
    import torch
    import transformers
    import datasets
    import peft
    import accelerate
    import bitsandbytes
    import trl
    import huggingface_hub
    from packaging import version
    import importlib.metadata
    
    print("‚úÖ All packages installed successfully:")
    print(f"   - python: {sys.version.split()[0]}")
    print(f"   - torch: {torch.__version__}")
    print(f"   - transformers: {transformers.__version__}")
    print(f"   - datasets: {datasets.__version__}")
    print(f"   - peft: {peft.__version__}")
    print(f"   - accelerate: {accelerate.__version__}")
    # bitsandbytes' version is read from installed package metadata here,
    # not from a module attribute like the other packages.
    print(f"   - bitsandbytes: {importlib.metadata.version('bitsandbytes')}")
    print(f"   - trl: {trl.__version__}")
    print(f"   - huggingface_hub: {huggingface_hub.__version__}")
    
    # Check critical version requirements
    # These checks only print warnings; they never stop the cell.
    bnb_version = importlib.metadata.version("bitsandbytes")
    if version.parse(bnb_version) < version.parse("0.42.0"):
        print("‚ö†Ô∏è  WARNING: bitsandbytes version is < 0.42.0. You may need to restart the runtime.")
        print("   Upgrade with: pip install -U bitsandbytes  # then restart kernel")
    
    # Mirrors the "transformers<4.45" pin in the %pip line above.
    transformers_version = transformers.__version__
    if version.parse(transformers_version) >= version.parse("4.45"):
        print("‚ö†Ô∏è  WARNING: transformers version is >= 4.45. Requirements specify <4.45.")
    
    # Mirrors the "accelerate>=0.27" pin in the %pip line above.
    accelerate_version = accelerate.__version__
    if version.parse(accelerate_version) < version.parse("0.27"):
        print("‚ö†Ô∏è  WARNING: accelerate version is < 0.27. Requirements specify >=0.27.")
        
except Exception as e:
    # Deliberately broad: the cell reports the problem and suggests a restart
    # instead of surfacing a traceback.
    print(f"‚ùå Error checking packages: {e}")
    print("   Please restart the runtime after installation and try again.")


SystemExit: ❌ Python 3.12 detected. bitsandbytes does not publish wheels for 3.12 yet.
   Switch the runtime/kernel to Python 3.10 or 3.11, then rerun this cell.

In [None]:
# === TODO (you code this) ===
# Load base model + attach LoRA adapters; run perplexity on validation slice.
# Hints:
#   - Load base model in 4-bit
#   - Use PeftModel.from_pretrained() to attach adapters
#   - Compute perplexity on validation set (similar to notebook 04, but on GPU)
# Acceptance:
#   - prints ppl_base vs ppl_finetuned

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch
from datasets import load_dataset

def _require_qlora_runtime():
    """Fail fast unless a CUDA GPU and bitsandbytes >= 0.42.0 are available; return the device string."""
    if not torch.cuda.is_available():
        # The base model is always loaded in 4-bit below, so there is no CPU
        # path to "continue" on; stop before anything is downloaded.
        raise RuntimeError(
            "❌ Cannot load QLoRA adapters on CPU!\n"
            "   The adapters were trained with 4-bit quantization (GPU only).\n"
            "   Please run this notebook on GPU (Colab: Runtime → Change runtime type → GPU)."
        )
    print(f"✅ GPU detected: {torch.cuda.get_device_name(0)}")

    try:
        import bitsandbytes  # noqa: F401 - imported only to prove it is installed
    except ImportError as exc:
        raise ImportError(
            "❌ bitsandbytes is REQUIRED to load QLoRA adapters!\n"
            "   The adapters were trained with 4-bit quantization and require the same model structure.\n"
            "   Install it with: !pip install -U bitsandbytes\n"
            "   Then restart the runtime and run this cell again."
        ) from exc

    import importlib.metadata
    from packaging import version

    # The version check lives outside the try block so a "too old" error is
    # never confused with a "not installed" error.
    bnb_version = importlib.metadata.version("bitsandbytes")
    if version.parse(bnb_version) < version.parse("0.42.0"):
        raise ImportError(
            f"❌ bitsandbytes version {bnb_version} is too old! Need >= 0.42.0\n"
            "   Upgrade it with: !pip install -U bitsandbytes\n"
            "   Then restart the runtime and run this cell again."
        )
    print(f"✅ bitsandbytes {bnb_version} is installed and compatible")
    return "cuda"


def _dataset_perplexity(model, tokenizer, dataset, n_samples, device, label):
    """Return the token-level perplexity of ``model`` over the first ``n_samples`` rows of ``dataset``."""
    total_nll = 0.0
    total_targets = 0

    with torch.no_grad():
        for done, sample in enumerate(dataset.select(range(n_samples)), start=1):
            encoded = tokenizer(sample['text'], return_tensors="pt", truncation=True, max_length=512)
            input_ids = encoded['input_ids'].to(device)

            # With labels=input_ids the model shifts the labels by one, so a
            # sequence of L tokens has L - 1 prediction targets and `loss` is
            # the mean over those. Weight by L - 1, not L.
            n_targets = input_ids.size(1) - 1
            if n_targets > 0:
                loss = model(input_ids, labels=input_ids).loss.item()
                total_nll += loss * n_targets
                total_targets += n_targets
            # else: a one-token sample has nothing to predict; its loss would
            # be NaN and poison the running total, so it is skipped.

            if done % 5 == 0:
                print(f"  {label}: Processed {done}/{n_samples} samples...")

    if total_targets == 0:
        raise ValueError("No scorable tokens in the evaluated samples; cannot compute perplexity.")
    return torch.exp(torch.tensor(total_nll / total_targets)).item()


def _print_adapter_diagnostics(finetuned_model, probe_inputs, base_probe_logits):
    """Print best-effort checks that the adapters are attached and change the model's output."""
    print("\n" + "="*60)
    print("ADAPTER DIAGNOSTICS")
    print("="*60)
    try:
        if hasattr(finetuned_model, 'peft_config'):
            print(f"✅ Adapters found: {list(finetuned_model.peft_config.keys())}")
        else:
            print("⚠️  WARNING: No peft_config found - adapters may not be loaded!")

        if hasattr(finetuned_model, 'active_adapters'):
            print(f"✅ Active adapters: {finetuned_model.active_adapters}")
        else:
            print("⚠️  WARNING: Cannot check active adapters")

        # Adapters loaded for inference are frozen, so counting requires_grad
        # parameters would always report 0. Count the LoRA weights by name.
        # NOTE(review): relies on PEFT naming LoRA tensors "lora_*" - confirm
        # for the installed PEFT version.
        adapter_params = sum(
            p.numel() for name, p in finetuned_model.named_parameters() if "lora_" in name
        )
        total_params = sum(p.numel() for p in finetuned_model.parameters())
        print(f"✅ Adapter parameters: {adapter_params:,} / {total_params:,} ({100*adapter_params/total_params:.2f}%)")

        with torch.no_grad():
            finetuned_logits = finetuned_model(**probe_inputs).logits[0, -1, :10].float().cpu()

        logit_diff = torch.abs(base_probe_logits - finetuned_logits).mean().item()
        print(f"✅ Logit difference (first 10 tokens): {logit_diff:.4f}")
        if logit_diff < 0.001:
            print("⚠️  WARNING: Logits are nearly identical! Adapters may not be active.")
            print("   Possible causes:")
            print("   1. Adapters didn't learn meaningful changes during training")
            print("   2. Training loss reduction was minimal (check training logs)")
            print("   3. 4-bit quantization compatibility issue with PEFT version")
            print("   4. Adapters need to be explicitly enabled (try set_adapter if available)")
            print("   Recommendation: Check training logs to verify loss decreased during training")
        else:
            print("✅ Logits differ - adapters appear to be active")

    except Exception as e:
        # Diagnostics are informational only; never let them abort the evaluation.
        print(f"⚠️  Error during diagnostics: {e}")
    print("="*60 + "\n")


def eval_perplexity_with_adapters(base_model: str, adapter_repo: str, dataset, n_samples: int=25):
    """
    Compare perplexity of a 4-bit base model with and without LoRA adapters.

    The base model is scored BEFORE the adapters are attached, because
    ``PeftModel.from_pretrained`` injects the LoRA layers into the base model
    object in place. Any "base" forward pass run after that point would
    silently measure the finetuned model. The adapters are left unmerged:
    merging into 4-bit weights is not needed for inference and would alter
    the weights being evaluated.

    Args:
        base_model: Base model name
        adapter_repo: Hub repo with LoRA adapters
        dataset: Validation dataset (raw text, not tokenized); must support
            ``len()`` and ``.select()`` and yield rows with a ``'text'`` field
        n_samples: Number of samples to evaluate (capped at ``len(dataset)``)

    Returns:
        dict with ``'base_perplexity'``, ``'finetuned_perplexity'`` and
        ``'improvement'`` (base minus finetuned; positive means the finetuned
        model has lower perplexity).

    Raises:
        RuntimeError: no CUDA GPU is available.
        ImportError: bitsandbytes is missing or older than 0.42.0.
        ValueError: there are no samples, or no sample has a token to score.
    """
    import time

    device = _require_qlora_runtime()

    # Validate the workload before downloading a multi-GB model.
    n_samples = min(n_samples, len(dataset))
    if n_samples < 1:
        raise ValueError("Need at least one sample to compute perplexity.")

    print(f"Loading base model {base_model}...")
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.pad_token = tokenizer.eos_token

    # Use 4-bit quantization (same as training) - REQUIRED for adapter compatibility
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    print("Loading model with 4-bit quantization (required for adapter compatibility)...")
    base_model_obj = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=quantization_config,
        # `torch_dtype` is the keyword understood by the transformers range
        # this notebook pins (<4.45).
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    base_model_obj.eval()
    print("✅ Base model loaded")

    # Reference logits for the diagnostics, captured while the model is still
    # adapter-free.
    probe_text = "It was on a dreary night of November that"
    probe_inputs = tokenizer(probe_text, return_tensors="pt", truncation=True, max_length=50)
    probe_inputs = {k: v.to(device) for k, v in probe_inputs.items()}
    with torch.no_grad():
        base_probe_logits = base_model_obj(**probe_inputs).logits[0, -1, :10].float().cpu()

    print(f"\nComputing perplexity on {n_samples} samples...")
    print("⚠️  This may take 5-15 minutes. Be patient!\n")

    print("Computing BASE model perplexity...")
    started = time.time()
    base_perplexity = _dataset_perplexity(
        base_model_obj, tokenizer, dataset, n_samples, device, "Base"
    )
    elapsed_time = time.time() - started

    # From here on base_model_obj carries the LoRA layers; do not use it as a
    # "base" reference again.
    print(f"Loading adapters from {adapter_repo}...")
    finetuned_model = PeftModel.from_pretrained(base_model_obj, adapter_repo)
    finetuned_model.eval()
    print("✅ Finetuned model loaded")

    _print_adapter_diagnostics(finetuned_model, probe_inputs, base_probe_logits)

    print("\nComputing FINETUNED model perplexity...")
    started = time.time()
    finetuned_perplexity = _dataset_perplexity(
        finetuned_model, tokenizer, dataset, n_samples, device, "Finetuned"
    )
    # Only the two scoring passes are timed, not adapter download/diagnostics.
    elapsed_time += time.time() - started

    improvement = base_perplexity - finetuned_perplexity

    # Print results
    print("\n" + "="*60)
    print("PERPLEXITY COMPARISON")
    print("="*60)
    print(f"Base Model:        {base_perplexity:.2f}")
    print(f"Finetuned Model:   {finetuned_perplexity:.2f}")
    print(f"Improvement:       {improvement:.2f} points")
    print(f"Relative Change:   {((finetuned_perplexity - base_perplexity) / base_perplexity * 100):.1f}%")
    print(f"\nSamples evaluated: {n_samples}")
    print(f"Time taken: {elapsed_time/60:.1f} minutes")
    print("="*60)

    return {
        'base_perplexity': base_perplexity,
        'finetuned_perplexity': finetuned_perplexity,
        'improvement': improvement
    }

# Fetch the validation split as raw text and run the perplexity comparison.
import os

from datasets import load_dataset
from huggingface_hub import HfFolder

# Token lookup order:
#   1. the HF_TOKEN environment variable
#   2. the token stored by a previous Hugging Face login
#      (from huggingface_hub import login; login())
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    hf_token = HfFolder.get_token()

if not hf_token:
    print("‚ö†Ô∏è  WARNING: No HF token found. Set HF_TOKEN environment variable or use login()")
    print("   For Colab: Use Colab secrets (HF_TOKEN) or login()")
    # Normalise any falsy value to None: the load below is then attempted
    # anonymously (may fail for private datasets).
    hf_token = None

ds_val = load_dataset(
    "Tuminha/frankenstein-fanfic-snippets",
    token=hf_token,
)['validation']

# Evaluate on a small slice first; raise n_samples once this works.
results = eval_perplexity_with_adapters(
    base_model="mistralai/Mistral-7B-Instruct-v0.2",
    adapter_repo="Tuminha/mistral-frankenstein-qlora",
    dataset=ds_val,
    n_samples=25,
)


✅ GPU detected: Tesla T4
Loading base model mistralai/Mistral-7B-Instruct-v0.2...


ImportError: ❌ bitsandbytes is REQUIRED to load QLoRA adapters!
   The adapters were trained with 4-bit quantization and require the same model structure.
   Install it with: !pip install -U bitsandbytes
   Then restart the runtime and run this cell again.

## Side-by-Side Generation

Generate text with both models using the same prompts. Compare style, coherence, and Frankenstein-like tone.


In [None]:
# === TODO (you code this) ===
# Generate 3-5 short continuations with both models for side-by-side comparison.
# Hints:
#   - Load base model and finetuned (base + adapters)
#   - Use same prompts for both
#   - Print outputs side-by-side or in a table
#   - Use reasonable generation parameters (temperature, top_p)
# Acceptance:
#   - prints paired outputs with fixed prompts

def compare_samples(base_model: str, adapter_repo: str, prompts: list, max_new_tokens: int=100, seed: int=0):
    """
    Generate samples with base and finetuned models for comparison.

    All base-model continuations are generated BEFORE the adapters are
    attached, because ``PeftModel.from_pretrained`` injects the LoRA layers
    into the base model object in place. Generating "base" text afterwards
    would really sample from the finetuned model.

    The random seed is reset to ``seed`` before every generation, so for a
    given prompt both models sample under the same random state. A difference
    in output is then attributable to the adapters rather than sampling noise.

    Args:
        base_model: Base model name
        adapter_repo: Hub repo with LoRA adapters
        prompts: List of prompt strings
        max_new_tokens: Maximum tokens to generate
        seed: Seed applied before each generation (note: this resets torch's
            global RNG state)

    Returns:
        None. The paired generations are printed.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
    from peft import PeftModel
    import torch

    print("Loading models for generation comparison...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    tokenizer.pad_token = tokenizer.eos_token

    # Load base model (same 4-bit setup as in the perplexity evaluation)
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    base_model_obj = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=quantization_config,
        # `torch_dtype` is the keyword understood by the transformers range
        # this notebook pins (<4.45).
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    base_model_obj.eval()
    # Captured once so inputs land on the same device for both passes.
    input_device = base_model_obj.device

    # Generation parameters
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id
    }

    def _continuation(model, prompt):
        """Sample one continuation of ``prompt`` and return only the newly generated text."""
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(input_device) for k, v in inputs.items()}
        torch.manual_seed(seed)
        with torch.no_grad():
            output_ids = model.generate(**inputs, **generation_kwargs)
        # Cut by token count: the decoded text is not guaranteed to start with
        # the prompt character-for-character, so slicing the string by
        # len(prompt) can clip or leak text.
        prompt_len = inputs["input_ids"].shape[1]
        return tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True).strip()

    print("Generating base-model samples (before adapters are attached)...")
    base_generations = [_continuation(base_model_obj, prompt) for prompt in prompts]

    # Load finetuned model. From here on base_model_obj carries the LoRA layers.
    finetuned_model = PeftModel.from_pretrained(base_model_obj, adapter_repo)
    finetuned_model.eval()

    print("✅ Models loaded for generation\n")

    print("="*80)
    print("SIDE-BY-SIDE GENERATION COMPARISON")
    print("="*80)

    for i, (prompt, base_generated) in enumerate(zip(prompts, base_generations), 1):
        print(f"\n{'='*80}")
        print(f"PROMPT {i}: {prompt}")
        print("="*80)

        print("\n📘 BASE MODEL:")
        print("-" * 80)
        print(base_generated)

        print("\n📗 FINETUNED MODEL:")
        print("-" * 80)
        finetuned_generated = _continuation(finetuned_model, prompt)
        print(finetuned_generated)

        # Highlight differences (simple comparison; meaningful because both
        # generations were sampled from the same seed)
        if base_generated != finetuned_generated:
            print("\n✅ Outputs differ - adapters are affecting generation")
        else:
            print("\n⚠️  Outputs are identical - adapters may not be active")

    print("\n" + "="*80)
    print("COMPARISON COMPLETE")
    print("="*80)

# Fixed prompts, so the base and finetuned generations are directly comparable.
prompts = [
    "It was on a dreary night of November that",
    "The monster gazed upon his creator with",
    "I beheld the wretch‚Äîthe miserable monster",
    "Life and death appeared to me ideal bounds",
]

# Same base model / adapter repo pair as the perplexity evaluation.
compare_samples(
    base_model="mistralai/Mistral-7B-Instruct-v0.2",
    adapter_repo="Tuminha/mistral-frankenstein-qlora",
    prompts=prompts,
    max_new_tokens=100,
)
