# Lab 3.1.4: 8B Model LoRA Fine-Tuning - Solutions

Complete solutions for the 8B fine-tuning exercises.

## Exercise 1: Complete Training Pipeline

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

def create_complete_training_pipeline(
    model_id: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset_name: str = "tatsu-lab/alpaca",
    output_dir: str = "./lora-finetuned",
    use_dora: bool = True,
    neftune_alpha: float = 5.0,
    lora_r: int = 16,
    lora_alpha: int = 32,
    epochs: int = 1,
    max_samples: int = 1000
) -> tuple:
    """
    Build a 4-bit LoRA/DoRA supervised fine-tuning pipeline without running it.

    The function loads ``model_id`` with NF4 quantization, wraps it with a
    LoRA adapter, formats the first ``max_samples`` training rows of
    ``dataset_name`` into an instruction prompt, and constructs an
    ``SFTTrainer``. Training itself is left to the caller
    (``trainer.train()``).

    Args:
        model_id: Hub id or local path of the causal LM to fine-tune.
        dataset_name: Dataset to load; rows are read through the
            ``instruction``, ``input`` (optional) and ``output`` fields.
        output_dir: Directory handed to the trainer for checkpoints.
        use_dora: Forwarded to ``LoraConfig(use_dora=...)``.
        neftune_alpha: Forwarded to ``SFTConfig(neftune_noise_alpha=...)``.
        lora_r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        epochs: Number of training epochs.
        max_samples: Number of rows taken from the start of the train split.

    Returns:
        A ``(trainer, model, tokenizer)`` tuple.
    """
    # Local import: only needed to probe for the optional flash-attn package.
    import importlib.util

    print("=" * 60)
    print("COMPLETE FINE-TUNING PIPELINE")
    print("=" * 60)

    # 1. Quantization config for memory efficiency
    print("\n1. Setting up quantization...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True
    )

    # 2. Load model
    print("2. Loading model...")
    # A CUDA device alone is not enough for FlashAttention-2: the separate
    # `flash_attn` package must be importable, otherwise loading fails.
    flash_attn_usable = (
        torch.cuda.is_available()
        and importlib.util.find_spec("flash_attn") is not None
    )
    # NOTE(review): trust_remote_code=True allows the model repository to run
    # its own Python code at load time; only use it with trusted model ids.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="flash_attention_2" if flash_attn_usable else "eager"
    )
    model = prepare_model_for_kbit_training(model)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Reuse EOS as the padding token and pad on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 3. LoRA configuration with DoRA
    print("3. Configuring LoRA/DoRA...")
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        bias="none",
        task_type="CAUSAL_LM",
        use_dora=use_dora  # Enable DoRA for better performance
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # 4. Load and format dataset
    print("4. Loading dataset...")
    dataset = load_dataset(dataset_name, split=f"train[:{max_samples}]")

    def format_instruction(example):
        """Render one row as an instruction prompt stored under ``text``."""
        # A missing, None, or whitespace-only "input" means "no extra input".
        has_input = bool((example.get("input") or "").strip())

        sections = [f"### Instruction:\n{example['instruction']}"]
        if has_input:
            sections.append(f"### Input:\n{example['input']}")
        sections.append(f"### Response:\n{example['output']}")
        return {"text": "\n\n".join(sections)}

    dataset = dataset.map(format_instruction)

    # 5. Training configuration with NEFTune
    print("5. Setting up training...")
    training_args = SFTConfig(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        logging_steps=10,
        save_strategy="epoch",
        bf16=True,
        gradient_checkpointing=True,
        optim="paged_adamw_8bit",
        max_seq_length=512,

        # Column produced by format_instruction. Configured here rather than
        # on SFTTrainer, which no longer accepts it in newer TRL releases.
        dataset_text_field="text",

        # NEFTune for better generalization
        neftune_noise_alpha=neftune_alpha,

        # Disable unnecessary features
        report_to="none",
        push_to_hub=False,
    )

    # 6. Create trainer
    # NOTE(review): newer TRL releases rename `tokenizer` to
    # `processing_class` - confirm against the installed version.
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )

    print("\n" + "=" * 60)
    print("Configuration Summary:")
    print(f"  Model: {model_id}")
    print(f"  LoRA rank: {lora_r}")
    print(f"  DoRA enabled: {use_dora}")
    print(f"  NEFTune alpha: {neftune_alpha}")
    print(f"  Training samples: {len(dataset)}")
    print("=" * 60)

    return trainer, model, tokenizer

# Create pipeline
# trainer, model, tokenizer = create_complete_training_pipeline()
# trainer.train()

## Exercise 2: Memory Optimization Analysis

In [None]:
def analyze_memory_usage(
    params_b: float = 8,
    *,
    batch_size: int = 4,
    seq_len: int = 512,
    hidden: int = 4096,
    layers: int = 32,
):
    """
    Estimate training memory for four precision/LoRA setups and print a table.

    All figures are back-of-the-envelope estimates in GB (1e9 bytes); nothing
    is measured on a device.

    Args:
        params_b: Model size in billions of parameters.
        batch_size: Batch size used for the activation estimate.
        seq_len: Sequence length used for the activation estimate.
        hidden: Hidden size used for the activation estimate.
        layers: Number of layers whose activations are kept when gradient
            checkpointing is off.

    Returns:
        A list with one dict per configuration, holding the keys ``config``,
        ``model_gb``, ``optimizer_gb``, ``gradient_gb``, ``activation_gb``
        and ``total_gb``.
    """
    configs = [
        {"name": "FP16 Full", "bits": 16, "lora": False, "grad_ckpt": False},
        {"name": "FP16 + LoRA", "bits": 16, "lora": True, "grad_ckpt": False},
        {"name": "4-bit QLoRA", "bits": 4, "lora": True, "grad_ckpt": False},
        {"name": "4-bit QLoRA + GC", "bits": 4, "lora": True, "grad_ckpt": True},
    ]

    fp32_bytes = 4

    results = []
    for config in configs:
        # Base model weights: billions of params x bytes per param = GB.
        model_mem = params_b * (config["bits"] / 8)

        # With LoRA only ~1% of the parameters are assumed trainable.
        trainable_ratio = 0.01 if config["lora"] else 1.0

        # AdamW keeps two FP32 states (m and v) per trainable parameter.
        optimizer_mem = params_b * trainable_ratio * fp32_bytes * 2

        # One FP32 gradient per trainable parameter.
        gradient_mem = params_b * trainable_ratio * fp32_bytes

        # Activations (rough estimate, 2 bytes per element): gradient
        # checkpointing is modelled as keeping a single layer's worth.
        stored_layers = 1 if config["grad_ckpt"] else layers
        activation_mem = batch_size * seq_len * hidden * stored_layers * 2 / 1e9

        total = model_mem + optimizer_mem + gradient_mem + activation_mem

        results.append({
            "config": config["name"],
            "model_gb": model_mem,
            "optimizer_gb": optimizer_mem,
            "gradient_gb": gradient_mem,
            "activation_gb": activation_mem,
            "total_gb": total
        })

    # Display
    print(f"Memory Analysis for {params_b:g}B Model:")
    print("=" * 80)
    print(f"{'Config':<20} {'Model':>10} {'Optimizer':>10} {'Gradients':>10} {'Activations':>12} {'Total':>10}")
    print("-" * 80)

    for r in results:
        print(f"{r['config']:<20} {r['model_gb']:>9.1f}G {r['optimizer_gb']:>9.1f}G {r['gradient_gb']:>9.1f}G {r['activation_gb']:>11.1f}G {r['total_gb']:>9.1f}G")

    # Report the reduction the table actually shows (first row vs last row)
    # instead of a fixed number that can disagree with it.
    baseline_total = results[0]["total_gb"]
    optimized_total = results[-1]["total_gb"]
    if optimized_total > 0:
        reduction = baseline_total / optimized_total
        print(f"\n\U0001F4A1 Key Insight: 4-bit QLoRA + Gradient Checkpointing reduces memory by ~{reduction:.0f}x!")
    return results

memory_results = analyze_memory_usage()

## Exercise 3: Hyperparameter Tuning

In [None]:
import itertools

def hyperparameter_search_space():
    """
    Print the recommended LoRA presets and return them with the full grid.

    Returns:
        A ``(search_space, recommended_configs)`` tuple. The first item maps
        each hyperparameter name to its list of candidate values; the second
        is a list of named preset dictionaries.
    """
    grid = dict(
        # LoRA parameters
        lora_r=[8, 16, 32, 64],
        lora_alpha=[16, 32, 64],
        lora_dropout=[0.0, 0.05, 0.1],
        # Training parameters
        learning_rate=[1e-4, 2e-4, 5e-4],
        batch_size=[4, 8],
        warmup_ratio=[0.03, 0.1],
        # Regularization
        neftune_alpha=[0, 5, 10],
        use_dora=[False, True],
    )

    # Presets are laid out as a table: one column tuple, one row per preset.
    columns = (
        "name",
        "lora_r", "lora_alpha", "lora_dropout",
        "learning_rate", "batch_size", "warmup_ratio",
        "neftune_alpha", "use_dora",
    )
    rows = (
        ("Baseline LoRA", 16, 32, 0.05, 2e-4, 4, 0.1, 0, False),
        ("DoRA + NEFTune (Recommended)", 16, 32, 0.05, 2e-4, 4, 0.1, 5, True),
        ("High Capacity", 64, 128, 0.1, 1e-4, 4, 0.1, 5, True),
        ("Fast Training", 8, 16, 0.0, 5e-4, 8, 0.03, 0, False),
    )
    presets = [dict(zip(columns, row)) for row in rows]

    print("Recommended Configurations:")
    print("=" * 80)

    for preset in presets:
        print(f"\n{preset['name']}:")
        # Every column except the leading "name" is a hyperparameter.
        for key in columns[1:]:
            print(f"  {key}: {preset[key]}")

    # Size of the full grid: product of the number of candidates per knob.
    combination_count = 1
    for candidates in grid.values():
        combination_count *= len(candidates)
    print(f"\nTotal possible combinations: {combination_count:,}")
    print("Consider using Optuna or Ray Tune for efficient search.")

    return grid, presets

search_space, configs = hyperparameter_search_space()

## Exercise 4: Model Evaluation

In [None]:
def evaluate_finetuned_model(model, tokenizer, prompts: list):
    """
    Generate and print a sampled response for each test prompt.

    Each prompt is wrapped in the ``### Instruction: / ### Response:``
    template, sampled with ``temperature=0.7`` and ``top_p=0.9``, and only the
    newly generated tokens are decoded into the response.

    Args:
        model: Causal LM exposing ``eval()``, ``device`` and ``generate()``.
            It is switched to eval mode and left there.
        tokenizer: Tokenizer matching ``model``.
        prompts: Instruction strings to evaluate.

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts, one per prompt,
        in input order.
    """
    model.eval()
    results = []

    for prompt in prompts:
        formatted = f"### Instruction:\n{prompt}\n\n### Response:\n"

        inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
        # The output sequence starts with the prompt tokens; remember how
        # many there are so the completion can be sliced out by position.
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the generated part. Splitting the decoded text on the
        # "### Response:" marker would cut off any completion that repeats
        # the marker, and depends on the prompt decoding back unchanged.
        generated_ids = outputs[0][prompt_length:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        results.append({
            "prompt": prompt,
            "response": response
        })

        print(f"\n{'='*60}")
        print(f"Prompt: {prompt}")
        print("-" * 60)
        print(f"Response: {response}")

    return results

# Test prompts
test_prompts = [
    "Explain quantum computing in simple terms.",
    "Write a Python function to reverse a string.",
    "What are the benefits of exercise?",
    "Summarize the plot of Romeo and Juliet in one paragraph."
]

# Uncomment after training
# results = evaluate_finetuned_model(model, tokenizer, test_prompts)

## Exercise 5: Save and Load Adapters

In [None]:
from peft import PeftModel

def save_and_load_adapter(model, tokenizer, save_path: str):
    """
    Write the model and tokenizer to ``save_path`` and report what was saved.

    Despite the name, this function only saves; nothing is loaded back here.
    For a PEFT-wrapped model, ``save_pretrained`` is expected to write just
    the adapter weights rather than the full base model.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        save_path: Target directory.
    """
    import os

    print(f"Saving adapter to {save_path}...")
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_path)

    # List what ended up in the target directory.
    files = os.listdir(save_path)
    print(f"\nSaved files: {files}")

    # Add up regular files only; sub-directories are not descended into.
    size_bytes = 0
    for entry in files:
        entry_path = os.path.join(save_path, entry)
        if os.path.isfile(entry_path):
            size_bytes += os.path.getsize(entry_path)
    print(f"Total size: {size_bytes / 1024 / 1024:.2f} MB")

def load_adapter(base_model_id: str, adapter_path: str):
    """
    Load a 4-bit quantized base model and attach a saved LoRA adapter to it.

    Args:
        base_model_id: Hub id or local path of the base causal LM.
        adapter_path: Directory containing the saved adapter and tokenizer
            files.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is the base model
        wrapped by ``PeftModel`` and ``tokenizer`` is read from
        ``adapter_path``.
    """
    print(f"Loading base model: {base_model_id}")

    # Quantization config. It mirrors the settings used by the training
    # pipeline in this file (including double quantization) so the adapter is
    # applied to the same quantized base it was trained against.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=bnb_config,
        device_map="auto"
    )

    print(f"Loading adapter: {adapter_path}")
    model = PeftModel.from_pretrained(base_model, adapter_path)

    # The tokenizer is read from the adapter directory, so it must have been
    # saved there alongside the adapter.
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)

    print("Model loaded successfully!")
    return model, tokenizer

# Example usage:
# save_and_load_adapter(model, tokenizer, "./my-adapter")
# loaded_model, loaded_tokenizer = load_adapter(
#     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
#     "./my-adapter"
# )

## Key Takeaways

1. **QLoRA + DoRA + NEFTune**: The winning combination for efficient fine-tuning
2. **Memory**: 4-bit quantization + gradient checkpointing enables 8B training on consumer GPUs
3. **Hyperparameters**: r=16, alpha=32, lr=2e-4 is a solid starting point
4. **Evaluation**: Always test on diverse prompts before deployment
5. **Saving**: Adapters are tiny (~100MB) vs full model (~16GB)