# Lab 3.1.3 Solutions: 70B Model QLoRA Fine-Tuning (DGX Spark Showcase)

**Module:** 3.1 - Large Language Model Fine-Tuning  
**Difficulty:** ⭐⭐⭐⭐☆ (Advanced)  
**Exercises:** 3 (Memory Estimation, Memory Profiler, Config Optimizer)

This notebook contains solutions for exercises in the 70B QLoRA fine-tuning notebook.

**Note:** This is the DGX Spark showcase task - fine-tuning a 70B model on a single desktop!

---

---

## Exercise 1 Solution: Memory Estimation Calculator

**Task:** Create a function that estimates memory requirements for different model sizes and configurations.

In [None]:
from dataclasses import dataclass
from typing import Dict, Optional
import math

@dataclass
class ModelConfig:
    """Model architecture configuration.

    ``num_kv_heads`` is optional. ``None`` means standard multi-head
    attention (one key/value head per query head). Models that use
    grouped-query attention (GQA) should set it to their smaller key/value
    head count so that the KV-cache estimate is not inflated.
    """
    name: str
    params_billions: float
    hidden_size: int
    num_layers: int
    num_attention_heads: int
    intermediate_size: int
    vocab_size: int = 128256  # Llama 3 default
    # Added last (with a default) so existing positional construction keeps working.
    num_kv_heads: Optional[int] = None  # None -> same as num_attention_heads


# Common model configurations.
# Every Llama 3.1 size uses grouped-query attention with 8 key/value heads.
MODEL_CONFIGS = {
    "llama-3.1-8b": ModelConfig(
        name="Llama 3.1 8B",
        params_billions=8.0,
        hidden_size=4096,
        num_layers=32,
        num_attention_heads=32,
        intermediate_size=14336,
        num_kv_heads=8,
    ),
    "llama-3.1-70b": ModelConfig(
        name="Llama 3.1 70B",
        params_billions=70.0,
        hidden_size=8192,
        num_layers=80,
        num_attention_heads=64,
        intermediate_size=28672,
        num_kv_heads=8,
    ),
    "llama-3.1-405b": ModelConfig(
        name="Llama 3.1 405B",
        params_billions=405.0,
        hidden_size=16384,
        num_layers=126,
        num_attention_heads=128,
        intermediate_size=53248,
        num_kv_heads=8,
    ),
}


def estimate_memory_requirements(
    model_config: ModelConfig,
    batch_size: int = 1,
    sequence_length: int = 2048,
    lora_rank: int = 16,
    num_lora_modules: int = 4,  # e.g., q, k, v, o projections
    quantization_bits: int = 4,  # 4 for QLoRA, 16 for full precision
    gradient_checkpointing: bool = True,
    optimizer_states_per_param: int = 2,  # AdamW: 2 (momentum + variance)
) -> Dict[str, object]:
    """
    Estimate memory requirements for model training/inference.

    All figures are rough, first-order estimates expressed in decimal GB
    (bytes / 1e9).

    Args:
        model_config: Model configuration
        batch_size: Training batch size (must be >= 1)
        sequence_length: Maximum sequence length (must be >= 1)
        lora_rank: LoRA rank (r)
        num_lora_modules: Number of modules with LoRA adapters per layer.
            Each adapted module is approximated as hidden_size x hidden_size.
        quantization_bits: Bits for base model weights (4 for QLoRA)
        gradient_checkpointing: Whether gradient checkpointing is enabled
        optimizer_states_per_param: Optimizer state multiplier

    Returns:
        Dictionary of estimates. The ``*_gb`` entries are floats in GB;
        ``lora_params`` is an int, ``model_name`` and ``quantization`` are
        strings, and ``fits_dgx_spark_128gb`` / ``gradient_checkpointing``
        are booleans.

    Raises:
        ValueError: If ``batch_size``, ``sequence_length`` or
            ``quantization_bits`` is not positive, or if the number of
            key/value heads is not between 1 and ``num_attention_heads``.
    """
    if batch_size < 1 or sequence_length < 1:
        raise ValueError("batch_size and sequence_length must be >= 1")
    if quantization_bits <= 0:
        raise ValueError("quantization_bits must be positive")

    num_heads = model_config.num_attention_heads
    # getattr keeps duck-typed / older config objects without the field working.
    num_kv_heads = getattr(model_config, "num_kv_heads", None)
    if num_kv_heads is None:
        num_kv_heads = num_heads
    if not 1 <= num_kv_heads <= num_heads:
        raise ValueError(
            f"num_kv_heads must be between 1 and num_attention_heads "
            f"({num_heads}), got {num_kv_heads}"
        )

    params = model_config.params_billions * 1e9

    # 1. Base model weights
    bytes_per_param = quantization_bits / 8
    model_memory_gb = (params * bytes_per_param) / 1e9

    # 2. LoRA parameters (always in full precision)
    lora_params_per_layer = (
        num_lora_modules * 2 *  # A and B matrices
        lora_rank * model_config.hidden_size
    )
    total_lora_params = lora_params_per_layer * model_config.num_layers
    lora_memory_gb = (total_lora_params * 4) / 1e9  # FP32

    # 3. Optimizer states (only for LoRA params)
    optimizer_memory_gb = (
        total_lora_params * optimizer_states_per_param * 4
    ) / 1e9  # FP32

    # 4. Gradients (only for LoRA params)
    gradient_memory_gb = (total_lora_params * 4) / 1e9  # FP32

    # 5. Activations (this is approximate)
    # Per layer: attention + MLP activations
    bytes_per_activation = 2  # BF16

    # Attention activations: Q/K/V projections plus the full attention matrix
    attention_activations = (
        3 * batch_size * sequence_length * model_config.hidden_size +  # Q, K, V
        batch_size * num_heads * sequence_length ** 2  # Attention weights
    )

    # MLP activations
    mlp_activations = (
        batch_size * sequence_length * model_config.intermediate_size * 2
    )

    activations_per_layer = (attention_activations + mlp_activations) * bytes_per_activation

    if gradient_checkpointing:
        # Only ~sqrt(L) layers' worth of activations are kept (rounded down)
        num_stored_layers = int(math.sqrt(model_config.num_layers))
        activation_memory_gb = (activations_per_layer * num_stored_layers) / 1e9
    else:
        activation_memory_gb = (activations_per_layer * model_config.num_layers) / 1e9

    # 6. KV Cache (for inference/generation)
    # The cache holds K and V for the key/value heads only. With GQA this is
    # hidden_size * num_kv_heads / num_attention_heads values per token, not
    # the full hidden_size.
    kv_hidden_size = model_config.hidden_size * num_kv_heads // num_heads
    kv_cache_per_layer = (
        2 * batch_size * sequence_length * kv_hidden_size * 2  # K and V, BF16
    )
    kv_cache_memory_gb = (kv_cache_per_layer * model_config.num_layers) / 1e9

    # 7. Buffer and overhead (typically 10-20%)
    overhead_factor = 0.15

    # Total for training
    training_total = (
        model_memory_gb +
        lora_memory_gb +
        optimizer_memory_gb +
        gradient_memory_gb +
        activation_memory_gb
    )
    training_with_overhead = training_total * (1 + overhead_factor)

    # Total for inference
    inference_total = model_memory_gb + lora_memory_gb + kv_cache_memory_gb
    inference_with_overhead = inference_total * (1 + overhead_factor)

    return {
        "model_name": model_config.name,
        "base_model_weights_gb": model_memory_gb,
        "lora_params": total_lora_params,
        "lora_params_gb": lora_memory_gb,
        "optimizer_states_gb": optimizer_memory_gb,
        "gradients_gb": gradient_memory_gb,
        "activations_gb": activation_memory_gb,
        "kv_cache_gb": kv_cache_memory_gb,
        "training_total_gb": training_with_overhead,
        "inference_total_gb": inference_with_overhead,
        "fits_dgx_spark_128gb": training_with_overhead < 128,
        "quantization": f"{quantization_bits}-bit",
        "gradient_checkpointing": gradient_checkpointing,
    }


# Worked example: QLoRA on Llama 3.1 70B with the default notebook settings
estimates_70b = estimate_memory_requirements(
    MODEL_CONFIGS["llama-3.1-70b"],
    batch_size=1,
    sequence_length=2048,
    lora_rank=16,
    quantization_bits=4,
    gradient_checkpointing=True,
)

# Emit the whole report in one call; each argument becomes one output line.
print(
    "Memory Estimation: Llama 3.1 70B with QLoRA",
    "=" * 50,
    f"Base model weights: {estimates_70b['base_model_weights_gb']:.1f} GB",
    f"LoRA parameters: {estimates_70b['lora_params']:,} ({estimates_70b['lora_params_gb']:.3f} GB)",
    f"Optimizer states: {estimates_70b['optimizer_states_gb']:.3f} GB",
    f"Gradients: {estimates_70b['gradients_gb']:.3f} GB",
    f"Activations: {estimates_70b['activations_gb']:.1f} GB",
    f"\nTotal Training Memory: {estimates_70b['training_total_gb']:.1f} GB",
    f"Total Inference Memory: {estimates_70b['inference_total_gb']:.1f} GB",
    f"\nFits on DGX Spark (128GB): {estimates_70b['fits_dgx_spark_128gb']}",
    sep="\n",
)

In [None]:
# Compare different configurations
import pandas as pd

# Each entry: (model_key, batch_size, seq_len, lora_rank, quant_bits, grad_ckpt)
configurations = [
    ("llama-3.1-8b", 4, 2048, 16, 16, True),   # Full precision 8B
    ("llama-3.1-8b", 4, 2048, 16, 4, True),    # QLoRA 8B
    ("llama-3.1-70b", 1, 2048, 16, 16, True),  # Full precision 70B (won't fit!)
    ("llama-3.1-70b", 1, 2048, 16, 4, True),   # QLoRA 70B
    ("llama-3.1-70b", 2, 4096, 32, 4, True),   # QLoRA 70B larger batch
    ("llama-3.1-405b", 1, 2048, 8, 4, True),   # QLoRA 405B
]

# One table row per configuration, in the order listed above.
results = []
for model_key, bs, seq_len, rank, bits, grad_ckpt in configurations:
    est = estimate_memory_requirements(
        MODEL_CONFIGS[model_key],
        batch_size=bs,
        sequence_length=seq_len,
        lora_rank=rank,
        quantization_bits=bits,
        gradient_checkpointing=grad_ckpt,
    )
    results.append(
        {
            "Model": est["model_name"],
            "Batch": bs,
            "SeqLen": seq_len,
            "Rank": rank,
            "Quant": est["quantization"],
            # Pre-formatted so the table shows one decimal place
            "Training GB": format(est["training_total_gb"], ".1f"),
            "Fits 128GB": "No" if not est["fits_dgx_spark_128gb"] else "Yes",
        }
    )

df = pd.DataFrame(results)
print(
    "\nMemory Comparison Across Configurations:",
    df.to_string(index=False),
    "\nKey Insight: QLoRA makes 70B training possible on DGX Spark!",
    sep="\n",
)

---

## Exercise 2 Solution: Memory Profiler for Training

**Task:** Create a memory profiler that tracks GPU memory during training steps.

In [None]:
import torch
import time
from typing import List, Dict, Callable, Any
from dataclasses import dataclass, field
from contextlib import contextmanager
import matplotlib.pyplot as plt

@dataclass
class MemorySnapshot:
    """A single memory measurement.

    One instance is appended to ``GPUMemoryProfiler.snapshots`` for every
    ``GPUMemoryProfiler.snapshot()`` call.
    """
    timestamp: float  # seconds elapsed since the profiler's first snapshot
    phase: str  # label given to snapshot(); track_phase() uses "<phase>_start" / "<phase>_end"
    allocated_gb: float  # torch.cuda.memory_allocated() in GB (bytes / 1e9)
    reserved_gb: float  # torch.cuda.memory_reserved() in GB (bytes / 1e9)
    cached_gb: float  # filled from the same memory_reserved() reading as reserved_gb


class GPUMemoryProfiler:
    """
    Profile GPU memory usage during model training.

    Works with DGX Spark's unified memory architecture.

    Snapshots are taken explicitly via ``snapshot()`` or at the boundaries
    of a ``track_phase()`` block. Call ``reset()`` right before profiling so
    that the CUDA allocator's peak counter only covers the profiled region.
    Without CUDA every reading is 0.
    """

    # Suffixes that track_phase() appends to the phase name.
    _START_SUFFIX = "_start"
    _END_SUFFIX = "_end"

    def __init__(self, device: int = 0):
        self.device = device
        self.snapshots: List[MemorySnapshot] = []
        self.start_time = None
        self.phase_starts: Dict[str, float] = {}

    def reset(self):
        """Clear all snapshots and reset the allocator's peak statistics."""
        self.snapshots = []
        self.start_time = None
        self.phase_starts = {}
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats(self.device)
            torch.cuda.empty_cache()

    def _get_memory_gb(self) -> Dict[str, float]:
        """Get current memory statistics in GB (all zeros without CUDA)."""
        if not torch.cuda.is_available():
            return {"allocated": 0, "reserved": 0, "cached": 0}

        reserved = torch.cuda.memory_reserved(self.device) / 1e9
        return {
            "allocated": torch.cuda.memory_allocated(self.device) / 1e9,
            "reserved": reserved,
            "cached": reserved,  # Same as reserved in PyTorch
        }

    def snapshot(self, phase: str = "unknown"):
        """Take a memory snapshot labelled ``phase``."""
        now = time.time()
        if self.start_time is None:
            # The first snapshot defines t = 0 for the timeline.
            self.start_time = now

        mem = self._get_memory_gb()
        self.snapshots.append(MemorySnapshot(
            timestamp=now - self.start_time,
            phase=phase,
            allocated_gb=mem["allocated"],
            reserved_gb=mem["reserved"],
            cached_gb=mem["cached"],
        ))

    @contextmanager
    def track_phase(self, phase: str):
        """
        Context manager to track a training phase.

        Records a "<phase>_start" snapshot on entry and a "<phase>_end"
        snapshot on exit (also when the body raises).

        Usage:
            with profiler.track_phase("forward"):
                outputs = model(inputs)
        """
        self.snapshot(f"{phase}{self._START_SUFFIX}")
        try:
            yield
        finally:
            if torch.cuda.is_available():
                # Wait for queued kernels so the end snapshot reflects the phase.
                torch.cuda.synchronize()
            self.snapshot(f"{phase}{self._END_SUFFIX}")

    @classmethod
    def _phase_name(cls, start_label: str) -> str:
        """Strip the trailing "_start" marker from a snapshot label."""
        # Only the suffix is removed; str.replace would also delete "_start"
        # occurring inside the phase name itself (e.g. "cold_start_load").
        return start_label[: -len(cls._START_SUFFIX)]

    def get_peak_memory(self) -> float:
        """
        Get peak allocated memory in GB.

        Returns 0.0 when no snapshot was recorded. Otherwise returns the
        larger of the highest snapshot reading and, when CUDA is available,
        the allocator's own high-water mark. The latter also captures
        transient peaks that occur between snapshots (e.g. in the middle of
        a backward pass). It counts from the last ``reset()`` call.
        """
        if not self.snapshots:
            return 0.0
        peak = max(s.allocated_gb for s in self.snapshots)
        if torch.cuda.is_available():
            peak = max(peak, torch.cuda.max_memory_allocated(self.device) / 1e9)
        return peak

    def get_phase_memory_delta(self, phase: str) -> float:
        """
        Get memory increase during a phase.

        Uses the most recent "<phase>_start" and "<phase>_end" snapshots;
        returns 0.0 if either is missing.
        """
        start_label = f"{phase}{self._START_SUFFIX}"
        end_label = f"{phase}{self._END_SUFFIX}"
        start_snap = None
        end_snap = None

        for s in self.snapshots:
            if s.phase == start_label:
                start_snap = s
            elif s.phase == end_label:
                end_snap = s

        if start_snap is None or end_snap is None:
            return 0.0
        return end_snap.allocated_gb - start_snap.allocated_gb

    def summary(self) -> Dict[str, Any]:
        """
        Get a summary of memory usage.

        Returns ``{"error": ...}`` when nothing was recorded, otherwise a
        dict with snapshot count, peak/min/final allocated GB, per-phase
        deltas (in order of first appearance) and the total duration.
        """
        if not self.snapshots:
            return {"error": "No snapshots recorded"}

        allocated_values = [s.allocated_gb for s in self.snapshots]

        # A dict used as an ordered set keeps phases in the order they started.
        phases = {
            self._phase_name(s.phase): None
            for s in self.snapshots
            if s.phase.endswith(self._START_SUFFIX)
        }
        phase_deltas = {p: self.get_phase_memory_delta(p) for p in phases}

        return {
            "num_snapshots": len(self.snapshots),
            "peak_allocated_gb": self.get_peak_memory(),
            "min_allocated_gb": min(allocated_values),
            "final_allocated_gb": allocated_values[-1],
            "phase_deltas_gb": phase_deltas,
            "total_duration_s": self.snapshots[-1].timestamp,
        }

    def plot_memory_timeline(self, save_path: str = None):
        """
        Plot memory usage over time.

        Args:
            save_path: If given, the figure is also written to this path.
        """
        if not self.snapshots:
            print("No snapshots to plot")
            return

        timestamps = [s.timestamp for s in self.snapshots]
        allocated = [s.allocated_gb for s in self.snapshots]
        reserved = [s.reserved_gb for s in self.snapshots]

        fig, ax = plt.subplots(figsize=(12, 6))

        ax.plot(timestamps, allocated, 'b-', label='Allocated', linewidth=2)
        ax.plot(timestamps, reserved, 'r--', label='Reserved', linewidth=2, alpha=0.7)

        # Add DGX Spark reference line. It has to exist before ax.legend()
        # is called, otherwise its label is left out of the legend.
        ax.axhline(y=128, color='green', linestyle='--', alpha=0.5,
                  label='DGX Spark 128GB')

        # Mark phase boundaries
        colors = plt.cm.Set3(range(10))
        label_y = ax.get_ylim()[1] * 0.95  # just below the top of the axes
        start_snaps = [s for s in self.snapshots
                       if s.phase.endswith(self._START_SUFFIX)]
        for phase_idx, s in enumerate(start_snaps):
            ax.axvline(s.timestamp, color=colors[phase_idx % 10],
                      linestyle=':', alpha=0.7)
            ax.annotate(self._phase_name(s.phase), (s.timestamp, label_y),
                       rotation=90, fontsize=8)

        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel('Memory (GB)')
        ax.set_title('GPU Memory Usage During Training Step')
        ax.legend()
        ax.grid(True, alpha=0.3)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=150, bbox_inches='tight')

        plt.show()


# Confirmation banner plus a short usage reminder, one argument per output line
print(
    "GPUMemoryProfiler class defined!",
    "\nUsage example:",
    "  profiler = GPUMemoryProfiler()",
    "  with profiler.track_phase('forward'):",
    "      outputs = model(inputs)",
    "  print(profiler.summary())",
    sep="\n",
)

In [None]:
# Demo the profiler with a simulated training step
import torch.nn as nn

# Create a simple model
class DemoModel(nn.Module):
    """Stack of square Linear layers, each followed by a shared GELU."""

    def __init__(self, hidden_size=1024, num_layers=4):
        super().__init__()
        linears = []
        for _ in range(num_layers):
            linears.append(nn.Linear(hidden_size, hidden_size))
        self.layers = nn.ModuleList(linears)
        self.activation = nn.GELU()

    def forward(self, x):
        """Apply every Linear + GELU pair in order and return the result."""
        hidden = x
        for linear in self.layers:
            hidden = linear(hidden)
            hidden = self.activation(hidden)
        return hidden

# Run one profiled training step, but only when a CUDA device is present
if not torch.cuda.is_available():
    print("CUDA not available - skipping GPU profiling demo")
else:
    profiler = GPUMemoryProfiler()
    model = DemoModel(hidden_size=2048, num_layers=8).cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    # Start from a clean slate so the numbers cover this step only
    profiler.reset()

    # Random inputs and regression targets of identical shape
    batch = torch.randn(32, 512, 2048).cuda()
    target = torch.randn(32, 512, 2048).cuda()

    # Each stage of the step gets its own tracked phase
    with profiler.track_phase("forward"):
        output = model(batch)

    with profiler.track_phase("loss"):
        loss = nn.functional.mse_loss(output, target)

    with profiler.track_phase("backward"):
        loss.backward()

    with profiler.track_phase("optimizer"):
        optimizer.step()
        optimizer.zero_grad()

    # Report headline numbers, then the per-phase deltas
    summary = profiler.summary()
    print(
        "Memory Profiling Summary:",
        "=" * 40,
        f"Peak memory: {summary['peak_allocated_gb']:.2f} GB",
        f"Final memory: {summary['final_allocated_gb']:.2f} GB",
        f"Duration: {summary['total_duration_s']:.3f} s",
        "\nPhase memory deltas:",
        sep="\n",
    )
    for phase, delta in summary['phase_deltas_gb'].items():
        print(f"  {phase}: {delta:+.3f} GB")

    # Visualise the timeline
    profiler.plot_memory_timeline()

---

## Exercise 3 Solution: QLoRA Configuration Optimizer

**Task:** Create a function that suggests optimal QLoRA configurations based on available memory.

In [None]:
from typing import List, Tuple, Optional
from dataclasses import dataclass

@dataclass
class QLoRAConfig:
    """A candidate QLoRA training setup together with its memory estimate."""
    batch_size: int
    gradient_accumulation: int
    sequence_length: int
    lora_rank: int
    lora_alpha: int
    target_modules: List[str]
    estimated_memory_gb: float
    effective_batch_size: int

    def __str__(self):
        """Render the configuration as an indented, multi-line report."""
        report_lines = [
            "QLoRA Config:",
            f"  Batch size: {self.batch_size} × {self.gradient_accumulation} = "
            f"{self.effective_batch_size}",
            f"  Sequence length: {self.sequence_length}",
            f"  LoRA rank: {self.lora_rank}",
            f"  LoRA alpha: {self.lora_alpha}",
            f"  Target modules: {self.target_modules}",
            f"  Estimated memory: {self.estimated_memory_gb:.1f} GB",
        ]
        return "\n".join(report_lines)


def optimize_qlora_config(
    model_params_billions: float,
    available_memory_gb: float = 128.0,  # DGX Spark default
    target_batch_size: int = 16,
    max_sequence_length: int = 4096,
    safety_margin: float = 0.85,  # Use 85% of available memory
) -> QLoRAConfig:
    """
    Optimize QLoRA configuration for given memory constraints.

    Runs a small grid search over LoRA rank, sequence length, micro-batch
    size, gradient accumulation and target modules. Every candidate whose
    estimated memory fits the budget is scored, and the first candidate
    reaching the highest score is returned.

    The memory model is deliberately coarse. All terms are in decimal GB:
      * base weights: 0.5 GB per billion parameters (4-bit)
      * LoRA weights: two FP32 matrices of rank x hidden per adapted module
        and layer, plus FP32 gradients and two AdamW states per parameter
      * activations: one FP32 hidden-state checkpoint
        (batch x seq x hidden) per layer
    The layer count is fixed at 80 and the hidden size is approximated from
    the parameter count (capped at 8192), so the estimate is conservative
    for small models.

    Args:
        model_params_billions: Model size in billions of parameters
        available_memory_gb: Available GPU memory in GB
        target_batch_size: Desired effective batch size (must be >= 1)
        max_sequence_length: Maximum sequence length to consider
        safety_margin: Fraction of memory to actually use

    Returns:
        Optimized QLoRAConfig

    Raises:
        ValueError: If ``target_batch_size`` is below 1, if the quantized
            base model alone exceeds the usable memory, or if no candidate
            configuration fits.
    """
    if target_batch_size < 1:
        raise ValueError("target_batch_size must be >= 1")

    usable_memory = available_memory_gb * safety_margin

    # Base model memory (4-bit quantized)
    base_model_memory = model_params_billions * 0.5  # ~0.5 GB per billion params in 4-bit

    # Memory available for training overhead
    training_budget = usable_memory - base_model_memory

    if training_budget <= 0:
        raise ValueError(f"Model ({model_params_billions}B) too large for {available_memory_gb}GB")

    # Architecture approximations shared by every candidate (loop-invariant)
    assumed_num_layers = 80  # ~80 layers
    bytes_fp32 = 4
    hidden_size = int(model_params_billions * 500)  # Rough approximation
    hidden_size = min(hidden_size, 8192)  # Cap at 70B hidden size

    # Configuration options to try
    rank_options = [8, 16, 32, 64]
    seq_len_options = [
        s for s in [512, 1024, 2048, 4096] if s <= max_sequence_length
    ]
    batch_options = [1, 2, 4, 8]
    grad_accum_options = [1, 2, 4, 8, 16, 32]

    # Target module configurations
    module_configs = [
        ["q_proj", "v_proj"],  # Minimal
        ["q_proj", "k_proj", "v_proj", "o_proj"],  # Attention only
        ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # All
    ]

    best_config = None
    best_score = -1

    # The loop order is significant: ties are broken in favour of the first
    # candidate encountered.
    for rank in rank_options:
        for seq_len in seq_len_options:
            for batch_size in batch_options:
                # Activation memory in GB: bytes of one hidden-state tensor
                # per layer, divided by 1e9 so it is comparable to the budget.
                activation_memory = (
                    batch_size * seq_len * hidden_size
                    * bytes_fp32 * assumed_num_layers
                ) / 1e9

                for grad_accum in grad_accum_options:
                    effective_batch = batch_size * grad_accum

                    for modules in module_configs:
                        num_modules = len(modules)

                        # LoRA params: A and B matrices per module and layer
                        lora_params = (
                            2 * rank * hidden_size
                            * num_modules * assumed_num_layers
                        )
                        lora_memory = (lora_params * bytes_fp32) / 1e9
                        gradient_memory = lora_memory  # one FP32 grad per param
                        optimizer_memory = lora_memory * 2  # AdamW

                        total_memory = (
                            base_model_memory
                            + lora_memory
                            + gradient_memory
                            + optimizer_memory
                            + activation_memory
                        )

                        # Check if it fits
                        if total_memory > usable_memory:
                            continue

                        # Score this configuration
                        # Prioritize: effective batch size, sequence length, rank
                        score = (
                            min(effective_batch, target_batch_size) / target_batch_size * 40 +
                            seq_len / max_sequence_length * 30 +
                            rank / 64 * 20 +
                            num_modules / 7 * 10
                        )

                        if score > best_score:
                            best_score = score
                            best_config = QLoRAConfig(
                                batch_size=batch_size,
                                gradient_accumulation=grad_accum,
                                sequence_length=seq_len,
                                lora_rank=rank,
                                lora_alpha=rank * 2,  # Common practice
                                target_modules=modules,
                                estimated_memory_gb=total_memory,
                                effective_batch_size=effective_batch,
                            )

    if best_config is None:
        raise ValueError("Could not find valid configuration")

    return best_config


# Demo: run the optimizer for a few model sizes on a 128GB DGX Spark
print("QLoRA Configuration Optimization for DGX Spark (128GB)", "=" * 60, sep="\n")

for model_size in (8, 70, 100):
    try:
        config = optimize_qlora_config(
            model_params_billions=model_size,
            available_memory_gb=128.0,
            target_batch_size=16,
        )
        # Heading line followed by the config's own multi-line report
        print(f"\n{model_size}B Model:", config, sep="\n")
    except ValueError as err:
        # The optimizer reports "too large" / "no valid configuration" this way
        print(f"\n{model_size}B Model: {err}")

---

## Summary

These solutions demonstrate:

1. **Memory Estimation**: Calculate expected memory usage before training

2. **Memory Profiling**: Track actual memory usage during training

3. **Configuration Optimization**: Automatically find the best QLoRA config for your hardware

### Key Takeaways for DGX Spark

- **128GB unified memory** allows 70B models with QLoRA
- **4-bit quantization** reduces the 70B base model weights from ~140GB (16-bit) to ~35GB
- **Gradient checkpointing** is essential for large models
- **Batch size 1-2** with gradient accumulation is typical for 70B
- Always leave **15-20% memory headroom** for safety

In [None]:
# Cleanup
import gc
gc.collect()  # run a full garbage-collection pass first

# torch is optional here: when it cannot be imported the GPU step is skipped
try:
    import torch
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("GPU cache cleared")
except ImportError:
    pass

print("Cleanup complete!")