# Lab 3.1.5: 70B Model QLoRA Fine-Tuning - Solutions

Complete solutions for fine-tuning 70B models on DGX Spark.

## Exercise 1: Memory Planning Calculator

In [None]:
def calculate_70b_memory(
    model_params_b: float = 70,
    bits: int = 4,
    lora_r: int = 16,
    batch_size: int = 1,
    seq_len: int = 2048,
    hidden_dim: int = 8192,
    num_layers: int = 80,
    gradient_checkpointing: bool = True,
    total_memory_gb: float = 128,
    verbose: bool = True,
) -> dict:
    """
    Estimate memory requirements for 70B QLoRA training.

    All figures are back-of-the-envelope estimates in GB (1 GB = 1e9 bytes).

    Args:
        model_params_b: Base model size in billions of parameters.
        bits: Bit width of the quantized base weights.
        lora_r: LoRA rank.
        batch_size: Per-step batch size.
        seq_len: Sequence length in tokens.
        hidden_dim: Model hidden dimension.
        num_layers: Number of transformer layers.
        gradient_checkpointing: If True, activations are counted for only
            about sqrt(num_layers) layers instead of all of them.
        total_memory_gb: Memory capacity the estimate is compared against
            (defaults to the 128 GB of a DGX Spark).
        verbose: If True, print the itemized report. Pass False when calling
            the calculator in a loop.

    Returns:
        Dict with the keys "model", "lora", "optimizer", "gradients",
        "activations", "kv_cache", "overhead" and "total", each in GB.
    """
    # Base model memory (quantized): params (in billions) * bytes per param
    model_mem = model_params_b * (bits / 8)

    # LoRA adapter parameters.
    # Targeting: q, k, v, o projections + gate, up, down (7 per layer).
    # Simplification: every target is counted as two hidden_dim x r matrices.
    lora_targets = 7  # per layer
    lora_params = num_layers * lora_targets * 2 * hidden_dim * lora_r
    lora_mem = lora_params * 2 / 1e9  # FP16

    # Optimizer states (AdamW: m and v), FP32
    optimizer_mem = lora_params * 4 * 2 / 1e9

    # Gradients for LoRA params, FP32
    gradient_mem = lora_params * 4 / 1e9

    # Activation memory (2 bytes per element)
    if gradient_checkpointing:
        # Only keep sqrt(L) checkpoints
        ckpt_layers = int(num_layers ** 0.5)
        activation_mem = batch_size * seq_len * hidden_dim * ckpt_layers * 2 / 1e9
    else:
        activation_mem = batch_size * seq_len * hidden_dim * num_layers * 2 / 1e9

    # KV cache (keys + values for every layer, 2 bytes per element)
    kv_cache = 2 * batch_size * seq_len * hidden_dim * num_layers * 2 / 1e9

    # Fixed allowance for CUDA context / allocator overhead
    cuda_overhead = 2.0

    total = model_mem + lora_mem + optimizer_mem + gradient_mem + activation_mem + kv_cache + cuda_overhead

    if verbose:
        rule = "=" * 60
        # Leading space lives in the note so the label has no stray blank
        # before the colon when checkpointing is disabled.
        gc_note = " (with GC)" if gradient_checkpointing else ""

        print(rule)
        print("70B MODEL MEMORY CALCULATOR")
        print(rule)
        print(f"\n1. Base Model ({bits}-bit): {model_mem:.1f} GB")
        print(f"2. LoRA Adapters (r={lora_r}): {lora_mem:.2f} GB")
        print(f"3. Optimizer States: {optimizer_mem:.2f} GB")
        print(f"4. Gradients: {gradient_mem:.2f} GB")
        print(f"5. Activations{gc_note}: {activation_mem:.2f} GB")
        print(f"6. KV Cache: {kv_cache:.2f} GB")
        print(f"7. CUDA Overhead: {cuda_overhead:.1f} GB")

        print("\n" + rule)
        print(f"TOTAL ESTIMATED: {total:.1f} GB")
        print(f"DGX Spark Available: {total_memory_gb:g} GB")
        print(f"Headroom: {total_memory_gb - total:.1f} GB")

        # The emoji are written as escapes so they survive a file
        # re-encoding: \u26a0\ufe0f is the warning sign, \u2705 the check mark.
        if total > total_memory_gb:
            print("\n\u26a0\ufe0f WARNING: May exceed memory. Consider:")
            print("  - Reduce batch size")
            print("  - Reduce sequence length")
            print("  - Use lower LoRA rank")
        else:
            print("\n\u2705 Should fit in DGX Spark memory!")

        print(rule)

    return {
        "model": model_mem,
        "lora": lora_mem,
        "optimizer": optimizer_mem,
        "gradients": gradient_mem,
        "activations": activation_mem,
        "kv_cache": kv_cache,
        "overhead": cuda_overhead,
        "total": total
    }

# Baseline estimate: every parameter left at its default
mem = calculate_70b_memory()

# Repeat the estimate with the batch size doubled, under its own banner
print("\n" + "=" * 60, "WITH BATCH SIZE 2:", sep="\n")
mem_batch2 = calculate_70b_memory(batch_size=2)

## Exercise 2: Optimal Configuration Finder

In [None]:
def find_optimal_config(memory_budget_gb: float = 120):
    """
    Find optimal training configuration for given memory budget.

    Sweeps batch size, sequence length, LoRA rank and gradient-accumulation
    steps, keeps the combinations whose estimated total memory (from
    calculate_70b_memory, with gradient checkpointing on) fits the budget,
    and ranks them by ``effective_batch * seq_len / memory``.

    Args:
        memory_budget_gb: Maximum allowed estimated memory, in GB.

    Returns:
        The (up to) five best configurations, best first. Each is a dict with
        the keys "batch_size", "seq_len", "lora_r", "grad_accum",
        "effective_batch", "memory_gb" and "throughput_score". The list is
        empty when nothing fits the budget.
    """
    import contextlib
    import io

    configs = []

    for batch_size in [1, 2, 4]:
        for seq_len in [512, 1024, 2048, 4096]:
            for lora_r in [8, 16, 32, 64]:
                # The estimate does not depend on gradient accumulation, so
                # compute it once per (batch, seq_len, rank) instead of once
                # per accumulation setting. The calculator prints a full
                # report on every call; discard it so only the summary table
                # below reaches the caller.
                with contextlib.redirect_stdout(io.StringIO()):
                    mem = calculate_70b_memory(
                        batch_size=batch_size,
                        seq_len=seq_len,
                        lora_r=lora_r,
                        gradient_checkpointing=True
                    )
                memory_gb = mem["total"]

                if memory_gb > memory_budget_gb:
                    continue

                for grad_accum in [1, 4, 8, 16]:
                    effective_batch = batch_size * grad_accum
                    throughput_score = effective_batch * seq_len / memory_gb

                    configs.append({
                        "batch_size": batch_size,
                        "seq_len": seq_len,
                        "lora_r": lora_r,
                        "grad_accum": grad_accum,
                        "effective_batch": effective_batch,
                        "memory_gb": memory_gb,
                        "throughput_score": throughput_score
                    })

    # Sort by throughput score (stable, so ties keep sweep order)
    configs.sort(key=lambda x: x["throughput_score"], reverse=True)
    best = configs[:5]

    print(f"\nTop 5 Configurations (budget: {memory_budget_gb}GB):")
    print("=" * 80)
    print(f"{'Batch':<8} {'SeqLen':<8} {'LoRA r':<8} {'Accum':<8} {'Eff.Batch':<10} {'Memory':<10} {'Score':<10}")
    print("-" * 80)

    if not best:
        print("No configuration fits within the memory budget.")

    for config in best:
        print(f"{config['batch_size']:<8} {config['seq_len']:<8} {config['lora_r']:<8} "
              f"{config['grad_accum']:<8} {config['effective_batch']:<10} "
              f"{config['memory_gb']:<10.1f} {config['throughput_score']:<10.2f}")

    return best

# Suppress individual calculations for this search
import sys
from io import StringIO

# Redirect output while the search runs. The try/finally guarantees stdout is
# put back even if the search raises; without it a failure here would leave
# every later cell printing into the discarded buffer.
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
    optimal_configs = find_optimal_config(memory_budget_gb=120)
finally:
    # Restore output
    sys.stdout = old_stdout

# Show results
print("\nTop 5 Configurations (budget: 120GB):")
print("=" * 80)
print(f"{'Batch':<8} {'SeqLen':<8} {'LoRA r':<8} {'Accum':<8} {'Eff.Batch':<10} {'Memory':<10}")
print("-" * 80)
for config in optimal_configs:
    print(f"{config['batch_size']:<8} {config['seq_len']:<8} {config['lora_r']:<8} "
          f"{config['grad_accum']:<8} {config['effective_batch']:<10} "
          f"{config['memory_gb']:<10.1f}")

## Exercise 3: Complete 70B Training Script

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import gc

def train_70b_qlora(
    model_id: str = "meta-llama/Llama-3.1-70B",
    dataset_name: str = "tatsu-lab/alpaca",
    output_dir: str = "./llama-70b-qlora",
    max_samples: int = 500,
    epochs: int = 1
):
    """
    Production-ready 70B QLoRA training script for DGX Spark.

    Loads the model with 4-bit NF4 quantization, attaches LoRA adapters
    (with DoRA enabled), formats the first ``max_samples`` training rows into
    chat-style text and wraps everything in an SFTTrainer. Training is NOT
    started here; call ``trainer.train()`` on the returned trainer.

    Args:
        model_id: Hugging Face model identifier to load.
        dataset_name: Hugging Face dataset identifier; rows are expected to
            carry "instruction", "output" and optionally "input" fields.
        output_dir: Directory for checkpoints.
        max_samples: Number of rows taken from the start of the train split.
        epochs: Number of training epochs.

    Returns:
        Tuple of (trainer, model, tokenizer).
    """
    print("="*60)
    print("70B MODEL QLORA TRAINING")
    print("="*60)

    # Clear memory
    gc.collect()
    torch.cuda.empty_cache()

    # 1. Quantization - Critical for 70B
    print("\n1. Setting up 4-bit quantization...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True  # Nested quantization
    )

    # 2. Load model with memory optimizations
    print("2. Loading 70B model (this takes a few minutes)...")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        # Memory optimization
        low_cpu_mem_usage=True,
    )

    # Prepare for training
    model = prepare_model_for_kbit_training(
        model,
        use_gradient_checkpointing=True
    )

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()

    print(f"   Model loaded. Memory: {torch.cuda.memory_allocated()/1e9:.1f}GB")

    # 3. Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 4. Conservative LoRA config for 70B
    print("3. Configuring LoRA (conservative for 70B)...")
    lora_config = LoraConfig(
        r=16,  # Conservative rank
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ],
        bias="none",
        task_type="CAUSAL_LM",
        use_dora=True,  # DoRA for better performance
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # 5. Load dataset
    print("4. Loading dataset...")
    dataset = load_dataset(dataset_name, split=f"train[:{max_samples}]")

    def format_alpaca(example):
        # A single template serves both cases so the two variants cannot
        # drift apart: rows with a non-empty "input" just get an extra
        # "Input:" paragraph inside the user turn.
        user_turn = example['instruction']
        extra_input = example.get("input") or ""  # tolerate a missing/None field
        if extra_input.strip():
            user_turn = f"{user_turn}\n\nInput: {extra_input}"
        return {
            "text": (
                "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
                f"{user_turn}<|eot_id|>"
                "<|start_header_id|>assistant<|end_header_id|>\n\n"
                f"{example['output']}<|eot_id|>"
            )
        }

    dataset = dataset.map(format_alpaca)

    # 6. Memory-optimized training config
    print("5. Setting up training...")
    training_args = SFTConfig(
        output_dir=output_dir,
        num_train_epochs=epochs,

        # Memory-optimized batch settings
        per_device_train_batch_size=1,  # Single sample per step
        gradient_accumulation_steps=16,  # Effective batch of 16

        # Learning rate
        learning_rate=1e-4,  # Conservative for large models
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,

        # Precision
        bf16=True,

        # Memory optimizations
        gradient_checkpointing=True,
        optim="paged_adamw_8bit",  # 8-bit optimizer

        # Sequence length (conservative)
        max_seq_length=1024,

        # NEFTune
        neftune_noise_alpha=5.0,

        # Logging
        logging_steps=5,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,

        # Disable
        report_to="none",
        push_to_hub=False,
    )

    # 7. Create trainer
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
        dataset_text_field="text",
    )

    print(f"\nMemory before training: {torch.cuda.memory_allocated()/1e9:.1f}GB")
    print("="*60)
    print("Ready to train! Call trainer.train()")

    return trainer, model, tokenizer

# Uncomment to run (requires access to Llama 70B)
# trainer, model, tokenizer = train_70b_qlora()
# trainer.train()

## Exercise 4: Memory Monitoring During Training

In [None]:
import threading
import time
import matplotlib.pyplot as plt

class MemoryMonitor:
    """
    Monitor GPU memory during training.

    A background thread samples ``torch.cuda.memory_allocated()`` and
    ``torch.cuda.memory_reserved()`` every ``interval`` seconds between
    ``start()`` and ``stop()``. Samples are stored in ``memory_log`` as
    ``(allocated_gb, reserved_gb)`` tuples, with the matching elapsed seconds
    in ``timestamps``.
    """
    def __init__(self, interval: float = 1.0):
        """Create an idle monitor that samples every ``interval`` seconds."""
        self.interval = interval
        self.memory_log = []
        self.timestamps = []
        self.running = False
        self.thread = None
        # Lets stop() wake the sampler immediately instead of waiting out a
        # full sleep interval.
        self._stop_event = threading.Event()

    def _monitor(self):
        """Sampling loop executed by the background thread."""
        start_time = time.monotonic()
        # `running` is still honoured so code that clears the flag directly
        # keeps working.
        while self.running and not self._stop_event.is_set():
            if torch.cuda.is_available():
                allocated = torch.cuda.memory_allocated() / 1e9
                reserved = torch.cuda.memory_reserved() / 1e9
                self.memory_log.append((allocated, reserved))
                self.timestamps.append(time.monotonic() - start_time)
            # Interruptible sleep: returns as soon as stop() sets the event.
            self._stop_event.wait(self.interval)

    def start(self):
        """Reset the logs and begin sampling; a no-op if already running."""
        if self.thread is not None and self.thread.is_alive():
            # Starting twice would orphan the first sampler thread.
            print("Memory monitoring already running.")
            return
        self.running = True
        self.memory_log = []
        self.timestamps = []
        self._stop_event.clear()
        # Daemon thread: if training crashes before stop() is called, the
        # sampler must not keep the interpreter alive.
        self.thread = threading.Thread(target=self._monitor, daemon=True)
        self.thread.start()
        print("Memory monitoring started...")

    def stop(self):
        """Stop sampling and wait for the background thread to finish."""
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join()
        print("Memory monitoring stopped.")

    def plot(self, save_path: str = None):
        """
        Plot the recorded samples and print peak/average statistics.

        Args:
            save_path: If given, the figure is also saved to this path.
        """
        if not self.memory_log:
            print("No data to plot.")
            return

        allocated = [m[0] for m in self.memory_log]
        reserved = [m[1] for m in self.memory_log]

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(self.timestamps, allocated, label='Allocated', linewidth=2)
        ax.plot(self.timestamps, reserved, label='Reserved', linewidth=2, alpha=0.7)
        ax.axhline(y=128, color='r', linestyle='--', label='DGX Spark Limit')

        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel('Memory (GB)')
        ax.set_title('GPU Memory Usage During Training')
        ax.legend()
        ax.grid(True, alpha=0.3)

        if save_path:
            plt.savefig(save_path, dpi=150)
        plt.show()

        # Statistics
        print("\nMemory Statistics:")
        print(f"  Peak Allocated: {max(allocated):.1f} GB")
        print(f"  Peak Reserved: {max(reserved):.1f} GB")
        print(f"  Average Allocated: {sum(allocated)/len(allocated):.1f} GB")

# Usage example:
# monitor = MemoryMonitor(interval=0.5)
# monitor.start()
# trainer.train()
# monitor.stop()
# monitor.plot('memory_usage.png')

## Exercise 5: Troubleshooting OOM Errors

In [None]:
def diagnose_oom():
    """
    Print a troubleshooting guide for out-of-memory issues.

    Lists four common OOM situations, each with numbered remedies, followed
    by a copy-pasteable snippet for freeing GPU memory. Only prints; returns
    None.
    """
    print("OOM TROUBLESHOOTING GUIDE")
    print("=" * 60)

    fixes = [
        {
            "issue": "OOM during model loading",
            "solutions": [
                "Clear system cache: sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'",
                "Use load_in_4bit=True in BitsAndBytesConfig",
                "Add low_cpu_mem_usage=True to from_pretrained()",
                "Kill other GPU processes: nvidia-smi then kill PID"
            ]
        },
        {
            "issue": "OOM during forward pass",
            "solutions": [
                "Reduce batch_size to 1",
                "Reduce max_seq_length (try 512 first)",
                "Enable gradient_checkpointing=True",
                "Use attn_implementation='flash_attention_2'"
            ]
        },
        {
            "issue": "OOM during backward pass",
            "solutions": [
                "Use gradient_checkpointing=True",
                "Reduce LoRA rank (r=8 instead of r=16)",
                "Use paged_adamw_8bit optimizer",
                "Increase gradient_accumulation_steps instead of batch_size"
            ]
        },
        {
            "issue": "Memory keeps growing during training",
            "solutions": [
                "Clear cache periodically: torch.cuda.empty_cache()",
                "Check for memory leaks in callbacks",
                "Disable evaluation during training",
                "Save checkpoints less frequently"
            ]
        }
    ]

    for fix in fixes:
        # \U0001F4CD is the round-pushpin emoji; written as an escape so the
        # marker survives a file re-encoding.
        print(f"\n\U0001F4CD {fix['issue']}")
        for i, solution in enumerate(fix['solutions'], 1):
            print(f"   {i}. {solution}")

    print("\n" + "=" * 60)
    print("EMERGENCY MEMORY RECOVERY:")
    # Printed verbatim as a snippet for the reader to paste; not executed here.
    print("""
import gc
import torch

# Delete model and clear references
del model
del trainer
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()

# Check memory
print(f"Memory freed: {torch.cuda.memory_allocated()/1e9:.1f}GB used")
    """)

# Print the troubleshooting guide when this cell runs.
diagnose_oom()

## Key Takeaways

1. **Memory Planning**: Calculate before loading - 70B needs careful planning
2. **Essential Optimizations**: 4-bit + gradient checkpointing + paged optimizer
3. **Conservative Settings**: batch_size=1, seq_len=1024, r=16 for safety
4. **Monitoring**: Track memory throughout training
5. **Recovery**: Know how to free memory when things go wrong