# Lab 3.1.1: LoRA Theory - Solutions

**Module:** 3.1 - Large Language Model Fine-Tuning  
**Difficulty:** ⭐⭐⭐☆☆ (Intermediate)  
**Exercises:** 3 (LoRA with Bias, Target Module Comparison, Memory Analysis)

This notebook contains solutions to the exercises in the LoRA Theory notebook.

---

## Exercise 1: Implement LoRA with Bias

Modify the `LoRALayer` class to optionally train the bias term as well.

In [None]:
import torch
import torch.nn as nn
import numpy as np

class LoRALayerWithBias(nn.Module):
    """
    LoRA wrapper around an ``nn.Linear`` that can optionally train the bias.

    The wrapped layer's weight is always frozen. The output is

        original_layer(x) + (alpha / rank) * dropout(x) @ A^T @ B^T

    where ``A`` (rank x in_features) is Kaiming-uniform initialised and ``B``
    (out_features x rank) starts at zero, so the wrapper initially reproduces
    the wrapped layer exactly.

    Args:
        original_layer: The linear layer to adapt. Its ``requires_grad`` flags
            are modified in place.
        rank: Rank of the low-rank update. Must be positive.
        alpha: Scaling numerator; the update is scaled by ``alpha / rank``.
        dropout: Dropout probability applied to the input of the LoRA path
            only. ``0`` disables dropout.
        train_bias: If True (and the layer has a bias), the bias is left
            trainable; otherwise it is frozen.

    Raises:
        ValueError: If ``rank`` is not positive.
    """

    def __init__(
        self,
        original_layer: nn.Linear,
        rank: int = 8,
        alpha: float = 16.0,
        dropout: float = 0.0,
        train_bias: bool = False,  # NEW: option to train bias
    ):
        super().__init__()

        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank}")

        self.original_layer = original_layer
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank
        self.train_bias = train_bias

        # Freeze the pretrained weight; the bias follows the train_bias flag.
        base_weight = original_layer.weight
        base_weight.requires_grad = False
        if original_layer.bias is not None:
            original_layer.bias.requires_grad = bool(train_bias)

        # Create the adapters on the same device (and floating dtype) as the
        # wrapped layer. Otherwise wrapping a layer that was already moved to
        # the GPU, or cast to another precision, fails in forward().
        if base_weight.is_floating_point():
            adapter_dtype = base_weight.dtype
        else:
            adapter_dtype = torch.get_default_dtype()
        factory_kwargs = {"device": base_weight.device, "dtype": adapter_dtype}

        self.lora_A = nn.Parameter(
            torch.zeros(rank, original_layer.in_features, **factory_kwargs)
        )
        # B stays at zero so the LoRA path contributes nothing at step 0.
        self.lora_B = nn.Parameter(
            torch.zeros(original_layer.out_features, rank, **factory_kwargs)
        )

        # Initialize A with Kaiming uniform
        nn.init.kaiming_uniform_(self.lora_A, a=np.sqrt(5))

        # Optional dropout (applied to the LoRA path only)
        self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the frozen layer's output plus the scaled low-rank update."""
        # Original path
        result = self.original_layer(x)

        # LoRA path
        lora_x = self.dropout(x)
        lora_output = lora_x @ self.lora_A.T @ self.lora_B.T

        return result + self.scaling * lora_output

    @property
    def trainable_params(self) -> int:
        """Number of trainable parameters: A, B and (if trained) the bias."""
        params = self.lora_A.numel() + self.lora_B.numel()
        if self.train_bias and self.original_layer.bias is not None:
            params += self.original_layer.bias.numel()
        return params

# Exercise the wrapper twice: once with the bias frozen, once with it trainable.
linear = nn.Linear(512, 512, bias=True)
lora_no_bias = LoRALayerWithBias(linear, rank=16, train_bias=False)
frozen_bias_count = lora_no_bias.trainable_params
print(f"Without bias training: {frozen_bias_count:,} trainable params")

# A second, independent base layer so the two wrappers do not share
# requires_grad flags.
linear2 = nn.Linear(512, 512, bias=True)
lora_with_bias = LoRALayerWithBias(linear2, rank=16, train_bias=True)
trained_bias_count = lora_with_bias.trainable_params
print(f"With bias training: {trained_bias_count:,} trainable params")

# Confirm the flag actually reached the wrapped layer's bias.
bias_is_trainable = lora_with_bias.original_layer.bias.requires_grad
print(f"\nBias requires_grad: {bias_is_trainable}")

## Exercise 2: Compare Different Target Modules

Compare training with LoRA on different sets of target modules.

In [None]:
import torch.nn.functional as F

class LoRALayer(nn.Module):
    """
    Standard LoRA layer for comparison.

    Wraps an ``nn.Linear``, freezes its weight and bias in place, and adds a
    trainable low-rank update scaled by ``alpha / rank``:

        original_layer(x) + (alpha / rank) * x @ A^T @ B^T

    ``B`` starts at zero, so the wrapper initially reproduces the wrapped
    layer exactly.

    Args:
        original_layer: Linear layer to adapt.
        rank: Rank of the low-rank update. Must be positive.
        alpha: Scaling numerator.

    Raises:
        ValueError: If ``rank`` is not positive.
    """
    def __init__(self, original_layer, rank=8, alpha=16.0):
        super().__init__()
        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank}")

        self.original_layer = original_layer
        self.rank = rank
        self.scaling = alpha / rank

        in_f = original_layer.in_features
        out_f = original_layer.out_features

        base_weight = original_layer.weight
        base_weight.requires_grad = False
        if original_layer.bias is not None:
            original_layer.bias.requires_grad = False

        # Allocate the adapters where the wrapped layer lives. A layer that
        # was moved to the GPU (or cast to another float dtype) before being
        # wrapped would otherwise hit a device/dtype mismatch in forward().
        if base_weight.is_floating_point():
            adapter_dtype = base_weight.dtype
        else:
            adapter_dtype = torch.get_default_dtype()
        factory_kwargs = {"device": base_weight.device, "dtype": adapter_dtype}

        self.lora_A = nn.Parameter(torch.zeros(rank, in_f, **factory_kwargs))
        self.lora_B = nn.Parameter(torch.zeros(out_f, rank, **factory_kwargs))
        nn.init.kaiming_uniform_(self.lora_A, a=np.sqrt(5))

    def forward(self, x):
        """Return the frozen layer's output plus the scaled low-rank update."""
        result = self.original_layer(x)
        lora_out = x @ self.lora_A.T @ self.lora_B.T
        return result + self.scaling * lora_out


class SimpleTransformerBlock(nn.Module):
    """
    Small pre-norm transformer block used as a LoRA test bed.

    Contains bias-free q/k/v/o attention projections, a SwiGLU-style MLP
    (gate/up/down projections with a 4x hidden width) and one LayerNorm in
    front of each sub-block. Both sub-blocks are residual.
    """
    def __init__(self, d_model=256, n_heads=8):
        super().__init__()
        self.d_model, self.n_heads = d_model, n_heads
        self.head_dim = d_model // n_heads

        # Attention projections (registered in q, k, v, o order)
        for proj_name in ("q_proj", "k_proj", "v_proj", "o_proj"):
            setattr(self, proj_name, nn.Linear(d_model, d_model, bias=False))

        # MLP: expand to the hidden width twice (gate, up), then contract
        hidden = d_model * 4
        self.gate_proj = nn.Linear(d_model, hidden, bias=False)
        self.up_proj = nn.Linear(d_model, hidden, bias=False)
        self.down_proj = nn.Linear(hidden, d_model, bias=False)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply attention then MLP to a (batch, seq, d_model) tensor."""
        batch, seq_len, width = x.shape
        per_head_shape = (batch, seq_len, self.n_heads, self.head_dim)

        # Attention: project, split into heads, attend, merge heads back
        attn_in = self.norm1(x)
        q, k, v = (
            proj(attn_in).view(per_head_shape).transpose(1, 2)
            for proj in (self.q_proj, self.k_proj, self.v_proj)
        )
        context = F.scaled_dot_product_attention(q, k, v)
        merged = context.transpose(1, 2).contiguous().view(batch, seq_len, width)
        x = x + self.o_proj(merged)

        # MLP (SwiGLU style): silu(gate) gates the up projection
        mlp_in = self.norm2(x)
        gated = F.silu(self.gate_proj(mlp_in)) * self.up_proj(mlp_in)
        return x + self.down_proj(gated)


def add_lora_to_model(model, rank=8, alpha=16, target_modules=None):
    """
    Replace ``nn.Linear`` sub-modules of ``model`` with ``LoRALayer`` wrappers.

    The model is modified in place and also returned.

    Args:
        model: Module whose linear children should be adapted.
        rank: LoRA rank passed to every ``LoRALayer``.
        alpha: LoRA alpha passed to every ``LoRALayer``.
        target_modules: Name fragments selecting which linear layers to wrap.
            A layer is wrapped when any fragment is a substring of its dotted
            module name. A single string is treated as one fragment. ``None``
            or an empty collection wraps every linear layer.

    Returns:
        The same ``model`` object, with the selected layers wrapped.

    Notes:
        - The root module itself is never replaced, because it has no parent
          to re-attach the wrapper to.
        - Linear layers that already sit inside a ``LoRALayer`` are skipped,
          so calling this function twice does not stack adapters.
    """
    if target_modules is None:
        targets = ()
    elif isinstance(target_modules, str):
        # Iterating a bare string would match on single characters.
        targets = (target_modules,)
    else:
        targets = tuple(target_modules)

    # Snapshot the module list first: we mutate the tree while walking it.
    for name, module in list(model.named_modules()):
        if not isinstance(module, nn.Linear):
            continue
        if not name:
            # `model` itself is a Linear; it cannot be swapped in place.
            continue
        if targets and not any(t in name for t in targets):
            continue

        parent_name, _, child_name = name.rpartition('.')
        parent = model.get_submodule(parent_name)
        if isinstance(parent, LoRALayer):
            # This is the frozen `original_layer` of an existing adapter.
            continue

        setattr(parent, child_name, LoRALayer(module, rank, alpha))

    return model


def train_and_evaluate(target_modules, n_epochs=50):
    """
    Train a LoRA-adapted ``SimpleTransformerBlock`` on random data.

    The base block is frozen completely before the adapters are added, so
    only the LoRA matrices of the selected modules are trained and counted.
    Without this, every layer that is *not* targeted would remain fully
    trainable and the configuration with the fewest adapters would train the
    most parameters.

    Args:
        target_modules: Name fragments of the linear layers to adapt
            (forwarded to ``add_lora_to_model``).
        n_epochs: Number of passes over the synthetic dataset. Must be >= 1.

    Returns:
        Tuple ``(trainable, final_loss)``: the number of trainable parameters
        and the mean MSE loss over the mini-batches of the last epoch.

    Raises:
        ValueError: If ``n_epochs`` < 1 or no trainable parameter results
            from the given ``target_modules``.
    """
    if n_epochs < 1:
        raise ValueError(f"n_epochs must be >= 1, got {n_epochs}")

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    n_samples, batch_size = 100, 10

    # Create model and freeze every base parameter (weights and LayerNorms)
    model = SimpleTransformerBlock()
    for base_param in model.parameters():
        base_param.requires_grad_(False)

    # Add the adapters BEFORE moving to the device so that the freshly
    # created LoRA matrices are moved together with the base weights.
    model = add_lora_to_model(model, rank=8, target_modules=target_modules)
    model = model.to(device)

    # Count params
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    if not trainable_params:
        raise ValueError(
            f"target_modules={target_modules!r} matched no linear layer; "
            "nothing to train"
        )
    trainable = sum(p.numel() for p in trainable_params)

    # Create data
    x_train = torch.randn(n_samples, 32, 256, device=device)
    y_train = torch.randn(n_samples, 32, 256, device=device)

    # Train
    optimizer = torch.optim.AdamW(trainable_params, lr=1e-3)

    model.train()
    final_loss = float('nan')
    for _ in range(n_epochs):
        epoch_total = 0.0
        n_batches = 0
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            optimizer.zero_grad()
            output = model(x_train[start:stop])
            loss = F.mse_loss(output, y_train[start:stop])
            loss.backward()
            optimizer.step()
            epoch_total += loss.item()
            n_batches += 1
        # Average over the epoch instead of reporting one arbitrary batch.
        final_loss = epoch_total / n_batches

    return trainable, final_loss


# Compare different target module configurations
configs = [
    ("Q and V only", ["q_proj", "v_proj"]),
    ("All attention", ["q_proj", "k_proj", "v_proj", "o_proj"]),
    ("Attention + MLP", ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]),
]

print("Comparing different target module configurations:")
print("=" * 60)

for name, targets in configs:
    # Re-seed before every run: the printed numbers become reproducible and
    # each configuration starts from the same base-model initialisation.
    torch.manual_seed(0)
    params, loss = train_and_evaluate(targets)
    print(f"{name:20s}: {params:,} params, Final Loss: {loss:.4f}")

## Exercise 3: Memory Analysis

Calculate and visualize memory savings of LoRA for different model sizes.

In [None]:
import matplotlib.pyplot as plt

def calculate_memory_requirements(
    model_params_billions: float,
    lora_rank: int = 16,
    dtype_bytes: int = 2,  # float16
    lora_target_ratio: float = 0.3,
    full_activation_factor: float = 2.0,
    lora_activation_factor: float = 0.5,
) -> dict:
    """
    Estimate training memory for full fine-tuning vs LoRA.

    This is a back-of-the-envelope model, not a measurement.

    Memory components:
    - Model weights
    - Gradients (same size as trainable params)
    - Optimizer states (Adam: 2x trainable params for momentum + variance)
    - Activations (modelled as a multiple of the weight memory)

    Every tensor is assumed to be stored in ``dtype_bytes`` bytes per value.

    Args:
        model_params_billions: Number of parameters in billions. Must be > 0.
        lora_rank: LoRA rank.
        dtype_bytes: Bytes per parameter (2 for float16, 4 for float32).
        lora_target_ratio: Fraction of the ~4 projections per layer that
            receive a LoRA adapter.
        full_activation_factor: Activation memory for full fine-tuning as a
            multiple of the weight memory.
        lora_activation_factor: Activation memory for LoRA as a multiple of
            the weight memory. The default of 0.5 assumes gradient
            checkpointing is used for the LoRA run only; pass the same value
            as ``full_activation_factor`` for a like-for-like comparison.

    Returns:
        Dictionary with the parameter counts, total memory in GB for both
        regimes, the relative saving, and a per-component breakdown.

    Raises:
        ValueError: If ``model_params_billions`` is not positive or any other
            numeric argument is negative.
    """
    if model_params_billions <= 0:
        raise ValueError(
            f"model_params_billions must be positive, got {model_params_billions}"
        )
    non_negative = {
        'lora_rank': lora_rank,
        'dtype_bytes': dtype_bytes,
        'lora_target_ratio': lora_target_ratio,
        'full_activation_factor': full_activation_factor,
        'lora_activation_factor': lora_activation_factor,
    }
    for arg_name, arg_value in non_negative.items():
        if arg_value < 0:
            raise ValueError(f"{arg_name} must be non-negative, got {arg_value}")

    def to_gb(n_values):
        """Convert a count of stored values to gigabytes."""
        return n_values * dtype_bytes / 1e9

    total_params = model_params_billions * 1e9

    # Estimate dimensions (rough approximation): d_model ~ sqrt(params / 100).
    # Clamp to 1 so very small inputs cannot cause a division by zero below.
    d_model = max(1, int((total_params / 100) ** 0.5))

    # LoRA parameters per adapted layer
    # Each layer adds: d_model * rank + rank * d_model = 2 * d_model * rank
    lora_params_per_layer = 2 * d_model * lora_rank

    # Number of adapted layers (estimate), using params ~ 12 * d_model^2 per layer
    n_layers = int(total_params / (12 * d_model * d_model))
    n_adapted = int(n_layers * 4 * lora_target_ratio)  # ~4 projections per layer * target ratio

    total_lora_params = n_adapted * lora_params_per_layer

    # Full fine-tuning memory (GB): every parameter is trainable
    full_weights = to_gb(total_params)
    full_gradients = to_gb(total_params)
    full_optimizer = to_gb(total_params * 2)  # Adam states
    full_activations = full_weights * full_activation_factor
    full_total = full_weights + full_gradients + full_optimizer + full_activations

    # LoRA memory (GB): frozen base weights plus small trainable adapters
    lora_weights = to_gb(total_params)  # Still need base model
    lora_trainable = to_gb(total_lora_params)
    lora_gradients = to_gb(total_lora_params)  # Only for LoRA params
    lora_optimizer = to_gb(total_lora_params * 2)
    lora_activations = lora_weights * lora_activation_factor
    lora_total = lora_weights + lora_trainable + lora_gradients + lora_optimizer + lora_activations

    return {
        'model_params_B': model_params_billions,
        'lora_rank': lora_rank,
        'lora_params': total_lora_params,
        'lora_params_ratio': total_lora_params / total_params,
        'full_finetuning_GB': full_total,
        'lora_finetuning_GB': lora_total,
        'memory_savings': 1 - (lora_total / full_total),
        'breakdown_full': {
            'weights': full_weights,
            'gradients': full_gradients,
            'optimizer': full_optimizer,
            'activations': full_activations,
        },
        'breakdown_lora': {
            'weights': lora_weights,
            'lora_params': lora_trainable,
            'gradients': lora_gradients,
            'optimizer': lora_optimizer,
            'activations': lora_activations,
        },
    }


# Sweep a range of model sizes (in billions of parameters) and tabulate the estimates
model_sizes = [1, 3, 7, 13, 30, 70, 120]

print("Memory Requirements Analysis")
print("=" * 80)
print(f"{'Model Size':>12} | {'Full FT (GB)':>12} | {'LoRA (GB)':>12} | {'Savings':>10} | {'LoRA Params':>15}")
print("-" * 80)

# One estimate per model size, kept for the plots further down
results = [calculate_memory_requirements(size, lora_rank=16) for size in model_sizes]

for size, result in zip(model_sizes, results):
    full_gb = result['full_finetuning_GB']
    lora_gb = result['lora_finetuning_GB']
    saved_fraction = result['memory_savings']
    lora_millions = result['lora_params'] / 1e6
    print(f"{size:>10}B | {full_gb:>10.1f}GB | {lora_gb:>10.1f}GB | {saved_fraction:>9.1%} | {lora_millions:>12.1f}M")

# Visualization: three panels sharing the same model-size x axis
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

x = range(len(model_sizes))
width = 0.35
size_labels = [f'{s}B' for s in model_sizes]

# Series pulled out of the per-size result dictionaries
full_mem = [r['full_finetuning_GB'] for r in results]
lora_mem = [r['lora_finetuning_GB'] for r in results]
savings = [r['memory_savings'] * 100 for r in results]
lora_params = [r['lora_params'] / 1e6 for r in results]

# Panel 1: grouped bars (full fine-tuning vs LoRA) with hardware budgets
memory_ax = axes[0]
for bar_offset, bar_values, bar_label, bar_color in (
    (-width / 2, full_mem, 'Full Fine-tuning', 'coral'),
    (width / 2, lora_mem, 'LoRA', 'steelblue'),
):
    memory_ax.bar([i + bar_offset for i in x], bar_values, width, label=bar_label, color=bar_color)
for budget_gb, line_color, line_label in (
    (128, 'green', 'DGX Spark (128GB)'),
    (24, 'red', 'RTX 4090 (24GB)'),
):
    memory_ax.axhline(y=budget_gb, color=line_color, linestyle='--', label=line_label)

# Panel 2: relative savings; Panel 3: adapter parameter count
axes[1].bar(x, savings, color='seagreen')
axes[2].bar(x, lora_params, color='purple')

# Axis decoration common to all three panels
panel_text = (
    ('Memory (GB)', 'Memory Requirements'),
    ('Memory Savings (%)', 'LoRA Memory Savings'),
    ('LoRA Parameters (M)', 'LoRA Trainable Parameters'),
)
for panel, (y_label, panel_title) in zip(axes, panel_text):
    panel.set_xlabel('Model Size')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.set_xticks(x)
    panel.set_xticklabels(size_labels)

# Memory spans two orders of magnitude, hence the log scale
memory_ax.legend()
memory_ax.set_yscale('log')

plt.tight_layout()
plt.savefig('memory_analysis_solution.png', dpi=150)
plt.show()

# Derive the headline numbers from `results` instead of hard-coding them, so
# the printed insights always agree with the table above. (With the default
# estimate a 70B model needs ~210 GB under LoRA, which does NOT fit in 128 GB.)
def _largest_fit(results_list, memory_key, budget_gb):
    """Return a label for the largest analysed model whose memory fits the budget."""
    fitting = [r['model_params_B'] for r in results_list if r[memory_key] <= budget_gb]
    return f"{max(fitting):g}B" if fitting else "none of the tested sizes"


_savings_pct = [r['memory_savings'] * 100 for r in results]
_low, _high = round(min(_savings_pct)), round(max(_savings_pct))
_savings_text = f"about {_low}%" if _low == _high else f"{_low}-{_high}%"

print("\n\nKey Insights:")
print(f"1. LoRA reduces memory by {_savings_text} across all model sizes")
print(
    f"2. DGX Spark (128GB) fits up to {_largest_fit(results, 'lora_finetuning_GB', 128)} with LoRA "
    f"vs {_largest_fit(results, 'full_finetuning_GB', 128)} with full fine-tuning"
)
print(
    f"3. RTX 4090 (24GB) fits up to {_largest_fit(results, 'lora_finetuning_GB', 24)} with LoRA "
    f"vs {_largest_fit(results, 'full_finetuning_GB', 24)} with full fine-tuning"
)
print("4. LoRA parameters scale sublinearly with model size")

---

## Summary

These solutions demonstrate:

1. **Exercise 1**: How to extend LoRA to optionally train bias terms
2. **Exercise 2**: Comparing different target module configurations (Q/V only vs full attention vs attention + MLP)
3. **Exercise 3**: Comprehensive memory analysis showing LoRA's efficiency across model sizes

Key takeaways:
- LoRA with bias can provide marginal improvements for some tasks
- More target modules = more capacity but more memory
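- LoRA extends what fits on DGX Spark (128 GB): by the Exercise 3 estimate, roughly 30B parameters with LoRA versus roughly 7B with full fine-tuning. Training 70B+ models additionally requires quantizing the frozen base weights (e.g. QLoRA), because 70B parameters in 16-bit precision already occupy about 140 GB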