# Task 3.4 Solutions: SVD for LoRA Intuition

This notebook contains solutions for the exercises in the SVD for LoRA lab.

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Reproducibility first: every random matrix generated below depends on this seed.
np.random.seed(42)

# Suppress all warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Banner: title followed by a 50-character rule.
print("SVD for LoRA Solutions", "=" * 50, sep="\n")

## Helper Functions

In [None]:
def reconstruct_low_rank(U, S, Vt, rank):
    """Reconstruct a matrix from its top ``rank`` singular triplets.

    Args:
        U: Left singular vectors, one per column (as returned by
            ``np.linalg.svd``).
        S: 1-D array of singular values, ordered like the columns of U.
        Vt: Right singular vectors, one per row.
        rank: Number of leading components to keep.

    Returns:
        The product ``U[:, :rank] @ diag(S[:rank]) @ Vt[:rank, :]``.
    """
    # Scaling the kept columns of U by broadcasting gives the same result as
    # multiplying by diag(S[:rank]), without building the rank x rank matrix
    # or paying for an extra matrix product.
    return (U[:, :rank] * S[:rank]) @ Vt[:rank, :]

def relative_error(original, reconstructed):
    """Compute relative reconstruction error (Frobenius norm).

    Args:
        original: Reference matrix.
        reconstructed: Approximation with the same shape as ``original``.

    Returns:
        ``||original - reconstructed|| / ||original||``. When ``original`` has
        zero norm the ratio is undefined, so the result is 0.0 for an exact
        reconstruction and ``inf`` otherwise, never NaN.
    """
    residual_norm = np.linalg.norm(original - reconstructed)
    reference_norm = np.linalg.norm(original)
    if reference_norm == 0:
        # 0/0 would yield NaN, and any "error < target" comparison against NaN
        # is silently False.
        return 0.0 if residual_norm == 0 else float("inf")
    return residual_norm / reference_norm

def create_low_rank_matrix(d=768, true_rank=64, noise_level=0.01):
    """Build a d x d matrix that is a rank-``true_rank`` product plus noise.

    Two Gaussian factors of shape (d, true_rank) and (true_rank, d), each
    divided by sqrt(true_rank), are multiplied together; dense Gaussian noise
    scaled by ``noise_level`` is then added. All draws come from NumPy's
    global random state, in the order: left factor, right factor, noise.
    """
    scale = np.sqrt(true_rank)
    left_factor = np.random.randn(d, true_rank) / scale
    right_factor = np.random.randn(true_rank, d) / scale
    perturbation = np.random.randn(d, d) * noise_level
    return left_factor @ right_factor + perturbation

# Visible confirmation that the helper-definition cell has run.
print("Helper functions defined!")

---

## Exercise Solution: Find the Optimal Rank

### 🧒 ELI5: What We're Doing

> **Imagine you're compressing a photo...**
>
> - Too much compression: Photo looks blurry (high error)
> - Too little compression: File is still huge (no savings)
> - Sweet spot: Photo looks great AND file is small!
>
> We're finding the "sweet spot" for matrix compression!

### The Task

Find the minimum rank needed to achieve less than 1% reconstruction error.

In [None]:
def find_optimal_rank(W, target_error=0.01):
    """
    Return the smallest rank whose truncated-SVD reconstruction is accurate enough.

    The math:
    - SVD factorises W = U @ diag(S) @ Vt
    - The rank-r approximation is W_r = U[:,:r] @ diag(S[:r]) @ Vt[:r,:]
    - Its relative error is ||W - W_r|| / ||W||

    Ranks are tried in increasing order, each one reconstructed explicitly,
    and the first with error < target_error wins.

    Args:
        W: Input matrix
        target_error: Maximum acceptable relative error (default 1%)

    Returns:
        Optimal rank; the full rank len(S) if no rank meets the target
    """
    U, S, Vt = np.linalg.svd(W, full_matrices=False)
    max_rank = len(S)

    def error_at(rank):
        """Relative error of the rank-``rank`` truncation of W."""
        return relative_error(W, reconstruct_low_rank(U, S, Vt, rank))

    # The generator is lazy, so scanning stops at the first qualifying rank;
    # the second argument is the fallback when none qualifies.
    return next(
        (rank for rank in range(1, max_rank + 1) if error_at(rank) < target_error),
        max_rank,
    )

# Visible confirmation that the solution function's cell has run.
print("find_optimal_rank() function defined!")

In [None]:
# Simulated weight matrix: BERT-base hidden size with a planted rank-64 part
d_model = 768
true_rank = 64
W_neural = create_low_rank_matrix(d=d_model, true_rank=true_rank, noise_level=0.01)

# Smallest rank whose reconstruction error is below 1%
optimal_r = find_optimal_rank(W_neural, target_error=0.01)

# A rank-r factorisation stores two factors: (d_model x r) and (r x d_model)
low_rank_params = 2 * d_model * optimal_r

print(
    f"Matrix shape: {W_neural.shape}",
    f"Total parameters: {W_neural.size:,}",
    f"Simulated true rank: {true_rank}",
    f"\nOptimal rank for <1% error: {optimal_r}",
    f"Parameters with low-rank: {low_rank_params:,}",
    f"Compression ratio: {W_neural.size / low_rank_params:.1f}x",
    sep="\n",
)

### Why Does the Optimal Rank Exceed the True Rank?

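We added noise to our simulated matrix! The noise is full rank: its energy is spread across all 768 singular directions, not just the 64 of the low-rank part. Keeping only the top 64 components therefore leaves almost all of the noise in the residual, and the target is met only once enough extra components are kept to push the leftover noise energy below the threshold. With `noise_level=0.01` the noise contributes roughly 8% of the matrix's Frobenius norm (about 7.7 out of about 96), far more than the 1% target, so the optimal rank can end up well above the true rank.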

In real neural networks:
- The effective rank is usually much smaller than the matrix dimension
- But it's not perfectly low-rank due to training noise

In [None]:
# Visualize error vs rank trade-off

# Compute error for a range of ranks
U, S, Vt = np.linalg.svd(W_neural, full_matrices=False)

# Show at least the first 150 ranks, but widen the window when the optimal
# rank lies beyond it; otherwise the 1% crossing and its marker would fall
# outside the plotted range. Never plot past the number of singular values.
max_plot_rank = min(len(S), max(150, optimal_r + 25))

ranks = list(range(1, max_plot_rank))
errors = []
for r in ranks:
    W_approx = reconstruct_low_rank(U, S, Vt, r)
    errors.append(relative_error(W_neural, W_approx) * 100)  # As percentage

# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: linear scale. Right panel: log scale for detail.
axes[0].plot(ranks, errors, 'b-', linewidth=2)
axes[1].semilogy(ranks, errors, 'b-', linewidth=2)

# Markers shared by both panels (added in this order so legends match).
for ax in axes:
    ax.axhline(y=1, color='red', linestyle='--', label='1% error threshold')
    ax.axvline(x=optimal_r, color='green', linestyle='--',
               label=f'Optimal rank = {optimal_r}')

# The planted rank is only marked on the linear panel.
axes[0].axvline(x=true_rank, color='orange', linestyle=':',
               label=f'True rank = {true_rank}')

axes[0].set_ylabel('Reconstruction Error (%)', fontsize=12)
axes[0].set_title('Error vs Rank Trade-off', fontsize=14)
axes[1].set_ylabel('Reconstruction Error (%) - Log Scale', fontsize=12)
axes[1].set_title('Error vs Rank (Log Scale)', fontsize=14)

for ax in axes:
    ax.set_xlabel('Rank', fontsize=12)
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_xlim(0, max_plot_rank)

plt.tight_layout()
plt.show()

---

## Bonus: Efficient Implementation Using Singular Values

We can compute the error without actually reconstructing the matrix!

In [None]:
def find_optimal_rank_efficient(W, target_error=0.01):
    """
    Find optimal rank efficiently using singular values directly.

    Key insight: for the rank-r truncated SVD W_r,
        ||W - W_r||_F^2 = sum(S[r:]^2)
        ||W||_F^2       = sum(S^2)
    so the relative error of every rank follows from the singular values
    alone, with no matrix reconstruction.

    Args:
        W: Input matrix
        target_error: Maximum acceptable relative error (default 1%)

    Returns:
        Smallest rank r >= 1 whose relative error is < target_error, or
        len(S) if no rank qualifies.
    """
    # Only the singular values are needed, so skip computing U and Vt.
    S = np.linalg.svd(W, compute_uv=False)
    energy = S ** 2

    # Total energy (Frobenius norm squared)
    total_energy = np.sum(energy)

    # suffix_energy[i] = sum(S[i:]^2), accumulated from the smallest values up.
    suffix_energy = np.cumsum(energy[::-1])[::-1]

    # Keeping r components leaves S[r:] behind, so the residual of rank r is
    # suffix_energy[r]. suffix_energy[0] is the rank-0 residual and is
    # dropped; rank len(S) keeps everything, so its residual is 0.
    residual_energy = np.append(suffix_energy[1:], 0.0)

    if total_energy > 0:
        relative_errors = np.sqrt(residual_energy / total_energy)
    else:
        # All-zero matrix: every truncation reproduces it exactly.
        relative_errors = np.zeros_like(residual_energy)

    # relative_errors[i] belongs to rank i + 1.
    hits = np.flatnonzero(relative_errors < target_error)
    return int(hits[0]) + 1 if hits.size else len(S)

# Cross-check: the singular-value shortcut should agree with the
# reconstruction-based search on the same matrix and target.
optimal_r_efficient = find_optimal_rank_efficient(W_neural, target_error=0.01)

print(
    f"Original method: rank = {optimal_r}",
    f"Efficient method: rank = {optimal_r_efficient}",
    f"Match: {optimal_r == optimal_r_efficient}",
    sep="\n",
)

In [None]:
# Time comparison
import time

# time.perf_counter() is the clock meant for measuring short durations: it is
# monotonic and uses the highest-resolution timer available, whereas
# time.time() can jump with system clock adjustments.
n_runs = 10

# Original method
start = time.perf_counter()
for _ in range(n_runs):
    _ = find_optimal_rank(W_neural, target_error=0.01)
original_time = (time.perf_counter() - start) / n_runs

# Efficient method
start = time.perf_counter()
for _ in range(n_runs):
    _ = find_optimal_rank_efficient(W_neural, target_error=0.01)
efficient_time = (time.perf_counter() - start) / n_runs

print(f"Original method: {original_time*1000:.2f} ms per call")
print(f"Efficient method: {efficient_time*1000:.2f} ms per call")
print(f"Speedup: {original_time/efficient_time:.1f}x")

---

## Bonus 2: Testing on Different Matrix Types

In [None]:
# Test on different types of matrices

# Planted-rank matrices are generated in increasing rank order, followed by
# the dense Gaussian baseline, so the sequence of random draws is fixed.
test_matrices = {
    f'Low-rank (r={planted})': create_low_rank_matrix(768, true_rank=planted, noise_level=0.005)
    for planted in (16, 64, 128)
}
test_matrices['Random (full rank)'] = np.random.randn(768, 768) / np.sqrt(768)

# One summary record per matrix: rank needed for <1% error and the resulting
# compression of two rank-r factors versus the dense matrix.
results = []
for name, W in test_matrices.items():
    optimal_r = find_optimal_rank_efficient(W, target_error=0.01)
    full_params = W.size
    lora_params = 2 * W.shape[0] * optimal_r
    compression = full_params / lora_params
    results.append(dict(name=name, optimal_rank=optimal_r, compression=compression))

# Table header
print(
    "Optimal Rank Analysis",
    "=" * 60,
    f"{'Matrix Type':<25} {'Optimal Rank':<15} {'Compression':<15}",
    "-" * 60,
    sep="\n",
)

# Table rows
for r in results:
    print("{name:<25} {optimal_rank:<15} {compression:.1f}x".format(**r))

In [None]:
# Visualize singular value spectra for each matrix type

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One colour per matrix, sampled evenly from the tab10 colormap.
colors = plt.cm.tab10(np.linspace(0, 1, len(test_matrices)))

for (name, W), color in zip(test_matrices.items(), colors):
    S = np.linalg.svd(W, compute_uv=False)
    S_normalized = S / S[0]  # Normalize by largest
    leading = S_normalized[:150]
    curve_style = dict(color=color, linewidth=2, label=name, alpha=0.8)

    # Same curve on both panels: linear axis on the left, log axis on the right.
    axes[0].plot(leading, '-', **curve_style)
    axes[1].semilogy(leading, '-', **curve_style)

# Per-panel text; everything else is shared.
panel_text = (
    ('Normalized Singular Value', 'Singular Value Decay (Linear)'),
    ('Normalized Singular Value (Log)', 'Singular Value Decay (Log Scale)'),
)
for ax, (ylabel, title) in zip(axes, panel_text):
    ax.set_xlabel('Singular Value Index', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary printed beneath the spectra plots. The heading's emoji had been
# mis-decoded (UTF-8 bytes read as cp1252); it is restored here.
print("\n📊 Key Observations:")
print("  - Low-rank matrices: Sharp drop after true rank")
print("  - Random matrices: Slow, gradual decay (all components important)")
print("  - This is why LoRA works for neural networks!")

---

## Bonus 3: Memory Savings Calculator for Real Models

In [None]:
def calculate_lora_savings(model_name, d_model, n_layers, lora_rank,
                           apply_to=('q', 'v'), bytes_per_param=4):
    """
    Calculate memory savings from using LoRA on the attention projections.

    Only the four attention projections (Q, K, V, O) are counted, each
    modelled as a d_model x d_model matrix; no other weights are included.

    Args:
        model_name: Name for display
        d_model: Hidden dimension
        n_layers: Number of transformer layers
        lora_rank: LoRA rank
        apply_to: Which projections to apply LoRA to; only the number of
            entries is used
        bytes_per_param: Bytes per parameter (4 for fp32, 2 for fp16)

    Returns:
        dict with keys 'model', 'full_params', 'lora_params',
        'savings_percent', 'full_memory_mb' and 'lora_memory_mb'
        (memory in units of 1e6 bytes).

    Raises:
        ZeroDivisionError: If d_model or n_layers is 0, since the savings
            ratio divides by the full parameter count.
    """
    # The default for apply_to is an immutable tuple: a list default would be
    # one shared object reused by every call.

    # Standard attention has 4 projections: Q, K, V, O
    # Each is d_model x d_model
    params_per_projection = d_model * d_model

    # Full fine-tuning trains all four projections in every layer
    full_trainable = 4 * params_per_projection * n_layers

    # LoRA trains two factors per adapted projection:
    # (d_model x lora_rank) and (lora_rank x d_model)
    lora_per_projection = 2 * d_model * lora_rank
    lora_trainable = len(apply_to) * lora_per_projection * n_layers

    # Memory
    full_memory_mb = full_trainable * bytes_per_param / 1e6
    lora_memory_mb = lora_trainable * bytes_per_param / 1e6

    return {
        'model': model_name,
        'full_params': full_trainable,
        'lora_params': lora_trainable,
        'savings_percent': (1 - lora_trainable/full_trainable) * 100,
        'full_memory_mb': full_memory_mb,
        'lora_memory_mb': lora_memory_mb
    }

# Common models as (name, hidden dimension, number of layers)
models = [
    ('BERT-base', 768, 12),
    ('BERT-large', 1024, 24),
    ('GPT-2', 768, 12),
    ('GPT-2 Medium', 1024, 24),
    ('LLaMA-7B', 4096, 32),
    ('LLaMA-13B', 5120, 40),
    ('LLaMA-70B', 8192, 80),
]

# LoRA ranks to try (the table below is fixed at rank 16)
lora_ranks = [8, 16, 32, 64]

# Table header
print(
    "LoRA Memory Savings (rank=16, applied to Q and V)",
    "=" * 70,
    f"{'Model':<15} {'Full Params':<15} {'LoRA Params':<15} {'Savings':<12} {'LoRA Size':<12}",
    "-" * 70,
    sep="\n",
)

# One row per model; print's default single-space separator joins the columns.
for model_name, d_model, n_layers in models:
    result = calculate_lora_savings(model_name, d_model, n_layers, lora_rank=16)
    print(
        f"{result['model']:<15}",
        f"{result['full_params']:>13,}",
        f"{result['lora_params']:>13,}",
        f"{result['savings_percent']:>10.1f}%",
        f"{result['lora_memory_mb']:>10.1f} MB",
    )

In [None]:
# Visualize savings for LLaMA-7B with different ranks

ranks = [4, 8, 16, 32, 64, 128, 256]

# One savings record per rank, in the same order as `ranks`.
savings_data = [
    calculate_lora_savings('LLaMA-7B', 4096, 32, lora_rank=rank)
    for rank in ranks
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
savings_ax, size_ax = axes

# Bars are categorical, so ranks are used as string tick labels.
rank_labels = list(map(str, ranks))

# Savings percentage
savings_ax.bar(rank_labels,
               [entry['savings_percent'] for entry in savings_data],
               color='green', alpha=0.7)
savings_ax.axhline(y=95, color='red', linestyle='--', label='95% threshold')
savings_ax.set_xlabel('LoRA Rank', fontsize=12)
savings_ax.set_ylabel('Memory Savings (%)', fontsize=12)
savings_ax.set_title('LLaMA-7B: LoRA Memory Savings by Rank', fontsize=14)
savings_ax.legend()
savings_ax.set_ylim(80, 100)
savings_ax.grid(True, alpha=0.3, axis='y')

# LoRA adapter size
size_ax.bar(rank_labels,
            [entry['lora_memory_mb'] for entry in savings_data],
            color='steelblue', alpha=0.7)
size_ax.set_xlabel('LoRA Rank', fontsize=12)
size_ax.set_ylabel('LoRA Adapter Size (MB)', fontsize=12)
size_ax.set_title('LLaMA-7B: LoRA Adapter Size by Rank', fontsize=14)
size_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Look results up by rank rather than by list position, so the report cannot
# silently describe the wrong rank if the `ranks` list is edited.
savings_by_rank = dict(zip(ranks, savings_data))
rank16 = savings_by_rank[16]
rank64 = savings_by_rank[64]

# The heading's emoji had been mis-decoded (UTF-8 bytes read as cp1252); it is
# restored here. The full-parameter count is the same for every rank.
print("\n📊 LLaMA-7B LoRA Insights:")
print(f"  - Full attention fine-tuning: {savings_data[0]['full_params']/1e9:.2f}B params")
print(f"  - Rank 16 LoRA: {rank16['lora_params']/1e6:.1f}M params ({rank16['savings_percent']:.1f}% savings)")
print(f"  - Rank 64 LoRA: {rank64['lora_params']/1e6:.1f}M params ({rank64['savings_percent']:.1f}% savings)")

---

## Key Takeaways

1. **Finding optimal rank** is straightforward:
   - Perform SVD
   - Try ranks from 1 to full
   - Return first rank with error < threshold

2. **Efficient implementation** uses singular values directly:
   - Error can be computed from residual singular values
   - No need to reconstruct the full matrix
   - Much faster for large matrices

3. **Low-rank matrices** require small rank for good approximation
   - Neural network weights tend to be low-rank after training
   - This is why LoRA works so well!

4. **Real-world savings** are dramatic:
   - LLaMA-7B: ~99.6% fewer trainable attention parameters with rank 16 on Q and V (about 8.4M vs 2.15B)
   - LoRA adapters are typically 10-100MB vs GB for full fine-tuning

In [None]:
import gc

# Run a garbage-collection pass before finishing.
gc.collect()
# The check-mark emoji had been mis-decoded (UTF-8 bytes read as cp1252); it
# is restored here.
print("\n✅ Solution notebook complete!")