# Lab 1.1.2: Memory Architecture Lab - SOLUTIONS

This notebook contains solutions to the exercises in the Memory Architecture Lab.

---

## Try It Yourself #1 Solution

**Task:** Try allocating larger tensors (60-100 GB). What's the maximum you can allocate?

In [None]:
import torch
import gc
import time

def clear_memory():
    """Run Python garbage collection, then reset CUDA cache and peak stats.

    The CUDA steps are skipped entirely when no CUDA device is available.
    """
    gc.collect()

    # Nothing GPU-related to reset on a machine without CUDA.
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

def allocate_tensor_gb(size_gb: float, dtype=torch.float32, device='cuda'):
    """
    Allocate an uninitialized 1-D tensor of roughly ``size_gb`` gigabytes.

    Args:
        size_gb: Requested size in decimal gigabytes (1 GB = 1e9 bytes).
        dtype: Element type of the tensor.
        device: Device to allocate on. Defaults to 'cuda'.

    Returns:
        Tuple of (tensor, actual_size_gb, elapsed_seconds). The actual size
        can be slightly below the request because the element count is
        truncated to a whole number.

    Raises:
        RuntimeError: Propagated from torch.empty when the allocation fails.
    """
    bytes_per_element = torch.tensor([], dtype=dtype).element_size()
    num_elements = int(size_gb * 1e9 / bytes_per_element)

    # perf_counter is monotonic, so the measured interval cannot go negative
    # if the system clock is adjusted while we are timing.
    start_time = time.perf_counter()
    tensor = torch.empty(num_elements, dtype=dtype, device=device)
    if tensor.device.type == 'cuda':
        # Wait for pending GPU work so the timing covers the allocation itself.
        torch.cuda.synchronize(tensor.device)
    elapsed = time.perf_counter() - start_time

    actual_size = tensor.element_size() * tensor.nelement() / 1e9
    return tensor, actual_size, elapsed

# Solution: Test large allocations
# Sizes are tried in ascending order, so the last success is also the largest.
large_sizes = [60, 70, 80, 90, 100]

print("Large Tensor Allocation Test")
print("=" * 50)
print(f"{'Size (GB)':<12} {'Status':<15} {'Time (s)':<12}")
print("-" * 39)

max_successful = 0

for size_gb in large_sizes:
    # Start every attempt from a clean slate so earlier tensors don't count
    # against the current allocation.
    clear_memory()

    try:
        tensor, actual_size, alloc_time = allocate_tensor_gb(size_gb)
    except RuntimeError as e:
        print(f"{size_gb:<12} {'FAILED':<15} -")
        # Show why it failed: an out-of-memory error and, e.g., a missing
        # CUDA driver would otherwise look identical in the table.
        print(f"  Error: {str(e)[:60]}...")
    else:
        print(f"{size_gb:<12} {'SUCCESS':<15} {alloc_time:.3f}")
        max_successful = size_gb
        # Drop the reference right away so the next clear_memory() can free it.
        del tensor

clear_memory()
print(f"\nMaximum successful allocation: {max_successful} GB")

**Expected Results:**
- On a fresh system with cleared buffer cache, you should be able to allocate ~100GB
- The exact limit depends on:
  - Current buffer cache usage (run `sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'` to clear; writing to `drop_caches` requires root)
  - Other running processes
  - PyTorch memory overhead

**Typical output:**
```
Size (GB)    Status          Time (s)    
---------------------------------------
60           SUCCESS         0.015
70           SUCCESS         0.018
80           SUCCESS         0.021
90           SUCCESS         0.024
100          SUCCESS         0.027

Maximum successful allocation: 100 GB
```

---

## Try It Yourself #2 Solution

**Task:** Calculate memory requirements for a 70B model in different precisions.

In [None]:
# Solution: Calculate memory requirements

params_70b = 70_000_000_000  # 70 billion parameters

# Memory calculation for different precisions
# Formula: memory = num_params * bytes_per_param

precisions = {
    "FP32 (float32)": 4,       # 4 bytes per parameter
    "FP16 (float16)": 2,       # 2 bytes per parameter
    "BF16 (bfloat16)": 2,      # 2 bytes per parameter
    "INT8 (int8)": 1,          # 1 byte per parameter
    "INT4 (int4)": 0.5,        # 0.5 bytes per parameter
}

# The table asks "Fits in 128GB?", but we budget 120 GB to leave some headroom.
usable_memory_gb = 120

# Weight memory in decimal GB (1 GB = 1e9 bytes) for each precision.
memory_gb_by_precision = {
    name: params_70b * bytes_per_param / 1e9
    for name, bytes_per_param in precisions.items()
}

header = f"{'Precision':<20} {'Bytes/Param':<15} {'Total Memory':<15} {'Fits in 128GB?'}"

print("Memory Requirements for 70B Parameter Model")
print("=" * len(header))
print(header)
print("-" * len(header))

for name, bytes_per_param in precisions.items():
    memory_gb = memory_gb_by_precision[name]
    fits = "YES" if memory_gb < usable_memory_gb else "NO"

    # Format the value first, then pad it, so the last column stays aligned
    # regardless of how many digits the memory figure has.
    memory_text = f"{memory_gb:.1f} GB"
    print(f"{name:<20} {bytes_per_param:<15} {memory_text:<15} {fits}")

print("\n" + "=" * len(header))
print("\nKey Insights:")
print("- FP32 requires 280GB - DOES NOT fit")
# 140 GB is above the 128 GB total, so 16-bit weights alone cannot fit.
print("- FP16/BF16 requires 140GB - DOES NOT fit (exceeds 128GB)")
print("- INT8 requires 70GB - FITS COMFORTABLY")
print("- INT4 requires 35GB - FITS WITH ROOM TO SPARE")
print("\nThis is why quantization is essential for running large models!")

**Additional Considerations:**

1. **Inference Memory Overhead:**
   - KV Cache: Additional memory for attention caching
   - Activation memory: Intermediate computation results
   - Typically add 10-20% overhead

2. **Training Memory Overhead:**
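   - Gradients: Same size as the model weights (weights + gradients = 2x)
   - Optimizer states: Adam keeps two extra values per parameter (momentum and variance), adding another 2x (4x total including weights and gradients)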
   - Activations for backprop: Variable, can be huge
   - **Training 70B on DGX Spark**: Requires gradient checkpointing, LoRA, or similar

---

## Challenge Solution

**Task:** Create a memory stress test that allocates until failure.

In [None]:
def memory_stress_test(increment_gb: float = 5.0, max_attempts: int = 30, device: str = 'cuda'):
    """
    Stress test device memory by allocating tensors until failure.

    Float32 tensors of ``increment_gb`` each are allocated and kept alive
    until an allocation raises RuntimeError or ``max_attempts`` is reached.
    All tensors are released before returning.

    Args:
        increment_gb: Size of each allocation in GB (1 GB = 1e9 bytes)
        max_attempts: Maximum number of allocations to attempt
        device: Device to allocate on. Defaults to 'cuda'.

    Returns:
        Maximum memory allocated in GB
    """
    import torch
    import gc

    use_cuda = torch.device(device).type == 'cuda'

    def release_cached_memory():
        # Collect unreachable objects first, then clear PyTorch's CUDA cache.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("Memory Stress Test")
    print("=" * 40)
    print(f"Increment: {increment_gb} GB per allocation")
    print("-" * 40)

    tensors = []
    total_gb = 0

    # Every allocation has the same size, so compute it once.
    bytes_per_element = 4  # float32
    num_elements = int(increment_gb * 1e9 / bytes_per_element)

    # Clear before starting
    release_cached_memory()

    try:
        for i in range(max_attempts):
            # Append directly: a separate local name would keep the last
            # tensor alive during cleanup and stop its memory being released.
            tensors.append(torch.empty(num_elements, dtype=torch.float32, device=device))

            total_gb += increment_gb

            # Reserved bytes are only meaningful for CUDA allocations.
            reserved = torch.cuda.memory_reserved() / 1e9 if use_cuda else 0.0

            print(f"  Allocation {i+1}: +{increment_gb} GB (Total: {total_gb:.1f} GB, Reserved: {reserved:.1f} GB)")

    except RuntimeError as e:
        print(f"\n  ALLOCATION FAILED at {total_gb + increment_gb:.1f} GB")
        print(f"  Error: {str(e)[:60]}...")

    finally:
        # Cleanup: the list holds the only references, so clearing it lets
        # the tensors be freed before the cache is emptied.
        print("\nCleaning up...")
        tensors.clear()
        release_cached_memory()

    print(f"\n✅ Maximum successful allocation: {total_gb:.1f} GB")
    return total_gb

# Run the stress test
# Note: This may take a while and use a lot of memory!
# max_mem = memory_stress_test(increment_gb=10.0)

In [None]:
# Alternative: Binary search for maximum allocation
def find_max_allocation(low_gb: float = 50, high_gb: float = 120,
                        tolerance_gb: float = 1.0, device: str = 'cuda') -> float:
    """
    Binary search to find maximum single allocation size.

    Only midpoints of the current [low_gb, high_gb] interval are probed;
    the bounds themselves are never allocated. If no probe succeeds (or the
    interval is already narrower than ``tolerance_gb``), 0.0 is returned.

    Args:
        low_gb: Lower bound of the search interval in GB (1 GB = 1e9 bytes)
        high_gb: Upper bound of the search interval in GB
        tolerance_gb: Stop once the interval is no wider than this
        device: Device to allocate on. Defaults to 'cuda'.

    Returns:
        Maximum allocation size in GB

    Raises:
        ValueError: If tolerance_gb is not positive.
    """
    import torch
    import gc

    if tolerance_gb <= 0:
        # A zero-width tolerance could stop the interval from ever shrinking.
        raise ValueError("tolerance_gb must be positive")

    def release_cached_memory():
        # Collect unreachable objects first, then clear PyTorch's CUDA cache.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("Finding maximum single allocation...")
    print("-" * 40)

    release_cached_memory()

    max_successful = 0.0

    while high_gb - low_gb > tolerance_gb:
        mid = (low_gb + high_gb) / 2

        # Each probe starts from a clean cache so earlier probes don't count.
        release_cached_memory()

        try:
            num_elements = int(mid * 1e9 / 4)  # 4 bytes per float32
            tensor = torch.empty(num_elements, dtype=torch.float32, device=device)
        except RuntimeError:
            print(f"  {mid:.1f} GB: FAILED")
            high_gb = mid
        else:
            del tensor
            print(f"  {mid:.1f} GB: SUCCESS")
            max_successful = mid
            low_gb = mid

    release_cached_memory()

    print(f"\n✅ Maximum single allocation: ~{max_successful:.0f} GB")
    return max_successful

# Uncomment to run:
# find_max_allocation()

---

## Key Takeaways

1. **Unified Memory** allows allocations up to ~100GB on DGX Spark
2. **Buffer cache** must be cleared before large model loading
3. **BFloat16** is the recommended dtype for DGX Spark (native Blackwell support)
4. **70B models** fit in INT4/INT8 quantization easily
5. **Memory overhead** from PyTorch caching is typically 5-10%

---

## Cleanup

In [None]:
# Final cleanup: collect garbage, then clear the CUDA cache if a GPU is present
import gc
import torch

# Collect unreachable Python objects first
gc.collect()

# Only touch the CUDA runtime when a device is actually usable
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

print("Cleanup complete!")