In [None]:
# ==============================================================================
# MEMORY FOOTPRINT EVALUATION - TWO-WHEELER MODEL (BSA)
# Metrics: GPU Memory, Model Size, Parameter Count, Memory Breakdown
# ==============================================================================

# ========== CELL 1: Install Packages ==========
!pip install -q accelerate bitsandbytes peft transformers

# Note: After running Cell 1, restart the runtime, then run Cells 2-11 (Cell 12 is optional).


# ========== CELL 2: Import Libraries ==========
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import pandas as pd
import gc


# ========== CELL 3: Memory Utility Functions ==========
def get_gpu_memory(device=None):
    """Return a snapshot of CUDA memory statistics, expressed in GiB.

    Args:
        device: Optional CUDA device (index, string or ``torch.device``) to
            inspect. ``None`` (the default) selects the current CUDA device.

    Returns:
        dict with the keys:
            'allocated_gb': memory currently occupied by tensors,
            'reserved_gb': memory held by PyTorch's caching allocator,
            'max_allocated_gb': peak tensor memory since the last
                peak-statistics reset,
            'total_gpu_gb': total memory of the inspected device.
        Every value is 0 when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return {'allocated_gb': 0, 'reserved_gb': 0, 'max_allocated_gb': 0, 'total_gpu_gb': 0}

    # Resolve the device once so the capacity figure describes the same GPU
    # as the usage counters (previously capacity was always read from GPU 0
    # while the counters followed the current device).
    if device is None:
        device = torch.cuda.current_device()

    bytes_per_gib = 1024**3
    return {
        'allocated_gb': torch.cuda.memory_allocated(device) / bytes_per_gib,
        'reserved_gb': torch.cuda.memory_reserved(device) / bytes_per_gib,
        'max_allocated_gb': torch.cuda.max_memory_allocated(device) / bytes_per_gib,
        'total_gpu_gb': torch.cuda.get_device_properties(device).total_memory / bytes_per_gib,
    }

def get_model_size(model):
    """Report how many bytes the tensors held by ``model`` occupy.

    The size of each tensor is ``nelement() * element_size()``; the totals are
    accumulated separately over ``model.parameters()`` and ``model.buffers()``.

    Returns:
        dict with 'param_size_mb', 'buffer_size_mb' and 'total_size_mb'
        (bytes / 1024**2) plus 'total_size_gb' (bytes / 1024**3).
    """
    def _tensor_bytes(tensors):
        # Storage footprint of an iterable of tensors, in bytes.
        return sum(t.nelement() * t.element_size() for t in tensors)

    params_bytes = _tensor_bytes(model.parameters())
    buffers_bytes = _tensor_bytes(model.buffers())
    combined_bytes = params_bytes + buffers_bytes

    mib = 1024**2
    gib = 1024**3
    return {
        'param_size_mb': params_bytes / mib,
        'buffer_size_mb': buffers_bytes / mib,
        'total_size_mb': combined_bytes / mib,
        'total_size_gb': combined_bytes / gib,
    }

def count_parameters(model):
    """Count the total and trainable parameters of ``model``.

    A parameter is counted as trainable when ``requires_grad`` is set.

    bitsandbytes 4-bit weights (parameter class ``Params4bit``) store two
    4-bit values per byte of packed storage, so ``numel()`` reports the size
    of the packed storage rather than the logical number of weights. For
    those parameters the count is scaled by ``2 * element_size()``, which is
    the same correction PEFT applies when it reports parameter counts.

    Returns:
        dict with 'total_params' and 'trainable_params' (ints) and the same
        two figures divided by 1e6 under 'total_params_millions' and
        'trainable_params_millions'.
    """
    total_params = 0
    trainable_params = 0
    # Single pass over the parameters instead of iterating the model twice.
    for param in model.parameters():
        count = param.numel()
        # Matched by class name so bitsandbytes need not be imported here.
        if type(param).__name__ == "Params4bit":
            count *= 2 * param.element_size()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    return {
        'total_params': total_params,
        'trainable_params': trainable_params,
        'total_params_millions': total_params / 1e6,
        'trainable_params_millions': trainable_params / 1e6,
    }

def clear_memory():
    """Run garbage collection and, when CUDA is available, reset GPU caches.

    With CUDA present this also empties PyTorch's CUDA cache and resets the
    peak-memory statistics; without CUDA only the garbage collector runs.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    cuda = torch.cuda
    cuda.empty_cache()
    cuda.reset_peak_memory_stats()


# ========== CELL 4: Clear Memory and Get Baseline ==========
# Take the reference snapshot right after clearing memory; the summary cell
# reports later readings relative to it.
clear_memory()
baseline_memory = get_gpu_memory()

print(
    "=" * 70,
    "MEMORY FOOTPRINT MEASUREMENT - TWO-WHEELER MODEL",
    "=" * 70,
    sep="\n",
)
print(
    f"\nBaseline GPU Memory: {baseline_memory['allocated_gb']:.4f} GB",
    f"Total GPU Memory: {baseline_memory['total_gpu_gb']:.2f} GB\n",
    sep="\n",
)


# ========== CELL 5: Configure Model ==========
# Identifiers handed to the `from_pretrained` calls in the following cells:
# the base checkpoint and the LoRA adapter that is applied on top of it.
base_model_name = "google/gemma-2-2b"
adapter_name = "Prithwiraj731/Gemma2-2b_Two-Wheeler"

print(
    "Model Configuration:",
    f"  Adapter: {adapter_name}",
    f"  Base Model: {base_model_name}\n",
    sep="\n",
)


# ========== CELL 6: Load Tokenizer ==========
# The tokenizer is loaded from the adapter identifier, not the base model one.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(adapter_name)

# Snapshot GPU memory immediately after the tokenizer has been created.
after_tokenizer = get_gpu_memory()
print("After tokenizer: {:.4f} GB\n".format(after_tokenizer['allocated_gb']))


# ========== CELL 7: Load Base Model ==========
print("Loading base model with 4-bit quantization...")

# 4-bit NF4 weight quantization with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config,
)

# Record GPU usage first, then the statistics of the base model alone
# (before the adapter is attached in the next cell).
after_base = get_gpu_memory()
base_params = count_parameters(base_model)
base_model_size = get_model_size(base_model)

print(
    f"After base model: {after_base['allocated_gb']:.4f} GB",
    f"Base model size: {base_model_size['total_size_mb']:.2f} MB",
    f"Base parameters: {base_params['total_params_millions']:.2f} M\n",
    sep="\n",
)


# ========== CELL 8: Load LoRA Adapter ==========
# Attach the LoRA adapter to the quantized base model and switch to eval mode.
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_name)
model.eval()

# generate() in the inference cell is given pad_token_id, so make sure one
# exists: fall back to the EOS token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

after_adapter = get_gpu_memory()
full_model_size = get_model_size(model)
full_params = count_parameters(model)

# PeftModel.from_pretrained defaults to is_trainable=False, which freezes the
# adapter weights, so a requires_grad-based count reports 0 trainable
# parameters. In that case report the LoRA weights themselves (their parameter
# names contain "lora_") so the "Trainable" figures describe the adapter size
# instead of always showing zero.
if full_params['trainable_params'] == 0:
    lora_param_count = sum(
        param.numel()
        for param_name, param in model.named_parameters()
        if "lora_" in param_name
    )
    full_params['trainable_params'] = lora_param_count
    full_params['trainable_params_millions'] = lora_param_count / 1e6

print(f"After adapter: {after_adapter['allocated_gb']:.4f} GB")
print(f"Full model size: {full_model_size['total_size_mb']:.2f} MB")
print(f"Trainable parameters: {full_params['trainable_params_millions']:.2f} M\n")


# ========== CELL 9: Run Inference to Measure Peak Memory ==========
print("Running inference to measure peak memory...")
test_prompt = "What is the recommended lubrication for the engine?\n"
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

# The peak statistics were last reset in the baseline cell, before any model
# was loaded. Reset them again here so the peak read below belongs to
# generation itself and not to a transient allocation made during loading.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        # Greedy decoding. `temperature` is a sampling-only option: combined
        # with do_sample=False it is ignored and only triggers a warning, so
        # it is not passed.
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2
    )

after_inference = get_gpu_memory()
print(f"After inference: {after_inference['allocated_gb']:.4f} GB")
print(f"Peak memory: {after_inference['max_allocated_gb']:.4f} GB\n")


# ========== CELL 10: Display Summary Results ==========
# Prints every snapshot taken so far and derives per-stage deltas. The
# *_overhead names defined here are reused by the DataFrame cell.
print("="*70)
print("TWO-WHEELER MODEL - MEMORY FOOTPRINT RESULTS")
print("="*70)

print("\nGPU Memory Usage:")
print(f"  Baseline:              {baseline_memory['allocated_gb']:.4f} GB")
print(f"  After Tokenizer:       {after_tokenizer['allocated_gb']:.4f} GB")
print(f"  After Base Model:      {after_base['allocated_gb']:.4f} GB")
print(f"  After LoRA Adapter:    {after_adapter['allocated_gb']:.4f} GB")
print(f"  After Inference:       {after_inference['allocated_gb']:.4f} GB")
print(f"  Peak Memory:           {after_inference['max_allocated_gb']:.4f} GB")
print(f"  Reserved Memory:       {after_inference['reserved_gb']:.4f} GB")
print(f"  Total GPU Capacity:    {baseline_memory['total_gpu_gb']:.2f} GB")

print("\nModel Size:")
print(f"  Model in Memory:       {full_model_size['total_size_mb']:.2f} MB ({full_model_size['total_size_gb']:.4f} GB)")
print(f"  Parameters:            {full_model_size['param_size_mb']:.2f} MB")
print(f"  Buffers:               {full_model_size['buffer_size_mb']:.2f} MB")

# Guard the ratio: a model with no parameters would otherwise divide by zero.
total_param_count = full_params['total_params']
trainable_ratio = (
    full_params['trainable_params'] / total_param_count * 100
    if total_param_count else 0.0
)

print("\nParameter Count:")
print(f"  Total Parameters:      {full_params['total_params_millions']:.2f} M ({full_params['total_params']:,})")
print(f"  Trainable (LoRA):      {full_params['trainable_params_millions']:.2f} M ({full_params['trainable_params']:,})")
print(f"  Frozen:                {(full_params['total_params_millions'] - full_params['trainable_params_millions']):.2f} M")
print(f"  Trainable Ratio:       {trainable_ratio:.2f}%")

# Each overhead is the increase in allocated memory over the previous stage;
# the inference overhead uses the peak reading instead of the final one.
tokenizer_overhead = after_tokenizer['allocated_gb'] - baseline_memory['allocated_gb']
base_overhead = after_base['allocated_gb'] - after_tokenizer['allocated_gb']
adapter_overhead = after_adapter['allocated_gb'] - after_base['allocated_gb']
inference_overhead = after_inference['max_allocated_gb'] - after_adapter['allocated_gb']

print("\nMemory Breakdown:")
print(f"  Tokenizer Overhead:    {tokenizer_overhead:.4f} GB ({tokenizer_overhead*1024:.2f} MB)")
print(f"  Base Model Memory:     {base_overhead:.4f} GB ({base_overhead*1024:.2f} MB)")
print(f"  LoRA Adapter Overhead: {adapter_overhead:.4f} GB ({adapter_overhead*1024:.2f} MB)")
print(f"  Inference Overhead:    {inference_overhead:.4f} GB ({inference_overhead*1024:.2f} MB)")

# get_gpu_memory() reports a total of 0 when CUDA is unavailable; report 0%
# utilization in that case instead of raising ZeroDivisionError.
total_gpu_gb = baseline_memory['total_gpu_gb']
gpu_utilization = (
    (after_inference['max_allocated_gb'] / total_gpu_gb) * 100
    if total_gpu_gb else 0.0
)
print(f"\nGPU Utilization:       {gpu_utilization:.2f}%")


# ========== CELL 11: Memory Usage DataFrame ==========
# One tuple per stage: (label, allocated GiB, delta in MiB). The deltas reuse
# the overheads computed in the summary cell; the "Peak" row's delta is the
# peak reading relative to the post-adapter reading.
# Values are stored as pre-formatted strings, so the table shows a fixed
# number of decimal places.
stage_rows = [
    ('Baseline', baseline_memory['allocated_gb'], 0.0),
    ('After Tokenizer', after_tokenizer['allocated_gb'], tokenizer_overhead*1024),
    ('After Base Model', after_base['allocated_gb'], base_overhead*1024),
    ('After LoRA', after_adapter['allocated_gb'], adapter_overhead*1024),
    (
        'After Inference',
        after_inference['allocated_gb'],
        (after_inference['allocated_gb'] - after_adapter['allocated_gb'])*1024,
    ),
    ('Peak', after_inference['max_allocated_gb'], inference_overhead*1024),
]

memory_df = pd.DataFrame({
    'Stage': [label for label, _, _ in stage_rows],
    'Allocated (GB)': [f"{allocated:.4f}" for _, allocated, _ in stage_rows],
    'Delta (MB)': [f"{delta:.2f}" for _, _, delta in stage_rows],
})

print(
    "\n" + "="*70,
    "DETAILED MEMORY USAGE TABLE",
    "="*70,
    sep="\n",
)
display(memory_df)


# ========== CELL 12: Save Results (Optional) ==========
# Uncomment to save and download results

# memory_df.to_csv('memory_footprint_2wheeler_results.csv', index=False)
# print("\nResults saved to 'memory_footprint_2wheeler_results.csv'")

# from google.colab import files
# files.download('memory_footprint_2wheeler_results.csv')

