# Model Optimization Benchmarks

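This notebook benchmarks several inference optimization techniques (reduced-precision FP16 / dynamic INT8 quantization and batching) and shows their impact on latency, throughput, and memory usage.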

In [None]:
import sys
sys.path.append('..')

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from src.optimization import benchmark_inference
from src.utils import calculate_model_size, MemoryTracker
import matplotlib.pyplot as plt
import numpy as np

## 1. Load Models

In [None]:
# Base checkpoint and its matching tokenizer.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Report the model's name, size, and parameter count, one line per field.
model_info = calculate_model_size(model)
summary_lines = (
    f"Model: {model_name}",
    f"Size: {model_info['total_size_mb']:.1f}MB",
    f"Parameters: {model_info['total_params']:,}",
)
for summary_line in summary_lines:
    print(summary_line)

## 2. Quantization Comparison

In [None]:
import copy

# Test texts: three short sentences repeated 10 times (30 inputs in total).
test_texts = [
    "This is a positive review.",
    "This is a negative review.",
    "This is a neutral statement."
] * 10

# Benchmark the unmodified model as the FP32 baseline.
fp32_time, fp32_mem = benchmark_inference(
    model, tokenizer, test_texts, "FP32 Model"
)

# Reduced-precision variant: FP16 when CUDA is available, dynamic INT8 otherwise.
if torch.cuda.is_available():
    # nn.Module.half() converts the module in place and returns the same
    # object, so calling it on `model` directly would silently turn the FP32
    # baseline into FP16 for every later cell that reuses `model`.
    # Convert a deep copy so the baseline stays FP32.
    model_fp16 = copy.deepcopy(model).half()
    fp16_time, fp16_mem = benchmark_inference(
        model_fp16, tokenizer, test_texts, "FP16 Model"
    )
else:
    # Dynamic INT8 quantization of the Linear layers for CPU inference.
    model_int8 = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )
    int8_time, int8_mem = benchmark_inference(
        model_int8, tokenizer, test_texts, "INT8 Model"
    )

## 3. Batch Size Impact

In [None]:
from itertools import cycle, islice

# Measure throughput (samples per second) for a range of batch sizes.
batch_sizes = [1, 2, 4, 8, 16, 32]
throughputs = []

for batch_size in batch_sizes:
    # A plain slice (test_texts[:batch_size]) returns a short batch whenever
    # test_texts has fewer items than batch_size, which would make the plot's
    # x-axis disagree with the batch actually benchmarked. Repeating the texts
    # cyclically guarantees exactly batch_size inputs, and yields the same
    # batch as the slice whenever enough texts are available.
    test_batch = list(islice(cycle(test_texts), batch_size))
    time_taken, _ = benchmark_inference(
        model, tokenizer, test_batch, f"Batch size {batch_size}"
    )
    throughput = len(test_batch) / time_taken
    throughputs.append(throughput)

# Plot throughput against batch size.
plt.figure(figsize=(10, 6))
plt.plot(batch_sizes, throughputs, 'b-o')
plt.xlabel('Batch Size')
plt.ylabel('Throughput (samples/sec)')
plt.title('Throughput vs Batch Size')
plt.grid(True)
plt.show()

## 4. Memory Profiling

In [None]:
# Track memory across tokenization, the optional GPU transfer, and one forward pass.
with MemoryTracker() as tracker:
    inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

    if torch.cuda.is_available():
        # Move both the encoded batch and the model onto the GPU.
        inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
        model = model.cuda()

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model(**inputs)

memory_used_mb = tracker.get_memory_used()
print(f"Peak memory usage: {memory_used_mb:.2f}MB")

## 5. Optimization Summary

In [None]:
# Bar chart comparing the techniques; the speedup figures are hard-coded examples.
techniques = ['FP32', 'FP16/INT8', 'Batching']
speedups = [1.0, 2.0, 5.0]  # Example values
bar_colors = ['blue', 'green', 'orange']

plt.figure(figsize=(10, 6))
bars = plt.bar(techniques, speedups, color=bar_colors)
plt.ylabel('Speedup Factor')
plt.title('Optimization Technique Comparison')
plt.ylim(0, 6)

# Annotate each bar with its value, centred horizontally just above the bar top.
for rect, factor in zip(bars, speedups):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height() + 0.1
    plt.text(label_x, label_y, f'{factor}x', ha='center', va='bottom')

plt.show()