<a href="https://colab.research.google.com/github/PETEROA/ML_Optim/blob/main/EdgeML_Optim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Edge ML Optimisation Demo

Author: Peter Agida

Objective: Compress image classification model for mobile deployment

Target: 5-10x speedup, <2% accuracy loss



In [None]:
!pip install torch torchvision
!pip install onnx onnxruntime
!pip install tensorflow
!pip install plotly
!pip install ptflops
!pip install torch-pruning

Collecting onnx
  Downloading onnx-1.20.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnx-1.20.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (18.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m18.1/18.1 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [None]:
import copy
import os
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import onnx
import onnxruntime
import pandas as pd
import plotly.graph_objects as go
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from plotly.subplots import make_subplots
from ptflops import get_model_complexity_info
from torch.utils.data import DataLoader

In [None]:
# Seed every RNG used by this notebook from one shared value
SEED = 42
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(SEED)

# Environment report
cuda_ready = torch.cuda.is_available()
print(
    "Setup complete!",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {cuda_ready}",
    sep="\n",
)

Setup complete!
PyTorch version: 2.9.0+cpu
CUDA available: False


In [None]:
# Dataset Preparation
print("Preparing dataset...")

# CIFAR-10 keeps the demo quick (can scale to ImageNet later).
# Both pipelines share the same per-channel normalisation statistics.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)
normalize = transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD)

# Training pipeline: random crop + horizontal flip augmentation, then tensor + normalise
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Test pipeline: no augmentation
transform_test = transforms.Compose([transforms.ToTensor(), normalize])

# Load datasets (downloaded into ./data on first use)
train_dataset, test_dataset = (
    torchvision.datasets.CIFAR10(
        root='./data', train=is_train, download=True, transform=pipeline
    )
    for is_train, pipeline in ((True, transform_train), (False, transform_test))
)

# Only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=2)

n_train, n_test = len(train_dataset), len(test_dataset)
print(f" Dataset loaded: {n_train} train, {n_test} test samples")


Preparing dataset...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 170M/170M [00:05<00:00, 33.0MB/s]


 Dataset loaded: 50000 train, 10000 test samples


In [None]:
# Baseline Model (Teacher)
class TeacherModel(nn.Module):
    """Large teacher model - torchvision ResNet18 modified for CIFAR-10.

    The network is built without pretrained weights and with a 10-class head.
    Its first convolution is swapped for a 3x3, stride-1 layer and its max-pool
    is replaced by an identity so that 32x32 inputs are not downsampled by the
    stem.
    """
    def __init__(self):
        super().__init__()
        # `weights=None` is the current spelling of "random initialisation";
        # the older `pretrained=False` flag is deprecated in torchvision.
        self.model = torchvision.models.resnet18(weights=None, num_classes=10)
        # Modify first conv for CIFAR-10 (32x32 images)
        self.model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.model.maxpool = nn.Identity()

    def forward(self, x):
        """Return the class logits produced by the wrapped ResNet for batch `x`."""
        return self.model(x)

# Choose the compute device (GPU when available, CPU otherwise) and build the teacher on it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
teacher = TeacherModel()
teacher = teacher.to(device)

print("Teacher model created", f"   Device: {device}", sep="\n")




Teacher model created
   Device: cpu


In [None]:
# Model Complexity Analysis
def analyze_model(model, input_size=(3, 32, 32)):
    """Analyze model size, parameter counts, and FLOPs.

    Args:
        model: The ``nn.Module`` to profile.
        input_size: Input shape without the batch dimension, forwarded to
            ``get_model_complexity_info``.

    Returns:
        dict with keys ``'params'`` (all parameters), ``'trainable_params'``
        (parameters with ``requires_grad``), ``'flops'`` (2 x the MAC count
        reported by the profiler) and ``'size_mb'`` (bytes of parameters plus
        buffers, expressed in MiB).

    Raises:
        RuntimeError: If the profiler does not report a MAC count.
    """
    # Remember the train/eval mode so profiling does not leak a mode change
    # to the caller (the profiler may switch the module's mode).
    was_training = model.training

    # Materialise the parameter list once instead of walking the module three times
    parameters = list(model.parameters())
    total_params = sum(p.numel() for p in parameters)
    trainable_params = sum(p.numel() for p in parameters if p.requires_grad)

    # Get MACs; the profiler's own parameter count is not needed here
    try:
        macs, _ = get_model_complexity_info(
            model, input_size, as_strings=False,
            print_per_layer_stat=False, verbose=False
        )
    finally:
        if model.training != was_training:
            model.train(was_training)

    if macs is None:
        raise RuntimeError(
            "get_model_complexity_info did not return a MAC count for this model"
        )
    flops = 2 * macs  # MACs to FLOPs approximation

    # In-memory footprint: parameters plus registered buffers
    param_size = sum(p.nelement() * p.element_size() for p in parameters)
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    size_mb = (param_size + buffer_size) / (1024**2)

    return {
        'params': total_params,
        'trainable_params': trainable_params,
        'flops': flops,
        'size_mb': size_mb
    }

# Profile the untrained teacher; these numbers are the reference for later comparisons
baseline_stats = analyze_model(teacher)

print(
    "\n Baseline Model Stats:",
    f"   Parameters: {baseline_stats['params']:,}",
    f"   FLOPs: {baseline_stats['flops']:,}",
    f"   Size: {baseline_stats['size_mb']:.2f} MB",
    sep="\n",
)


 Baseline Model Stats:
   Parameters: 11,173,962
   FLOPs: 1,115,564,052
   Size: 42.66 MB


Training Utilities

In [None]:
# Training  and evaluation functions

def train_model(model, train_loader, criterion, optimizer, device, epochs=10, scheduler=None):
    """Standard supervised training loop.

    Args:
        model: Module to train; switched to training mode on entry.
        train_loader: Iterable of ``(inputs, labels)`` batches. ``len()`` is
            only used for the progress message.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.
        epochs: Number of passes over ``train_loader``.
        scheduler: Optional learning-rate scheduler. When given, its
            ``step()`` is called once at the end of every epoch. Omitting it
            keeps the learning rate untouched, as before.

    Returns:
        None. Progress is printed every 100 steps.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0  # loss accumulated since the last progress print
        correct = 0         # correct predictions so far in this epoch
        total = 0           # samples seen so far in this epoch

        for step, (inputs, labels) in enumerate(train_loader, start=1):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            if step % 100 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Step [{step}/{len(train_loader)}], '
                      f'Loss: {running_loss/100:.3f}, Acc: {100.*correct/total:.2f}%')
                running_loss = 0.0

        # Advance the LR schedule once per epoch, after the optimizer updates
        if scheduler is not None:
            scheduler.step()

def evaluate_model(model, test_loader, device):
    """Compute top-1 accuracy, in percent, of `model` over `test_loader`.

    The model is put in eval mode and gradients are disabled for the pass.
    Each batch is moved to `device` before the forward call.
    """
    model.eval()
    hits, seen = 0, 0

    with torch.no_grad():
        for batch, targets in test_loader:
            batch = batch.to(device)
            targets = targets.to(device)
            # Index of the largest logit is the predicted class
            guesses = model(batch).argmax(dim=1)
            seen += targets.size(0)
            hits += (guesses == targets).sum().item()

    return 100. * hits / seen

def benchmark_latency(model, input_size=(1, 3, 32, 32), device='cpu', num_runs=100):
    """Measure average single-forward inference latency in milliseconds.

    Args:
        model: Module to benchmark. It is switched to eval mode and moved to
            ``device``.
        input_size: Full shape (including batch) of the random input tensor.
        device: Device string (``'cpu'``, ``'cuda'``, ``'cuda:0'``, ...) or a
            ``torch.device``.
        num_runs: Number of timed forward passes; must be positive.

    Returns:
        Mean wall-clock time per forward pass, in milliseconds.

    Raises:
        ValueError: If ``num_runs`` is not positive.
    """
    if num_runs <= 0:
        raise ValueError("num_runs must be a positive integer")

    # Normalise the device so 'cuda', 'cuda:0' and torch.device objects are
    # all recognised as CUDA when deciding whether to synchronise.
    target = torch.device(device)
    on_cuda = target.type == 'cuda'

    model.eval()
    model = model.to(target)
    dummy_input = torch.randn(input_size).to(target)

    # Inference only: without no_grad every forward would also record an
    # autograd graph, which is not part of deployment latency.
    with torch.no_grad():
        # Warmup
        for _ in range(10):
            _ = model(dummy_input)

        # Make sure queued warmup kernels are finished before the clock starts
        if on_cuda:
            torch.cuda.synchronize(target)

        # perf_counter is the monotonic, high-resolution clock meant for timing
        start = time.perf_counter()
        for _ in range(num_runs):
            _ = model(dummy_input)
            if on_cuda:
                torch.cuda.synchronize(target)
        elapsed = time.perf_counter() - start

    avg_latency_ms = elapsed / num_runs * 1000
    return avg_latency_ms

# Confirmation message printed once the helper definitions above have executed
print(" Training utilities defined")

 Training utilities defined


Train Teacher Model

In [None]:

print("\n🎓 Training teacher model...")

# Train teacher (reduce epochs for demo - use 20+ for production)
TEACHER_EPOCHS = 10  # Set to 20-50 for better results

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(teacher.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
# Tie the cosine schedule length to the number of epochs actually trained so
# the learning rate completes its decay by the last epoch.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TEACHER_EPOCHS)

# Train one epoch at a time so the scheduler can be stepped between epochs;
# a scheduler that is created but never stepped leaves the LR constant.
for teacher_epoch in range(TEACHER_EPOCHS):
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Teacher epoch {teacher_epoch + 1}/{TEACHER_EPOCHS} (lr={current_lr:.4f})")
    train_model(teacher, train_loader, criterion, optimizer, device, epochs=1)
    scheduler.step()

# Evaluate teacher
teacher_accuracy = evaluate_model(teacher, test_loader, device)
teacher_latency = benchmark_latency(teacher, device=str(device))

print(f"\nTeacher Model Results:")
print(f"   Accuracy: {teacher_accuracy:.2f}%")
print(f"   Latency: {teacher_latency:.2f} ms")
print(f"   Size: {baseline_stats['size_mb']:.2f} MB")

# Save teacher model
torch.save(teacher.state_dict(), 'teacher_model.pth')


üéì Training teacher model...
Epoch [1/10], Step [100/391], Loss: 2.763, Acc: 15.17%
Epoch [1/10], Step [200/391], Loss: 1.939, Acc: 21.03%


Knowledge Distillation

In [None]:
# ============================================================================
# STUDENT MODEL (Lightweight)
# ============================================================================
class StudentModel(nn.Module):
    """Compact convolutional student network.

    Layout: two double-conv stages (32 then 64 channels), each followed by
    2x2 max-pooling, then a single 128-channel conv stage with global average
    pooling, and finally a linear classifier over the 128 pooled features.
    """

    @staticmethod
    def _conv_unit(in_channels, out_channels):
        """Return the [3x3 conv, batch-norm, ReLU] layers for one conv unit."""
        return [
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]

    def __init__(self, num_classes=10):
        super().__init__()

        layers = []

        # Stage 1: 3 -> 32 -> 32 channels, then halve the spatial size (32x32 -> 16x16)
        layers += self._conv_unit(3, 32)
        layers += self._conv_unit(32, 32)
        layers.append(nn.MaxPool2d(2, 2))

        # Stage 2: 32 -> 64 -> 64 channels, then halve again (16x16 -> 8x8)
        layers += self._conv_unit(32, 64)
        layers += self._conv_unit(64, 64)
        layers.append(nn.MaxPool2d(2, 2))

        # Stage 3: 64 -> 128 channels, collapsed to one value per channel
        layers += self._conv_unit(64, 128)
        layers.append(nn.AdaptiveAvgPool2d(1))

        self.features = nn.Sequential(*layers)
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits for the image batch `x`."""
        pooled = self.features(x)
        batch_size = pooled.size(0)
        flat = pooled.view(batch_size, -1)
        return self.classifier(flat)

# Build the student on the same device as the teacher and profile it
student = StudentModel().to(device)
student_stats = analyze_model(student)

# Teacher-to-student reduction factors
param_ratio = baseline_stats['params'] / student_stats['params']
flops_ratio = baseline_stats['flops'] / student_stats['flops']
size_ratio = baseline_stats['size_mb'] / student_stats['size_mb']

print(
    "\n Student Model Stats:",
    f"   Parameters: {student_stats['params']:,} ({param_ratio:.1f}x smaller)",
    f"   FLOPs: {student_stats['flops']:,} ({flops_ratio:.1f}x fewer)",
    f"   Size: {student_stats['size_mb']:.2f} MB ({size_ratio:.1f}x smaller)",
    sep="\n",
)

# ============================================================================
# KNOWLEDGE DISTILLATION TRAINING
# ============================================================================
class DistillationLoss(nn.Module):
    """Weighted sum of a soft-target (teacher) term and a hard-label term.

    loss = alpha * T^2 * KL(softmax(teacher / T) || softmax(student / T))
           + (1 - alpha) * CrossEntropy(student, labels)
    """

    def __init__(self, temperature=4.0, alpha=0.7):
        super().__init__()
        self.alpha = alpha
        self.temperature = temperature
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, student_logits, teacher_logits, labels):
        """Return the combined scalar loss for one batch."""
        temp = self.temperature

        # Soft targets: temperature-scaled student log-probs vs teacher probs
        student_log_probs = F.log_softmax(student_logits / temp, dim=1)
        teacher_probs = F.softmax(teacher_logits / temp, dim=1)
        soft_term = F.kl_div(
            student_log_probs, teacher_probs, reduction='batchmean'
        ) * (temp ** 2)

        # Hard targets: cross-entropy on the unscaled student logits
        hard_term = self.criterion(student_logits, labels)

        return self.alpha * soft_term + (1 - self.alpha) * hard_term

def train_with_distillation(student, teacher, train_loader, device, epochs=20,
                            temperature=4.0, alpha=0.7, lr=0.1):
    """Train `student` to match `teacher` using knowledge distillation.

    Args:
        student: Module being trained (put in training mode).
        teacher: Module providing soft targets (put in eval mode; its forward
            pass runs without gradients).
        train_loader: Iterable of ``(inputs, labels)`` batches.
        device: Device the batches are moved to. Both models are expected to
            already live on it.
        epochs: Number of passes over ``train_loader``; also the length of the
            cosine learning-rate schedule.
        temperature: Softmax temperature passed to ``DistillationLoss``.
        alpha: Weight of the soft-target term passed to ``DistillationLoss``.
        lr: Initial SGD learning rate.

    Returns:
        None. Progress is printed every 100 steps.
    """
    teacher.eval()  # Teacher in eval mode
    student.train()

    criterion = DistillationLoss(temperature=temperature, alpha=alpha)
    optimizer = torch.optim.SGD(student.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    for epoch in range(epochs):
        running_loss = 0.0  # loss accumulated since the last progress print
        correct = 0         # correct student predictions so far in this epoch
        total = 0           # samples seen so far in this epoch

        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # Get teacher predictions (no gradient flows into the teacher)
            with torch.no_grad():
                teacher_logits = teacher(inputs)

            # Get student predictions
            student_logits = student(inputs)

            # Compute distillation loss
            loss = criterion(student_logits, teacher_logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = student_logits.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(train_loader)}], '
                      f'Loss: {running_loss/100:.3f}, Acc: {100.*correct/total:.2f}%')
                running_loss = 0.0

        # One scheduler step per epoch
        scheduler.step()

print("\n Training student with knowledge distillation...")
DISTILL_EPOCHS = 15  # Set to 30-50 for production
train_with_distillation(student, teacher, train_loader, device, epochs=DISTILL_EPOCHS)

# Measure the distilled student with the same helpers used for the teacher
student_accuracy = evaluate_model(student, test_loader, device)
student_latency = benchmark_latency(student, device=str(device))

# Student-vs-teacher comparison figures
distill_accuracy_drop = teacher_accuracy - student_accuracy
distill_speedup = teacher_latency / student_latency
distill_size_ratio = baseline_stats['size_mb'] / student_stats['size_mb']

print(
    "\n Distilled Student Results:",
    f"   Accuracy: {student_accuracy:.2f}% (vs Teacher: {teacher_accuracy:.2f}%)",
    f"   Accuracy drop: {distill_accuracy_drop:.2f}%",
    f"   Latency: {student_latency:.2f} ms ({distill_speedup:.1f}x faster)",
    f"   Size: {student_stats['size_mb']:.2f} MB ({distill_size_ratio:.1f}x smaller)",
    sep="\n",
)

# Persist the distilled weights
torch.save(student.state_dict(), 'student_distilled.pth')

Quantization

In [None]:
# ============================================================================
# POST-TRAINING QUANTIZATION
# ============================================================================
print("\n Applying quantization...")

# Quantize a CPU *copy* of the student. `Module.cpu()` moves a module in place,
# so calling it on `student` directly would silently relocate the model that
# other cells still use on `device`.
student_cpu = copy.deepcopy(student).cpu()

# Dynamic quantization (easiest, good for CPU).
# Only nn.Linear is listed: dynamic quantization has no nn.Conv2d replacement,
# so naming Conv2d here would leave the convolutions in float32 anyway.
# NOTE(review): to actually quantize the conv layers, static post-training
# quantization with a calibration pass would be needed.
student_quantized = torch.quantization.quantize_dynamic(
    student_cpu, {nn.Linear}, dtype=torch.qint8
)

# Evaluate quantized model
student_quant_accuracy = evaluate_model(student_quantized, test_loader, 'cpu')
student_quant_latency = benchmark_latency(student_quantized, device='cpu')

# Calculate quantized model size from the serialized state dict on disk
torch.save(student_quantized.state_dict(), 'student_quantized.pth')
quant_size_mb = os.path.getsize('student_quantized.pth') / (1024**2)

# NOTE(review): `student_latency` was measured on `device`, while the quantized
# model is timed on CPU; the speed-up below is only like-for-like on a CPU runtime.
print(f"\nQuantized Student Results:")
print(f"   Accuracy: {student_quant_accuracy:.2f}% (drop: {student_accuracy - student_quant_accuracy:.2f}%)")
print(f"   Latency (CPU): {student_quant_latency:.2f} ms ({student_latency/student_quant_latency:.1f}x faster)")
print(f"   Size: {quant_size_mb:.2f} MB ({student_stats['size_mb']/quant_size_mb:.1f}x smaller)")

Export to ONNX and TorchScript

In [None]:
# ============================================================================
# EXPORT TO DEPLOYMENT FORMATS
# ============================================================================
print("\n Exporting models for deployment...")

# Export from the CPU so the model and the example input are guaranteed to be
# on the same device, whatever device earlier cells left the student on.
student.cpu()
student.eval()
dummy_input = torch.randn(1, 3, 32, 32)

# 1. Export to ONNX
# NOTE(review): recent PyTorch releases (including the 2.9 this notebook was
# run with) are understood to default to the dynamo-based exporter, which needs
# the extra `onnxscript` package; `dynamo=False` keeps the TorchScript-based
# exporter that the opset-11 / `dynamic_axes` arguments below were written for.
torch.onnx.export(
    student,
    dummy_input,
    "student_model.onnx",
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['output'],
    # Leave the batch dimension symbolic so any batch size can be served
    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
    dynamo=False,
)

print("Exported to ONNX: student_model.onnx")

# Verify ONNX model
onnx_model = onnx.load("student_model.onnx")
onnx.checker.check_model(onnx_model)
print("   ONNX model verified")

# 2. Export to TorchScript (for mobile)
scripted_model = torch.jit.script(student)
scripted_model.save("student_model.pt")
print("Exported to TorchScript: student_model.pt")

# 3. Benchmark ONNX Runtime (explicit CPU provider keeps the session portable)
ort_session = onnxruntime.InferenceSession(
    "student_model.onnx", providers=['CPUExecutionProvider']
)

def benchmark_onnx(session, num_runs=100, input_shape=(1, 3, 32, 32)):
    """Measure average ONNX Runtime inference latency in milliseconds.

    Args:
        session: Object exposing ``get_inputs()`` and ``run(output_names, feed)``
            (an ``onnxruntime.InferenceSession``).
        num_runs: Number of timed runs; must be positive.
        input_shape: Shape of the random float32 input fed to the model.

    Returns:
        Mean wall-clock time per ``run`` call, in milliseconds.

    Raises:
        ValueError: If ``num_runs`` is not positive.
    """
    if num_runs <= 0:
        raise ValueError("num_runs must be a positive integer")

    # Ask the session for its input name instead of assuming it is 'input'
    input_name = session.get_inputs()[0].name
    dummy = np.random.randn(*input_shape).astype(np.float32)
    feed = {input_name: dummy}

    # Warmup
    for _ in range(10):
        _ = session.run(None, feed)

    # perf_counter is the monotonic, high-resolution clock meant for timing
    start = time.perf_counter()
    for _ in range(num_runs):
        _ = session.run(None, feed)
    elapsed = time.perf_counter() - start

    return elapsed / num_runs * 1000

# Time the exported ONNX model through ONNX Runtime
onnx_latency = benchmark_onnx(ort_session)
print(f"   ONNX Runtime latency: {onnx_latency:.2f} ms")

# List the files written by the export and quantization cells
print(
    "\nExported Files:",
    "    student_model.onnx (for mobile/edge)",
    "    student_model.pt (TorchScript)",
    "    student_quantized.pth (quantized weights)",
    sep="\n",
)

Comprehensive Benchmarking Dashboard

In [None]:
# ============================================================================
# RESULTS COMPILATION
# ============================================================================

# The three student variants share one architecture, so parameter and FLOP
# counts are reused; the ONNX row also reuses the student's accuracy and size.
student_params_m = student_stats['params'] / 1e6
student_flops_m = student_stats['flops'] / 1e6

summary_columns = ['Model', 'Accuracy (%)', 'Latency (ms)', 'Parameters (M)', 'Size (MB)', 'FLOPs (M)']
summary_rows = [
    ('Teacher (ResNet18)', teacher_accuracy, teacher_latency,
     baseline_stats['params'] / 1e6, baseline_stats['size_mb'], baseline_stats['flops'] / 1e6),
    ('Student (Distilled)', student_accuracy, student_latency,
     student_params_m, student_stats['size_mb'], student_flops_m),
    ('Student (Quantized)', student_quant_accuracy, student_quant_latency,
     student_params_m, quant_size_mb, student_flops_m),
    ('ONNX Runtime', student_accuracy, onnx_latency,
     student_params_m, student_stats['size_mb'], student_flops_m),
]

# Column-oriented dict, one list per column, in the column order above
results = {
    column: [row[position] for row in summary_rows]
    for position, column in enumerate(summary_columns)
}

df = pd.DataFrame(results)

# Calculate improvements relative to the first row (the teacher)
teacher_latency_ms = df['Latency (ms)'].iat[0]
teacher_size_mb = df['Size (MB)'].iat[0]
teacher_accuracy_pct = df['Accuracy (%)'].iat[0]
df['Speedup'] = teacher_latency_ms / df['Latency (ms)']
df['Size Reduction'] = teacher_size_mb / df['Size (MB)']
df['Accuracy Drop'] = teacher_accuracy_pct - df['Accuracy (%)']

rule = "=" * 80
print("\n" + rule)
print("OPTIMIZATION RESULTS SUMMARY")
print(rule)
print(df.to_string(index=False))
print(rule)

# ============================================================================
# VISUALIZATION DASHBOARD
# ============================================================================

# 2x2 grid: three bar charts plus one scatter for the trade-off view
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Accuracy Comparison', 'Latency Comparison',
                    'Model Size Comparison', 'Speedup vs Accuracy Trade-off'),
    specs=[[{'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'bar'}, {'type': 'scatter'}]]
)

# (column to plot, trace name, bar colour, grid row, grid column)
bar_panels = [
    ('Accuracy (%)', 'Accuracy', 'lightblue', 1, 1),
    ('Latency (ms)', 'Latency', 'lightcoral', 1, 2),
    ('Size (MB)', 'Size', 'lightgreen', 2, 1),
]
for metric, trace_name, colour, grid_row, grid_col in bar_panels:
    fig.add_trace(
        go.Bar(x=df['Model'], y=df[metric], name=trace_name,
               marker_color=colour, text=df[metric].round(2),
               textposition='outside'),
        row=grid_row, col=grid_col
    )

# Speedup against accuracy drop, one labelled marker per model
fig.add_trace(
    go.Scatter(x=df['Accuracy Drop'], y=df['Speedup'], mode='markers+text',
               name='Models', marker=dict(size=15, color=df.index),
               text=df['Model'], textposition='top center'),
    row=2, col=2
)

# Axis titles keyed by (row, col) -> (x title, y title)
axis_titles = {
    (1, 1): ("Model", "Accuracy (%)"),
    (1, 2): ("Model", "Latency (ms)"),
    (2, 1): ("Model", "Size (MB)"),
    (2, 2): ("Accuracy Drop (%)", "Speedup (x)"),
}
for (grid_row, grid_col), (x_title, y_title) in axis_titles.items():
    fig.update_xaxes(title_text=x_title, row=grid_row, col=grid_col)
    fig.update_yaxes(title_text=y_title, row=grid_row, col=grid_col)

fig.update_layout(height=800, title_text="Edge ML Optimization Dashboard", showlegend=False)
fig.show()

# Persist the summary table next to the notebook
RESULTS_CSV = 'optimization_results.csv'
df.to_csv(RESULTS_CSV, index=False)
print(f"\n Results saved to: {RESULTS_CSV}")