<a href="https://colab.research.google.com/github/ShreyashN16/AI-Inference-Engine-with-CPU-vs-GPU-Optimization/blob/main/AI_Inference_Engine_with_CPU_vs_GPU_Optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ================================
# INSTALL DEPENDENCIES
# ================================

!pip install torch torchvision gradio psutil --quiet

# ================================
# IMPORT LIBRARIES
# ================================

import torch
import torchvision.models as models
import time
import numpy as np
import psutil
import gradio as gr
import gc

# ================================
# DEVICE CHECK
# ================================

# True when PyTorch reports a usable CUDA device.
GPU_AVAILABLE = torch.cuda.is_available()

# Label shown in the console and in the GUI header: the name of CUDA
# device 0, or a fixed placeholder when CUDA is unavailable.
GPU_NAME = (
    torch.cuda.get_device_name(0) if GPU_AVAILABLE else "No GPU Available"
)

print("GPU Status:", GPU_NAME)


# ================================
# LOAD MODEL FUNCTION
# ================================

def load_model(device):
    """Return a ResNet-50 in eval mode, placed on ``device``.

    The network is created with torchvision's ``ResNet50_Weights.DEFAULT``
    weights, switched to evaluation mode and then moved to ``device``.
    """
    default_weights = models.ResNet50_Weights.DEFAULT
    # ``eval()`` and ``to()`` both return the module itself, so they chain.
    return models.resnet50(weights=default_weights).eval().to(device)


# ================================
# MEMORY MEASUREMENT FUNCTIONS
# ================================

def get_cpu_memory():
    """Return the resident set size (RSS) of the current process in MiB."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / 1024**2


def get_gpu_memory():
    """Return the CUDA memory currently allocated by PyTorch, in MiB.

    Returns ``0`` when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return 0

    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1024**2


# ================================
# BENCHMARK FUNCTION
# ================================

def benchmark(device, batch_size, iterations):
    """Time ResNet-50 forward passes on ``device`` and measure memory growth.

    Args:
        device: Device string understood by ``torch.device`` (for example
            ``"cpu"`` or ``"cuda"``).
        batch_size: Number of random ``3 x 224 x 224`` inputs per forward
            pass. Coerced to ``int`` because UI sliders may deliver floats.
        iterations: Number of timed forward passes. Coerced to ``int``.

    Returns:
        A tuple ``(latency, throughput, cpu_memory_used, gpu_memory_used)``:

        * ``latency`` - mean time per forward pass (one batch), in ms.
        * ``throughput`` - forward passes (batches) per second.
        * ``cpu_memory_used`` - growth of the process RSS in MiB between
          the start of the call (before the model is loaded) and the end
          of the timed loop. RSS is an OS-level figure and may not shrink
          between runs, so repeated calls can report smaller deltas.
        * ``gpu_memory_used`` - peak CUDA memory allocated by PyTorch during
          the call minus the amount allocated at the start, in MiB;
          ``0.0`` when the benchmark does not run on a CUDA device.

    Raises:
        ValueError: If ``batch_size`` or ``iterations`` is smaller than 1.
    """
    batch_size = int(batch_size)
    iterations = int(iterations)
    if batch_size < 1 or iterations < 1:
        raise ValueError("batch_size and iterations must be >= 1")

    warmup_runs = 10

    device_obj = torch.device(device)
    # Compare on the parsed device type so "cuda:0" is treated like "cuda".
    use_cuda = device_obj.type == "cuda"

    # Start from a clean slate so the baselines are not inflated by garbage
    # left over from a previous run.
    gc.collect()
    if use_cuda:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats(device_obj)

    # Baselines are taken BEFORE the model and input exist; taking them
    # after warmup (as a steady-state delta) reports roughly zero usage.
    cpu_mem_before = get_cpu_memory()
    gpu_mem_before = (
        torch.cuda.memory_allocated(device_obj) / 1024**2 if use_cuda else 0.0
    )

    model = None
    input_tensor = None
    try:
        model = load_model(device_obj)
        input_tensor = torch.randn(
            batch_size, 3, 224, 224, device=device_obj
        )

        with torch.no_grad():
            # Warmup: lets lazy initialisation / kernel selection happen
            # outside the timed region.
            for _ in range(warmup_runs):
                model(input_tensor)

            # CUDA kernels launch asynchronously; wait for them so warmup
            # work does not leak into the measurement.
            if use_cuda:
                torch.cuda.synchronize(device_obj)

            # perf_counter is monotonic and high resolution, unlike
            # time.time(), which can jump when the system clock changes.
            start_time = time.perf_counter()

            for _ in range(iterations):
                model(input_tensor)

            if use_cuda:
                torch.cuda.synchronize(device_obj)

            total_time = time.perf_counter() - start_time

        cpu_mem_after = get_cpu_memory()
        if use_cuda:
            gpu_peak = torch.cuda.max_memory_allocated(device_obj) / 1024**2
            gpu_memory_used = gpu_peak - gpu_mem_before
        else:
            gpu_memory_used = 0.0
    finally:
        # Release the model and input even when a forward pass fails
        # (e.g. out of memory), so the next run does not inherit them.
        model = None
        input_tensor = None
        gc.collect()
        if use_cuda:
            torch.cuda.empty_cache()

    latency = (total_time / iterations) * 1000
    throughput = iterations / total_time
    cpu_memory_used = cpu_mem_after - cpu_mem_before

    return latency, throughput, cpu_memory_used, gpu_memory_used


# ================================
# GUI FUNCTION
# ================================

def run_inference(device, batch_size, iterations):
    """Run one benchmark for the GUI and format the result as text.

    Args:
        device: ``"GPU"`` to benchmark on CUDA; any other value selects
            the CPU.
        batch_size: Batch size forwarded to ``benchmark``.
        iterations: Number of timed iterations forwarded to ``benchmark``.

    Returns:
        A single string: either the formatted metrics or the message
        ``"GPU not available"``. The function always returns exactly one
        value because it is wired to a single output component.
    """
    if device == "GPU" and not torch.cuda.is_available():
        # One string, not a 4-tuple: there is only one output textbox.
        return "GPU not available"

    device_name = "cuda" if device == "GPU" else "cpu"

    # Slider values may arrive as floats; the benchmark needs integers.
    latency, throughput, cpu_mem, gpu_mem = benchmark(
        device_name,
        int(batch_size),
        int(iterations),
    )

    result = f"""
Device: {device}

Latency: {latency:.2f} ms

Throughput: {throughput:.2f} inferences/sec

CPU Memory Used: {cpu_mem:.2f} MB

GPU Memory Used: {gpu_mem:.2f} MB
"""

    return result


# ================================
# CPU vs GPU COMPARISON FUNCTION
# ================================

def compare_devices(batch_size, iterations):
    """Benchmark CPU and GPU with the same settings and format a comparison.

    Args:
        batch_size: Batch size used for both runs.
        iterations: Number of timed iterations used for both runs.

    Returns:
        A string with latency, throughput, memory and the CPU/GPU latency
        ratio, or ``"GPU not available"`` when CUDA is not available.
    """
    # Check first: without a GPU there is nothing to compare, so do not
    # spend time on a CPU benchmark whose result would be thrown away.
    if not torch.cuda.is_available():
        return "GPU not available"

    cpu_latency, cpu_throughput, cpu_mem, _ = benchmark(
        "cpu", batch_size, iterations
    )

    gpu_latency, gpu_throughput, _, gpu_mem = benchmark(
        "cuda", batch_size, iterations
    )

    comparison = f"""
CPU vs GPU Comparison

CPU Latency: {cpu_latency:.2f} ms
GPU Latency: {gpu_latency:.2f} ms

CPU Throughput: {cpu_throughput:.2f} inf/sec
GPU Throughput: {gpu_throughput:.2f} inf/sec

CPU Memory: {cpu_mem:.2f} MB
GPU Memory: {gpu_mem:.2f} MB

Speedup: {cpu_latency/gpu_latency:.2f}x faster on GPU
"""

    return comparison


# ================================
# BUILD GUI
# ================================

# Build the Gradio interface. Components are created in the order they
# should appear; both buttons write their result into the same textbox.
with gr.Blocks() as app:

    gr.Markdown("# AI Inference Engine: CPU vs GPU Benchmark")

    # GPU_NAME is computed once at start-up by the device check above.
    gr.Markdown(f"GPU Status: {GPU_NAME}")

    # Device used by "Run Benchmark"; the comparison button ignores it.
    device_dropdown = gr.Dropdown(
        ["CPU", "GPU"],
        value="CPU",
        label="Select Device"
    )

    # Batch size passed to the benchmark (1-64, default 1).
    batch_slider = gr.Slider(
        minimum=1,
        maximum=64,
        value=1,
        step=1,
        label="Batch Size"
    )

    # Number of timed iterations (10-500 in steps of 10, default 100).
    iteration_slider = gr.Slider(
        minimum=10,
        maximum=500,
        value=100,
        step=10,
        label="Iterations"
    )

    run_button = gr.Button("Run Benchmark")

    compare_button = gr.Button("Compare CPU vs GPU")

    # Single shared output area for both actions.
    output_box = gr.Textbox(
        label="Results",
        lines=15
    )

    # Single-device benchmark: device, batch size, iterations -> text.
    run_button.click(
        run_inference,
        inputs=[
            device_dropdown,
            batch_slider,
            iteration_slider
        ],
        outputs=output_box
    )

    # CPU-vs-GPU comparison: batch size, iterations -> text.
    compare_button.click(
        compare_devices,
        inputs=[
            batch_slider,
            iteration_slider
        ],
        outputs=output_box
    )


# ================================
# LAUNCH GUI
# ================================

# Start the Gradio server with default settings.
app.launch()

GPU Status: No GPU Available
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d7ff991e071bc5bb1e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


