In [1]:
import torch
import torchvision
from PIL import Image
import numpy as np

In [2]:
# We will load ResNet 18 in this video
resnet = torchvision.models.resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 172MB/s]


In [3]:
# We will download an example image from PyTorhc
import urllib
url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") # Notebook Link will be in description
urllib.request.urlretrieve(url, filename)

('dog.jpg', <http.client.HTTPMessage at 0x7b5398f45e40>)

In [4]:
from torchvision import transforms
inp_image = Image.open(filename)

In [5]:
preprocess = transforms.Compose([
                                 transforms.Resize(256),
                                 transforms.CenterCrop(224),
                                 transforms.ToTensor(),
                                 transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [6]:
input_tensor = preprocess(inp_image)
inp_batch = input_tensor.unsqueeze(0)

In [7]:
# I am not using a GPU here, if you are, move it to cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inp_batch.to(device)
resnet.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [9]:
# Check if CUDA is available and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the device (if not already done)
resnet.to(device)

# Move the input tensor to the same device
inp_batch = inp_batch.to(device)

# Run inference
with torch.no_grad():
    output = resnet(inp_batch)

print(output[0])


tensor([-5.7528e-01, -5.2365e-01, -5.9628e-01, -1.5868e+00, -8.1130e-01,
        -2.5077e-01, -5.4371e-01,  4.9039e-01,  3.4190e-01, -6.8873e-01,
        -1.1116e+00, -1.0390e+00, -4.1218e-01, -1.0470e+00, -1.2576e+00,
        -7.3200e-01, -8.5439e-01, -3.2121e-01, -6.5513e-01, -6.1870e-01,
        -1.6483e+00, -7.3418e-01, -1.6252e+00,  1.6268e-01, -9.8598e-01,
        -1.2476e+00, -9.5957e-01, -1.2001e+00, -9.1442e-01, -3.1844e-01,
        -9.0930e-01, -8.9713e-01, -5.4904e-01, -5.5394e-01, -3.5682e-01,
        -5.3780e-01,  5.3506e-01, -7.7740e-01, -6.0174e-01, -7.9070e-02,
        -7.8334e-01, -1.0649e+00, -1.2019e+00, -5.5904e-01, -8.1106e-01,
        -5.4282e-01, -8.6169e-01, -5.3682e-01, -1.3275e+00, -1.2748e+00,
        -6.0428e-01,  4.9281e-01, -4.4329e-01, -7.2886e-01, -4.1746e-01,
        -1.3161e+00, -5.1891e-01, -1.5703e+00, -7.9879e-01, -7.2835e-01,
         6.1612e-01,  6.1869e-02, -2.7226e-01,  6.1549e-02, -8.8719e-01,
        -3.8378e-01, -4.2725e-01, -5.2556e-01, -9.6

In [11]:
!pip install onnx


Collecting onnx
  Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.17.0


In [12]:
# Output of shape 1000, confidence scores for each of the imagenet classes
# Now we will save this model.
import torch.onnx
torch.onnx.export(resnet,
                  inp_batch,
                  "resnet18.onnx",
                  export_params=True,
                  opset_version=10)

In [None]:
# Now our model is saved to onnx format.
# I will cover loading onnx models in a later tutorial

In [13]:
!pip install onnx onnxruntime


Collecting onnxruntime
  Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pack

In [14]:
import onnx
import onnxruntime as ort
import numpy as np

# Load the ONNX model
onnx_model = onnx.load("resnet18.onnx")

# Check the model
onnx.checker.check_model(onnx_model)

# Create an ONNX Runtime session
ort_session = ort.InferenceSession("resnet18.onnx")

# Prepare the input tensor (make sure it matches the input shape expected by the model)
# Example: Assuming input shape is (1, 3, 224, 224) for a single image
# You may need to preprocess your image data accordingly.
inp_batch = np.random.rand(1, 3, 224, 224).astype(np.float32)  # Replace with your actual input

# Run inference
ort_inputs = {ort_session.get_inputs()[0].name: inp_batch}
ort_outs = ort_session.run(None, ort_inputs)

# Print the output
print(ort_outs[0])  # Output will have shape (1, 1000) for ImageNet classes


[[-1.74610233e+00  7.52400279e-01  1.44568992e+00  1.20903885e+00
   1.00830436e+00 -3.72980177e-01  9.35472846e-01 -2.54331440e-01
  -1.69022393e+00 -3.49500120e-01  9.76200104e-01  2.09227777e+00
   1.27292275e+00  2.64228868e+00  2.52153134e+00  1.04383612e+00
   1.37361670e+00  1.27904058e-01  1.15513086e+00  1.56909895e+00
   5.06293058e-01  1.85575235e+00  1.63468373e+00  1.46408570e+00
   2.09236413e-01  5.63738048e-01  1.05344415e+00  1.18055689e+00
   3.33715796e-01 -1.03683901e+00 -4.15844560e-01  1.36346972e+00
  -4.00536209e-01  1.04197407e+00  2.36583853e+00 -4.95091438e-01
  -4.94675279e-01 -1.51020885e-01  1.66281486e+00  3.23169619e-01
   2.55100393e+00  1.24703646e+00  2.07989788e+00  4.45683539e-01
   1.82828331e+00  4.75498408e-01  2.23872757e+00 -3.40613127e-01
   4.19860482e-02  2.60864735e-01  1.44900918e+00 -1.46857440e+00
   1.97769988e+00  1.72681808e+00  7.83777893e-01  9.69367027e-01
   6.57507777e-01  4.67452943e-01  1.71592116e+00  2.08936405e+00
   1.55284

In [15]:
from PIL import Image
import torchvision.transforms as transforms

# Load and preprocess an image
image = Image.open("/content/dog.jpg")
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),  # Converts to (C, H, W) format
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalization
])

# Apply the transformations
inp_batch = transform(image).unsqueeze(0).numpy()  # Add batch dimension


In [17]:
import torch

# Assuming inp_batch is already prepared as a NumPy array, convert it to a PyTorch tensor
inp_batch_tensor = torch.from_numpy(inp_batch)

# Move the tensor to the same device as the model (e.g., GPU if using)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inp_batch_tensor = inp_batch_tensor.to(device)

# Ensure the model is also on the same device
resnet.to(device)

# Run inference with the PyTorch model
with torch.no_grad():
    pytorch_output = resnet(inp_batch_tensor)

# Convert output to NumPy array for comparison
pytorch_output_np = pytorch_output.cpu().numpy()  # Move back to CPU and convert to NumPy



In [18]:
import onnx
import onnxruntime as ort
import numpy as np

# Load the ONNX model
onnx_model = onnx.load("resnet18.onnx")
onnx.checker.check_model(onnx_model)
ort_session = ort.InferenceSession("resnet18.onnx")

# Ensure the input for ONNX is also prepared correctly (already as NumPy array)
ort_inputs = {ort_session.get_inputs()[0].name: inp_batch}  # Ensure this is a NumPy array
onnx_output = ort_session.run(None, ort_inputs)[0]  # Get the output


In [19]:
# Compare the outputs
difference = np.abs(pytorch_output_np - onnx_output)
max_difference = np.max(difference)
mean_difference = np.mean(difference)

print(f'Max difference: {max_difference}')
print(f'Mean difference: {mean_difference}')

# Optionally, check if the outputs are close within a tolerance
tolerance = 1e-5
if np.all(difference < tolerance):
    print("The outputs from the PyTorch model and ONNX model are close enough!")
else:
    print("The outputs differ beyond the tolerance level.")


Max difference: 6.67572021484375e-06
Mean difference: 1.3396246458796668e-06
The outputs from the PyTorch model and ONNX model are close enough!


In [20]:
import time

# Measure inference time for the PyTorch model
start_time_pytorch = time.time()

with torch.no_grad():
    pytorch_output = resnet(inp_batch_tensor)

end_time_pytorch = time.time()
pytorch_inference_time = end_time_pytorch - start_time_pytorch

print(f'PyTorch inference time: {pytorch_inference_time:.6f} seconds')


PyTorch inference time: 0.006326 seconds


In [21]:
import torch
import numpy as np
import onnx
import onnxruntime as ort
import time

# Load the ONNX model
onnx_model = onnx.load("resnet18.onnx")
onnx.checker.check_model(onnx_model)
ort_session = ort.InferenceSession("resnet18.onnx")

# Prepare the input tensor for PyTorch (as a PyTorch tensor)
inp_batch_tensor = torch.from_numpy(inp_batch).to(device)  # Ensure it is on the right device

# Measure inference time for the PyTorch model
start_time_pytorch = time.time()
with torch.no_grad():
    pytorch_output = resnet(inp_batch_tensor)
end_time_pytorch = time.time()
pytorch_inference_time = end_time_pytorch - start_time_pytorch

# Convert output to NumPy for comparison
pytorch_output_np = pytorch_output.cpu().numpy()

# Prepare input for ONNX model (as a NumPy array)
ort_inputs = {ort_session.get_inputs()[0].name: inp_batch}

# Measure inference time for the ONNX model
start_time_onnx = time.time()
onnx_output = ort_session.run(None, ort_inputs)[0]
end_time_onnx = time.time()
onnx_inference_time = end_time_onnx - start_time_onnx

# Print the inference times
print(f'PyTorch inference time: {pytorch_inference_time:.6f} seconds')
print(f'ONNX inference time: {onnx_inference_time:.6f} seconds')


PyTorch inference time: 0.003366 seconds
ONNX inference time: 0.035591 seconds


In [24]:
import time

# Define the number of runs for averaging
num_runs = 100

# Function to measure average inference time for PyTorch
def measure_inference_time(model, input_tensor, num_runs=100):
    total_time = 0.0
    for _ in range(num_runs):
        start_time = time.time()
        with torch.no_grad():
            _ = model(input_tensor)
        total_time += time.time() - start_time
    return total_time / num_runs

# Measure average times for PyTorch
pytorch_average_time = measure_inference_time(resnet, inp_batch_tensor, num_runs)
print(f'Average PyTorch inference time over {num_runs} runs: {pytorch_average_time:.6f} seconds')




Average PyTorch inference time over 100 runs: 0.002912 seconds


In [25]:
# Function to measure average inference time for ONNX
def measure_onnx_inference_time(session, inputs, num_runs=100):
    total_time = 0.0
    for _ in range(num_runs):
        start_time = time.time()
        _ = session.run(None, inputs)
        total_time += time.time() - start_time
    return total_time / num_runs

# Measure average times for ONNX
onnx_average_time = measure_onnx_inference_time(ort_session, ort_inputs, num_runs)
print(f'Average ONNX inference time over {num_runs} runs: {onnx_average_time:.6f} seconds')

Average ONNX inference time over 100 runs: 0.043631 seconds


In [26]:
!pip install torch-tensorrt -U


Collecting torch-tensorrt
  Downloading torch_tensorrt-2.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_34_x86_64.whl.metadata (16 kB)
Collecting tensorrt==10.1.0 (from torch-tensorrt)
  Downloading tensorrt-10.1.0.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt-cu12-bindings==10.1.0 (from torch-tensorrt)
  Downloading tensorrt_cu12_bindings-10.1.0-cp310-none-manylinux_2_17_x86_64.whl.metadata (627 bytes)
Collecting tensorrt-cu12-libs==10.1.0 (from torch-tensorrt)
  Downloading tensorrt_cu12_libs-10.1.0.tar.gz (630 bytes)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tensorrt-cu12 (from tensorrt==10.1.0->torch-tensorrt)
  Downloading tensorrt-cu12-10.5.0.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading torch_tensorrt-2.4.0-cp310-cp310-manylinux_2_17_x86_64.ma

In [27]:
import torch

# Make sure the model is in evaluation mode
resnet.eval()

# Move the model to GPU (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
resnet = resnet.to(device)


In [28]:
import torch_tensorrt

# Prepare your input example (on the right device)
input_data = inp_batch_tensor  # Ensure this is a PyTorch tensor on the right device

# Convert the PyTorch model to TensorRT using Torch-TensorRT
trt_model = torch_tensorrt.compile(resnet, inputs=[torch_tensorrt.Input(input_data.shape)], enabled_precisions={torch.float})

print("Model has been successfully converted to TensorRT!")




Model has been successfully converted to TensorRT!


In [29]:
# Run inference with the TensorRT model
with torch.no_grad():
    trt_output = trt_model(input_data)

# Convert the output to CPU (if necessary) and compare with the original model
trt_output_np = trt_output.cpu().numpy()


In [30]:
# Compare TensorRT and PyTorch outputs
difference = np.abs(pytorch_output_np - trt_output_np)
max_difference = np.max(difference)
mean_difference = np.mean(difference)

print(f'Max difference between TensorRT and PyTorch outputs: {max_difference}')
print(f'Mean difference between TensorRT and PyTorch outputs: {mean_difference}')


Max difference between TensorRT and PyTorch outputs: 5.7220458984375e-06
Mean difference between TensorRT and PyTorch outputs: 1.3225153452367522e-06


In [31]:
trt_model = torch_tensorrt.compile(resnet, inputs=[torch_tensorrt.Input(input_data.shape)], enabled_precisions={torch.half})




In [32]:
import time
import torch

# Make sure the model is in evaluation mode and on the right device
resnet.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
resnet = resnet.to(device)

# Prepare input tensor (ensure it's on the right device)
inp_batch_tensor = inp_batch_tensor.to(device)

# Measure inference time for the PyTorch model
start_time_pytorch = time.time()

with torch.no_grad():
    pytorch_output = resnet(inp_batch_tensor)

end_time_pytorch = time.time()
pytorch_inference_time = end_time_pytorch - start_time_pytorch

print(f'PyTorch inference time: {pytorch_inference_time:.6f} seconds')


PyTorch inference time: 0.004895 seconds


In [33]:
import torch_tensorrt

# Convert the model to TensorRT (if you haven't done this already)
trt_model = torch_tensorrt.compile(resnet, inputs=[torch_tensorrt.Input(inp_batch_tensor.shape)], enabled_precisions={torch.float})

# Measure inference time for the TensorRT model
start_time_trt = time.time()

with torch.no_grad():
    trt_output = trt_model(inp_batch_tensor)

end_time_trt = time.time()
trt_inference_time = end_time_trt - start_time_trt

print(f'TensorRT inference time: {trt_inference_time:.6f} seconds')




TensorRT inference time: 0.000971 seconds


In [34]:
import time
import torch
import torch_tensorrt

# Ensure models are in evaluation mode and moved to the appropriate device (CUDA)
resnet.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
resnet = resnet.to(device)
inp_batch_tensor = inp_batch_tensor.to(device)

# Measure PyTorch inference time
start_time_pytorch = time.time()
with torch.no_grad():
    pytorch_output = resnet(inp_batch_tensor)
end_time_pytorch = time.time()
pytorch_inference_time = end_time_pytorch - start_time_pytorch
print(f'PyTorch inference time: {pytorch_inference_time:.6f} seconds')

# Convert PyTorch model to TensorRT model
trt_model = torch_tensorrt.compile(resnet, inputs=[torch_tensorrt.Input(inp_batch_tensor.shape)], enabled_precisions={torch.float})

# Measure TensorRT inference time
start_time_trt = time.time()
with torch.no_grad():
    trt_output = trt_model(inp_batch_tensor)
end_time_trt = time.time()
trt_inference_time = end_time_trt - start_time_trt
print(f'TensorRT inference time: {trt_inference_time:.6f} seconds')

# Compare outputs
pytorch_output_np = pytorch_output.cpu().numpy()
trt_output_np = trt_output.cpu().numpy()

max_difference = np.max(np.abs(pytorch_output_np - trt_output_np))
mean_difference = np.mean(np.abs(pytorch_output_np - trt_output_np))
print(f'Max difference between TensorRT and PyTorch outputs: {max_difference}')
print(f'Mean difference between TensorRT and PyTorch outputs: {mean_difference}')




PyTorch inference time: 0.011577 seconds




TensorRT inference time: 0.001370 seconds
Max difference between TensorRT and PyTorch outputs: 5.7220458984375e-06
Mean difference between TensorRT and PyTorch outputs: 1.3225153452367522e-06


#Inference Times:
* PyTorch Inference Time: 0.011577 seconds
* ONNX Inference Time: 0.035591 seconds
* TensorRT Inference Time: 0.001370 seconds
#Max and Mean Differences in Outputs (compared to PyTorch):
ONNX Model:
* Max difference: 6.67572021484375e-06
* Mean difference: 1.3396246458796668e-06
* TensorRT Model:
* Max difference: 5.7220458984375e-06
* Mean difference: 1.3225153452367522e-06
#Performance:
TensorRT is the fastest, with an inference time of 0.001370 seconds, roughly 8.4 times faster than PyTorch and about 26 times faster than ONNX.
PyTorch is faster than ONNX, but still slower than TensorRT.
Accuracy:
The differences in output between the models are minimal, indicating that both ONNX and TensorRT models provide results very close to the original PyTorch model. The accuracy is well-preserved during conversion.