In [None]:
import torch.nn.functional as F
from torchvision.models.mobilenetv2 import InvertedResidual
import torch.nn as nn

class BranchedBlock(nn.Module):
    """Two parallel branches applied to the same input and summed.

    Branch 1 is a torchvision MobileNetV2 ``InvertedResidual`` block built with
    ``expand_ratio=6``. Branch 2 is a 3x3 ``Conv2d`` (no bias, padding 1)
    followed by ``BatchNorm2d`` and ``ReLU``. Both branches are constructed
    with the same ``in_channels``, ``out_channels`` and ``stride``.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by each branch.
        stride: Stride passed to both branches (default 1).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # Branch 1: MobileNetV2 inverted-residual block.
        self.mobilenet_block = InvertedResidual(
            in_channels, out_channels, stride, expand_ratio=6
        )

        # Branch 2: plain convolution -> batch norm -> ReLU.
        plain_layers = [
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=3,
                stride=stride,
                padding=1,
                bias=False,
            ),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]
        self.conv_block = nn.Sequential(*plain_layers)

    def forward(self, x):
        """Feed ``x`` through both branches and return the element-wise sum."""
        return self.mobilenet_block(x) + self.conv_block(x)


In [None]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnx
Successfully installed onnx-1.16.1


In [None]:
class SimpleCNN(nn.Module):
    """Small image classifier: two BranchedBlocks plus a three-layer MLP head.

    Each BranchedBlock is followed by a 2x2 max-pool. The first linear layer
    expects ``16 * 8 * 8`` features per sample, i.e. 16 channels on an 8x8 map,
    which is what a 3x32x32 input yields after the two pooling steps. The last
    linear layer produces 10 outputs per sample.
    """

    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.branch1 = BranchedBlock(3, 6)
        self.pool = nn.MaxPool2d(2, 2)
        self.branch2 = BranchedBlock(6, 16)
        self.fc1 = nn.Linear(16 * 8 * 8, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the 10 raw class scores for a batch of images ``x``."""
        x = self.pool(self.branch1(x))
        x = self.pool(self.branch2(x))
        # Flatten everything except the batch dimension. Unlike
        # ``view(-1, 16 * 8 * 8)``, this never folds surplus spatial elements
        # into the batch dimension: an unexpected input size fails in fc1 with
        # a shape error instead of silently changing the batch size.
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [None]:
# Training script remains the same
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

# Load and preprocess the CIFAR-10 dataset.
# Normalize uses mean 0.5 and std 0.5 for each of the three channels.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=100,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=100,
                                         shuffle=False, num_workers=2)

# Initialize the network, loss function, and optimizer
net = SimpleCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Train the network
net.train()
for epoch in range(1):  # single pass over the training set; raise for more epochs
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 200 == 199:  # Print every 200 mini-batches
            print(f'[Epoch {epoch + 1}, Batch {i + 1}] loss: {running_loss / 200:.3f}')
            running_loss = 0.0

print('Finished Training')

# Save the trained model to ONNX format.
# Switch to inference mode first so BatchNorm is exported with its running
# statistics; this does not rely on the exporter changing the mode for us.
net.eval()
dummy_input = torch.randn(1, 3, 32, 32)
torch.onnx.export(net, dummy_input, "branched_cnn.onnx")


Files already downloaded and verified
Files already downloaded and verified
[Epoch 1, Batch 200] loss: 2.156
[Epoch 1, Batch 400] loss: 1.764
Finished Training


In [None]:
!pip install tensorrt

Collecting tensorrt
  Downloading tensorrt-10.1.0.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt-cu12 (from tensorrt)
  Downloading tensorrt-cu12-10.1.0.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tensorrt, tensorrt-cu12
  Building wheel for tensorrt (setup.py) ... [?25l[?25hdone
  Created wheel for tensorrt: filename=tensorrt-10.1.0-py2.py3-none-any.whl size=16332 sha256=90e19b6e6c88d51e15cc3e8a87b225d4396688ffe5eb63e733b70d5b384c8dc5
  Stored in directory: /root/.cache/pip/wheels/f5/55/f5/a1836546c0d92da062e9365a0323953f5e6a0a5f51d46da503
  Building wheel for tensorrt-cu12 (setup.py) ... [?25l[?25hdone
  Created wheel for tensorrt-cu12: filename=tensorrt_cu12-10.1.0-py2.py3-none-any.whl size=17554 sha256=5ef0568929bdfa65d993b161832910cdc82cc0509162a61ade49fbe6925866bc
  Stored in directory: /root/.cache/pip/wheels/15/96/43/6559f5cfe251d64e7a7b49efb429ae5258eff95976e5f12312
Suc

In [None]:
import tensorrt as trt

# Load ONNX model
onnx_file_path = 'branched_cnn.onnx'
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

with open(onnx_file_path, 'rb') as model:
    if not parser.parse(model.read()):
        print('Failed to load ONNX file!')
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        # Stop here: building from a partially populated network cannot work.
        raise RuntimeError(f'Could not parse ONNX model: {onnx_file_path}')

# Builder configuration
config = builder.create_builder_config()
max_workspace_size = 1 << 30  # 1 GiB
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, max_workspace_size)
config.set_flag(trt.BuilderFlag.FP16)

# INT8 execution needs per-tensor scales, which come from a calibrator or from
# explicitly set dynamic ranges. None are provided in this cell, and requesting
# INT8 without them is a likely reason for build_serialized_network() returning
# None. Assign a calibrator object here to enable the INT8 path; until then the
# regular conv branch falls back to FP16.
int8_calibrator = None
use_int8 = int8_calibrator is not None
if use_int8:
    config.set_flag(trt.BuilderFlag.INT8)
    config.int8_calibrator = int8_calibrator

# Ask the builder to follow the per-layer precisions set below where it can.
config.set_flag(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)

low_precision = trt.DataType.INT8 if use_int8 else trt.DataType.HALF

# Set precision for specific layers
for layer_idx in range(network.num_layers):
    layer = network.get_layer(layer_idx)
    # layer.type identifies the layer kind regardless of which Python wrapper
    # class get_layer() hands back.
    if layer.type != trt.LayerType.CONVOLUTION:
        continue
    # Check if the layer is part of the regular conv branch or the MobileNet branch
    if "mobilenet" in layer.name:
        layer.precision = trt.DataType.FLOAT  # FP32 for MobileNetV2 block
        layer.set_output_type(0, trt.DataType.FLOAT)
    else:
        layer.precision = low_precision  # INT8 (or FP16 fallback) for regular conv block
        layer.set_output_type(0, low_precision)

# Build and serialize the engine (build_serialized_network: TensorRT >= 8.0)
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    # The builder reports failure by returning None; the details are in the
    # TensorRT logger output.
    raise RuntimeError('Failed to build TensorRT engine; see the TensorRT log output above.')

# Save the engine to disk
with open('branched_cnn.trt', 'wb') as f:
    f.write(serialized_engine)


TypeError: a bytes-like object is required, not 'NoneType'

In [None]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2024.1.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2024.1.5-py2.py3-none-any.whl (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.1/88.1 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting appdirs>=1.4.0 (from pycuda)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting mako (from pycuda)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pycuda
  Building wheel for pycuda (pyproject.toml) ... [?25l[?25hdone
  

In [None]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import time

# Load ONNX model
onnx_file_path = 'branched_cnn.onnx'
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
# The flag enum is `NetworkDefinitionCreationFlag` (singular); create_network
# takes a bit mask built from it.
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

with open(onnx_file_path, 'rb') as model:
    if not parser.parse(model.read()):
        print('Failed to load ONNX file!')
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        # Stop here: building from a partially populated network cannot work.
        raise RuntimeError(f'Could not parse ONNX model: {onnx_file_path}')

# Builder configuration
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)
# Workspace limit via the memory-pool API (replaces `config.max_workspace_size`).
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GiB


def build_int8_calibrator():
    """Return an INT8 calibrator for this model, or None if there is none.

    Placeholder: implement this based on your dataset. While it returns None,
    the INT8 path below stays disabled.
    """
    return None


# INT8 execution needs per-tensor scales, which come from a calibrator or from
# explicitly set dynamic ranges. Only request INT8 when a calibrator exists;
# otherwise the regular conv branch falls back to FP16.
int8_calibrator = build_int8_calibrator()
use_int8 = int8_calibrator is not None
if use_int8:
    config.set_flag(trt.BuilderFlag.INT8)
    config.int8_calibrator = int8_calibrator

# Ask the builder to follow the per-layer precisions set below where it can.
config.set_flag(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)

low_precision = trt.DataType.INT8 if use_int8 else trt.DataType.HALF

# Set precision for specific layers
for layer_idx in range(network.num_layers):
    layer = network.get_layer(layer_idx)
    # layer.type identifies the layer kind regardless of which Python wrapper
    # class get_layer() hands back.
    if layer.type != trt.LayerType.CONVOLUTION:
        continue
    # Check if the layer is part of the regular conv branch or the MobileNet branch
    if "mobilenet" in layer.name:
        layer.precision = trt.DataType.FLOAT  # FP32 for MobileNetV2 block
        layer.set_output_type(0, trt.DataType.FLOAT)
    else:
        layer.precision = low_precision  # INT8 (or FP16 fallback) for regular conv block
        layer.set_output_type(0, low_precision)

# Build the serialized network
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    # The builder reports failure by returning None; the details are in the
    # TensorRT logger output.
    raise RuntimeError('Failed to build TensorRT engine; see the TensorRT log output above.')

# Save the engine to disk
with open('branched_cnn.trt', 'wb') as f:
    f.write(serialized_engine)

# Deserialize the engine
runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(serialized_engine)
if engine is None:
    raise RuntimeError('Failed to deserialize the TensorRT engine.')

# Look up the I/O tensors by name (named-tensor API; the index-based binding
# API used previously is not available in the installed TensorRT release).
input_names = []
output_names = []
for io_idx in range(engine.num_io_tensors):
    tensor_name = engine.get_tensor_name(io_idx)
    if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
        input_names.append(tensor_name)
    else:
        output_names.append(tensor_name)
input_name = input_names[0]
output_name = output_names[0]

# Page-locked host buffers and matching device buffers
h_input = cuda.pagelocked_empty(trt.volume(engine.get_tensor_shape(input_name)), dtype=np.float32)
h_output = cuda.pagelocked_empty(trt.volume(engine.get_tensor_shape(output_name)), dtype=np.float32)
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()

context = engine.create_execution_context()
context.set_tensor_address(input_name, int(d_input))
context.set_tensor_address(output_name, int(d_output))

np.copyto(h_input, np.random.rand(1, 3, 32, 32).ravel())


def run_inference_once():
    """Copy the input to the device, run the engine, copy the output back."""
    cuda.memcpy_htod_async(d_input, h_input, stream)
    context.execute_async_v3(stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()


# One untimed run so one-off initialisation cost is not part of the average.
run_inference_once()

# Measure inference time
start_time = time.perf_counter()
for _ in range(100):
    run_inference_once()
end_time = time.perf_counter()

print(f"Quantized model average inference time: {(end_time - start_time) / 100:.6f} seconds")


AttributeError: module 'tensorrt' has no attribute 'NetworkDefinitionCreationFlags'

In [None]:
import tensorrt as trt

# Load ONNX model
onnx_file_path = 'branched_cnn.onnx'
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

with open(onnx_file_path, 'rb') as model:
    if not parser.parse(model.read()):
        print('Failed to load ONNX file!')
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        # Stop here: building from a partially populated network cannot work.
        raise RuntimeError(f'Could not parse ONNX model: {onnx_file_path}')

# Builder configuration
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)
max_workspace_size = 1 << 30  # 1 GiB
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, max_workspace_size)

# INT8 execution needs per-tensor scales, which come from a calibrator or from
# explicitly set dynamic ranges. None are provided in this cell, and requesting
# INT8 without them is a likely reason for the engine build failing. Assign a
# calibrator object here to enable the INT8 path; until then the regular conv
# branch falls back to FP16.
int8_calibrator = None
use_int8 = int8_calibrator is not None
if use_int8:
    config.set_flag(trt.BuilderFlag.INT8)
    config.int8_calibrator = int8_calibrator

# Ask the builder to follow the per-layer precisions set below where it can.
config.set_flag(trt.BuilderFlag.PREFER_PRECISION_CONSTRAINTS)

low_precision = trt.DataType.INT8 if use_int8 else trt.DataType.HALF

# Set precision for specific layers
for layer_idx in range(network.num_layers):
    layer = network.get_layer(layer_idx)
    # layer.type identifies the layer kind regardless of which Python wrapper
    # class get_layer() hands back.
    if layer.type != trt.LayerType.CONVOLUTION:
        continue
    # Check if the layer is part of the regular conv branch or the MobileNet branch
    if "mobilenet" in layer.name:
        layer.precision = trt.DataType.FLOAT  # FP32 for MobileNetV2 block
        layer.set_output_type(0, trt.DataType.FLOAT)
    else:
        layer.precision = low_precision  # INT8 (or FP16 fallback) for regular conv block
        layer.set_output_type(0, low_precision)

# Report the data type of each layer's first output tensor as recorded in the
# network definition.
for layer_idx in range(network.num_layers):
    layer = network.get_layer(layer_idx)
    if layer.num_outputs == 0:
        continue
    output_dtype = layer.get_output(0).dtype
    if output_dtype == trt.int8:
        print(f"Layer {layer.name} is using INT8 precision.")
    elif output_dtype == trt.float16:
        print(f"Layer {layer.name} is using FP16 precision.")
    elif output_dtype == trt.float32:
        print(f"WARNING: Layer {layer.name} is using FP32 precision. Consider using lower precision for better performance.")
    else:
        # Non-floating-point outputs (e.g. integer shape tensors) are not FP32;
        # report their real type instead of the FP32 warning.
        print(f"Layer {layer.name} output type is {output_dtype}.")

# Build and serialize the engine (build_serialized_network: TensorRT >= 8.0)
serialized_engine = builder.build_serialized_network(network, config)

if serialized_engine is None:
    print("ERROR: Failed to build TensorRT engine. Check for errors in the ONNX model or TensorRT configuration.")
else:
    # Save the engine to disk
    with open('branched_cnn.trt', 'wb') as f:
        f.write(serialized_engine)
    print("TensorRT engine built and saved successfully.")

ERROR: Failed to build TensorRT engine. Check for errors in the ONNX model or TensorRT configuration.
