# This notebook explores TensorRT

## Importing the required libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler

from torchvision import datasets, models
import torchvision.transforms as transforms

from onnx import ModelProto
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

import numpy as np
import matplotlib.pyplot as plt
from time import time

## Checking whether CUDA is available and storing the selected compute device in the `device` variable

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Echo the chosen device as the cell output.
device

## Using data augmentation for better tuning

In [None]:
# Training pipeline: random flips, rotations and occasional grayscale
# conversion, followed by tensor conversion and per-channel normalisation
# with mean 0.5 and std 0.5.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(20),
    transforms.RandomGrayscale(p=0.2),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Evaluation pipeline: no random augmentation, same tensor conversion and
# normalisation as the training pipeline.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

## Let's load the data

In [None]:
# CIFAR-10 is stored under ./data (and downloaded if requested); the training
# split goes through the augmenting pipeline, the test split through the
# plain one.
train_data = datasets.CIFAR10(
    root='data',
    train=True,
    download=True,
    transform=train_transform,
)
test_data = datasets.CIFAR10(
    root='data',
    train=False,
    download=True,
    transform=test_transform,
)

In [None]:
len(test_data)

In [None]:
num_train = len(train_data)
num_train

In [None]:
indices = list(range(num_train))

In [None]:
np.random.shuffle(indices)

In [None]:
val_size = 0.04
split = int(np.floor((val_size * num_train)))
split

In [None]:
train_idx, val_idx = indices[split:],indices[:split]

In [None]:
train_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(val_idx)

In [None]:
batch_size = 100

In [None]:
# Training batches are drawn from the training subset through the augmenting
# transform attached to train_data.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation has to see un-augmented images, otherwise the reported loss and
# accuracy change with the random flips/rotations. Wrap the same training
# files in a second dataset that uses the deterministic test-time transform;
# the sampler still restricts it to the held-out validation indices.
val_data = datasets.CIFAR10(root='data', train=True, download=False, transform=test_transform)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# The test split is read sequentially, without a sampler.
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

In [None]:
len(train_loader)

In [None]:
len(val_loader)

In [None]:
no_batches = len(test_loader)
no_batches

In [None]:
classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck']
no_classes = len(classes)

## Visualizing data

In [None]:
def imshow(img):
    """Display one normalised image with matplotlib.

    The input is rescaled with ``img / 2 + 0.5`` and its axes are reordered
    from ``(0, 1, 2)`` to ``(1, 2, 0)`` before plotting.

    Args:
        img: Image array; assumed to be channel-first and normalised with
            mean 0.5 / std 0.5 (TODO confirm against the caller).
    """
    unnormalised = img / 2 + 0.5
    channels_last = np.transpose(unnormalised, (1, 2, 0))
    plt.imshow(channels_last)

In [None]:
# Pull a single batch from the training loader for visual inspection.
data_iter = iter(train_loader)
# Use the built-in next(): DataLoader iterators implement __next__, while the
# Python-2-style .next() method is not available on current PyTorch releases.
images, labels = next(data_iter)
# Convert to NumPy so the images can be handed to matplotlib.
images = images.numpy()

In [None]:
fig = plt.figure(figsize=(25, 4))

# Two rows of thumbnails. The column count must be an integer (a float such as
# batch_size / 2 is rejected by add_subplot); rounding up keeps the grid large
# enough when batch_size is odd.
n_cols = (batch_size + 1) // 2

for im in range(batch_size):
    ax = fig.add_subplot(2, n_cols, im + 1, xticks=[], yticks=[])
    imshow(images[im])
    # Title each thumbnail with the class name of its label.
    ax.set_title(classes[labels[im]])

## Custom model definition, just to experiment with TensorRT

In [None]:
class Net(nn.Module):
    """Convolutional classifier producing 10 log-probabilities per image.

    Three convolutional stages (32, 64 and 128 channels), each followed by
    2x max-pooling and dropout, feed a three-layer fully connected head.
    In the first two stages the second convolution (``conv2`` / ``conv4``)
    is applied eight times in a row, reusing the same weights each time.

    ``fc1`` expects 2048 input features, i.e. 128 channels on a 4x4 map,
    which is what 32x32 inputs yield after the three pooling steps.
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 3 -> 32 channels.
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, padding=1)

        # Stage 2: 32 -> 64 channels.
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv4 = nn.Conv2d(64, 64, 3, padding=1)

        # Stage 3: 64 -> 128 channels.
        self.conv5 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv6 = nn.Conv2d(128, 128, 3, padding=1)

        # Shared pooling and dropout layers.
        self.pool = nn.MaxPool2d(2)
        self.drop = nn.Dropout(0.4)

        # Classifier head.
        self.fc1 = nn.Linear(2048, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: Input batch; the flattened feature size expected by ``fc1``
                implies 3-channel 32x32 images.

        Returns:
            Tensor of shape ``(batch, 10)`` holding log-softmax scores.
        """
        # Stage 1: conv1 once, then conv2 eight times with shared weights.
        x = F.relu(self.conv1(x))
        for _ in range(8):
            x = F.relu(self.conv2(x))
        x = self.drop(self.pool(x))

        # Stage 2: conv3 once, then conv4 eight times with shared weights.
        x = F.relu(self.conv3(x))
        for _ in range(8):
            x = F.relu(self.conv4(x))
        x = self.drop(self.pool(x))

        # Stage 3: one pass through each convolution.
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
        x = self.drop(self.pool(x))

        # Flatten everything except the batch dimension.
        x = x.view(x.shape[0], -1)

        x = self.drop(F.relu(self.fc1(x)))
        x = self.drop(F.relu(self.fc2(x)))
        return F.log_softmax(self.fc3(x), dim=1)

In [None]:
# Instantiate the network that will be trained.
model = Net()

# Echo the parameter generator as the cell output; it is not consumed here.
model.parameters()

In [None]:
def init_weights(m):
    """Apply Xavier-uniform initialisation to the weights of conv/linear layers.

    Written to be passed to ``Module.apply``. Modules that are not
    ``nn.Conv2d`` or ``nn.Linear`` (or subclasses of them) are left
    untouched, and biases keep their existing values.

    Args:
        m: The module to (possibly) re-initialise in place.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        # The trailing underscore is the in-place initialiser; the
        # un-suffixed name is a deprecated alias for it.
        nn.init.xavier_uniform_(m.weight)
        
# Run init_weights on the model so its conv/linear weights are re-initialised.
model.apply(init_weights)

In [None]:
# Move the parameters to the selected device before the optimizer is built.
model = model.to(device)

In [None]:
# NLLLoss pairs with the log_softmax output returned by Net.forward.
criterion = nn.NLLLoss()
# Adam over all model parameters with a learning rate of 0.001.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# range(1, no_epochs) below therefore runs epochs 1..10.
no_epochs = 1+10

# Best validation loss seen so far. np.inf is the spelling available in every
# NumPy release (the np.Inf alias was removed in NumPy 2.0).
val_loss_min = np.inf

for epoch in range(1, no_epochs):

    start = time()

    # Running sums over the batches of this epoch, kept as plain floats.
    train_loss = 0.0
    train_acc = 0.0
    val_loss = 0.0
    val_acc = 0.0

    # ---- training pass ----
    model.train()
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Fraction of this batch whose arg-max class matches the label.
        _, pred = torch.max(output, dim=1)
        equals = pred == target.view(*pred.shape)
        train_acc += torch.mean(equals.type(torch.FloatTensor)).item()

    # ---- validation pass ----
    # eval() switches dropout off; no_grad() avoids building autograd graphs
    # that are never back-propagated through.
    model.eval()
    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.to(device), target.to(device)

            output = model(data)
            loss = criterion(output, target)

            val_loss += loss.item()

            _, pred = torch.max(output, dim=1)
            equals = pred == target.view(*pred.shape)
            val_acc += torch.mean(equals.type(torch.FloatTensor)).item()

    # Convert the running sums into per-batch averages (accuracy in percent).
    train_loss = train_loss / len(train_loader)
    train_acc = train_acc * 100 / len(train_loader)
    val_loss = val_loss / len(val_loader)
    val_acc = val_acc * 100 / len(val_loader)

    end = time()
    taken = end - start

    print('Epoch: {} \tTime: {:.3f} \nTraining Loss: {:.6f} \tTraining Acc: {:.2f} \tValidation Loss: {:.6f} \tValidation Acc: {:.2f}'.format(epoch, taken, train_loss, train_acc, val_loss, val_acc))

    # Checkpoint whenever the validation loss does not get worse.
    if val_loss <= val_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(val_loss_min, val_loss))
        torch.save(model.state_dict(), 'model_cifar.pth')
        val_loss_min = val_loss

In [None]:
# Rebuild the architecture and restore the checkpoint written by the training loop.
model_test = Net()
# map_location keeps the load working when the checkpoint tensors were saved
# from a different device than the one selected now (e.g. trained on GPU,
# reloaded on a CPU-only machine).
model_test.load_state_dict(torch.load('model_cifar.pth', map_location=device))
model_test = model_test.to(device)
# Put the dropout layers into inference mode; the cell output is the module repr.
model_test.eval()

## Inference with native PyTorch model

In [None]:
# Sum of per-batch accuracies over all passes.
test_acc = 0.0
start = time()
# Number of full passes over the test set, to get a steadier timing.
b_epoch = 10

# Inference only: disable autograd so no graph is recorded per batch.
with torch.no_grad():
    for i in range(b_epoch):

        for data, target in test_loader:

            data, target = data.to(device), target.to(device)

            output = model_test(data)

            _, pred = torch.max(output, dim=1)

            equal = pred == target.view(*pred.shape)

            test_acc += torch.mean(equal.type(torch.FloatTensor)).item()

# Average over the passes here; the division by the batch count happens in
# the print below.
test_acc /= b_epoch
taken = time() - start
print("Accuracy is: {:.2f}%".format(test_acc * 100 /len(test_loader)))
print("Time taken: {:.2f}s".format(taken))

## Converting the model from .pth to .onnx format

In [None]:
onnx_path = "../workspace/model_cifar.onnx"

In [None]:
engine_name = "../workspace/model_fp16.plan"

In [None]:
input_shape = (batch_size,3,32,32)
inputs = torch.ones(*input_shape)
inputs = inputs.to(device)
inputs.shape

In [None]:
torch.onnx.export(model_test, inputs, onnx_path, input_names = None, output_names = None, dynamic_axes = None)

## TensorRT flow

In [None]:
# Logger passed to the TensorRT builder, ONNX parser and runtime; created with
# the WARNING severity level.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Runtime object; load_engine() expects one for deserializing saved engines.
trt_runtime = trt.Runtime(TRT_LOGGER)

In [None]:
def build_engine(onnx_path, shape=(32, 3, 32, 32)):
    """Build an FP16 TensorRT engine from an ONNX file.

    Args:
        onnx_path: Path of the ONNX model to parse.
        shape: Shape assigned to the network's first input after parsing.

    Returns:
        The engine produced by the builder.

    Raises:
        RuntimeError: If the ONNX file cannot be parsed or the builder does
            not return an engine.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_builder_config() as config, \
            builder.create_network(1) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        # Keep every build option on the config object and hand that config
        # to the builder below. Building without it would silently drop the
        # workspace limit and the optimization profile set here.
        config.set_flag(trt.BuilderFlag.FP16)
        config.max_workspace_size = (3072 << 20)

        # No shape ranges are set on the profile: the input shape is fixed
        # explicitly further down.
        profile = builder.create_optimization_profile()
        config.add_optimization_profile(profile)

        with open(onnx_path, 'rb') as model:
            parsed = parser.parse(model.read())
        if not parsed:
            # Surface the parser diagnostics instead of building from a
            # partially populated network.
            errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
            raise RuntimeError(
                'Failed to parse ONNX file {}: {}'.format(onnx_path, '; '.join(errors))
            )

        network.get_input(0).shape = shape
        engine = builder.build_engine(network, config)
        if engine is None:
            raise RuntimeError('TensorRT failed to build an engine from {}'.format(onnx_path))
        return engine

In [None]:
def save_engine(engine, file_name):
    """Serialize ``engine`` and write the result to ``file_name``.

    The engine is serialized before the file is opened, so a failing
    serialization does not leave an empty file behind.

    Args:
        engine: Object exposing ``serialize()``, whose return value is
            written as-is in binary mode.
        file_name: Destination path; an existing file is overwritten.
    """
    serialized = engine.serialize()
    with open(file_name, 'wb') as plan_file:
        plan_file.write(serialized)

In [None]:
def load_engine(trt_runtime, plan_path):
    """Read a serialized engine from disk and deserialize it.

    Args:
        trt_runtime: Object exposing ``deserialize_cuda_engine(bytes)``.
        plan_path: Path of the serialized engine file.

    Returns:
        Whatever ``trt_runtime.deserialize_cuda_engine`` returns for the
        file's contents.
    """
    # Open the path that was passed in, not an unrelated (and undefined)
    # global name.
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    return trt_runtime.deserialize_cuda_engine(engine_data)

In [None]:
model = ModelProto()
with open(onnx_path, "rb") as f:
    model.ParseFromString(f.read())

In [None]:
d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
shape = [batch_size , d0, d1 ,d2]

In [None]:
shape

In [None]:
# Build the engine for the shape computed from the ONNX input, then write the
# serialized engine to engine_name.
engine = build_engine(onnx_path, shape= shape)
save_engine(engine, engine_name) 

In [None]:
def load_images_to_buffer(pics, pagelocked_buffer):
    """Flatten ``pics`` and copy the values into ``pagelocked_buffer`` in place.

    Args:
        pics: Array-like input; it is converted with ``np.asarray`` and
            flattened in C order.
        pagelocked_buffer: Destination array that receives the flattened
            values via ``np.copyto``.
    """
    flattened = np.asarray(pics).reshape(-1)
    np.copyto(pagelocked_buffer, flattened)

In [None]:
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size):
    """Run one batch through a TensorRT engine and return the host output buffer.

    Args:
        engine: Engine used to create the execution context.
        pics_1: Input batch; flattened and copied into ``h_input_1``.
        h_input_1: Host buffer for the input.
        d_input_1: Device allocation for the input.
        h_output: Host buffer that receives the output.
        d_output: Device allocation for the output.
        stream: CUDA stream on which the copies and the inference run.
        batch_size: Batch size passed to the execution context.

    Returns:
        ``h_output`` itself (not a copy), so a later call with the same
        buffer overwrites the values.
    """
    load_images_to_buffer(pics_1, h_input_1)

    with engine.create_execution_context() as context:
        # Attach a profiler to the context, as the original flow does.
        context.profiler = trt.Profiler()

        # Enqueue the upload, the inference and the download on the SAME
        # stream, so that the stream orders the three steps and the single
        # synchronize() below covers all of them.
        cuda.memcpy_htod_async(d_input_1, h_input_1, stream)
        context.execute_async(
            batch_size=batch_size,
            bindings=[int(d_input_1), int(d_output)],
            stream_handle=stream.handle,
        )
        cuda.memcpy_dtoh_async(h_output, d_output, stream)

        # Wait until the predictions have landed in the host buffer.
        stream.synchronize()

    return h_output

In [None]:
def allocate_buffers(engine):
    """Allocate host and device buffers plus a stream for an engine.

    Binding 0 is used to size the input buffers and binding 1 to size the
    output buffers (assumes the engine has exactly that layout; TODO confirm).

    Args:
        engine: Engine queried for the shape and dtype of bindings 0 and 1.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output, stream)``.
    """
    input_binding, output_binding = 0, 1

    # Flat host buffers from cuda.pagelocked_empty, sized by the binding
    # volume and typed by the binding dtype.
    h_input = cuda.pagelocked_empty(
        trt.volume(engine.get_binding_shape(input_binding)),
        trt.nptype(engine.get_binding_dtype(input_binding)),
    )
    h_output = cuda.pagelocked_empty(
        trt.volume(engine.get_binding_shape(output_binding)),
        trt.nptype(engine.get_binding_dtype(output_binding)),
    )

    # Device allocations with the same byte size as their host counterparts.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    stream = cuda.Stream()

    return h_input, d_input, h_output, d_output, stream

## Inference with TensorRT engine

In [None]:
# Sum of per-batch accuracies over all passes.
test_acc = 0.0
start = time()

# Allocate the host/device buffers and the stream once and reuse them for
# every batch. do_inference() overwrites the input buffer on each call, so
# nothing carries over between batches, and repeated allocation no longer
# piles up memory or inflates the measured time.
h_input, d_input, h_output, d_output, stream = allocate_buffers(engine)

for i in range(b_epoch):
    for image, label in test_loader:
        temp = np.asarray(image, dtype=np.float32)

        out = do_inference(engine, temp, h_input, d_input, h_output, d_output, stream, batch_size)
        # One row of class scores per image. The tensor shares memory with
        # the host output buffer, so it is consumed before the next call.
        out = torch.from_numpy(out.reshape(batch_size, -1))

        _, pred = torch.max(out, dim=1)
        equal = pred == label.view(*pred.shape)

        test_acc += torch.mean(equal.type(torch.FloatTensor)).item()

# Average over the passes here; the division by the batch count happens in
# the print below.
test_acc /= b_epoch
taken = time() - start
print("Accuracy is: {:.2f}%".format(test_acc * 100 /len(test_loader)))
print("Time taken: {:.2f}s".format(taken))