In [2]:
# Ensure CUDA and cuDNN are installed
!nvcc --version
!nvidia-smi
# Install the required dependencies for building TensorFlow with TensorRT support
!sudo apt-get update
!sudo apt-get install -y libnvinfer8 libnvinfer-dev libnvinfer-plugin8
# (Install other necessary packages as mentioned in TensorFlow documentation)
# Clone the TensorFlow repository and checkout the desired branch
!git clone https://github.com/tensorflow/tensorflow.git
%cd tensorflow
!git checkout r2.10 # Check the TensorFlow-TensorRT compatibility matrix for the correct branch.
# Configure TensorFlow build with TensorRT enabled
# ./configure
# (During configuration, enable TensorRT support when prompted)
# If you are using a virtual environment, activate it before building TensorFlow.
# Build and install TensorFlow
!bazel build --config=cuda --config=monolithic ... (Specify the build target with TensorRT support)
!bazel install ... (Install the built TensorFlow package)
# After successful installation, restart the runtime to ensure the new TensorFlow installation is used.

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Fri Dec 27 05:01:48 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                      

In [3]:
%cd ~

/root


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import tensorflow as tf
import time

In [5]:
from tensorflow.keras.models import load_model

In [9]:
saved_model_dir = '/content/drive/MyDrive/model/model_trt'
loaded_model = tf.saved_model.load(saved_model_dir)

In [11]:
params = tf.experimental.tensorrt.ConversionParams(precision_mode='FP16')  # Use FP32 or INT8 as needed
converter = tf.experimental.tensorrt.Converter(input_saved_model_dir='/content/drive/MyDrive/model/model_trt', conversion_params=params)
converter.convert()

<ConcreteFunction (*, inputs: TensorSpec(shape=(None, 32, 32, 3), dtype=tf.float32, name='inputs')) -> Dict[['output_0', TensorSpec(shape=(None, 10), dtype=tf.float32, name='output_0')]] at 0x7E061DDA14B0>

In [13]:
optimized_model_dir = '/content/drive/MyDrive/test'
converter.save(optimized_model_dir)

In [14]:
model = loaded_model

In [16]:
# Original model
original_model = loaded_model

# TensorRT-optimized model
optimized_model = tf.saved_model.load('/content/drive/MyDrive/test')

In [17]:
original_serving_fn = original_model.signatures['serving_default']
optimized_serving_fn = optimized_model.signatures['serving_default']

In [18]:
import numpy as np

# Sample batch of data for benchmarking
input_shape = (32, 32, 3)
batch_size = 32
dummy_input = np.random.random((batch_size, *input_shape)).astype(np.float32)
original_input = tf.convert_to_tensor(dummy_input)
optimized_input = tf.convert_to_tensor(dummy_input)

In [19]:
def measure_latency(model_fn, input_key, input_tensor, warmup=10, iterations=100):

    # Warm-up runs
    for _ in range(warmup):
        _ = model_fn(**{input_key: input_tensor})

    # Measure latency
    start_time = time.time()
    for _ in range(iterations):
        _ = model_fn(**{input_key: input_tensor})
    end_time = time.time()

    # Calculate average latency
    avg_latency = (end_time - start_time) / iterations
    return avg_latency * 1000  # Convert to milliseconds

In [20]:
original_latency = measure_latency(
    original_serving_fn, "inputs", original_input
)

In [21]:
optimized_latency = measure_latency(
    optimized_serving_fn, "inputs", optimized_input
)

In [22]:
print(f"Original Model Latency: {original_latency:.2f} ms")
print(f"Optimized Model Latency: {optimized_latency:.2f} ms")

Original Model Latency: 0.49 ms
Optimized Model Latency: 0.45 ms
