<a href="https://colab.research.google.com/github/PacktPublishing/Modern-Computer-Vision-with-PyTorch-2E/blob/main/Chapter18/quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
try:
  from torch_snippets import *
except:
  %pip install torch-snippets gitPython lovely-tensors
  from torch_snippets import *

from git import Repo

repository_url = 'https://github.com/sizhky/quantization'
destination_directory = '/content/quantization'
if exists(destination_directory):
  repo = Repo(destination_directory)
else:
  repo = Repo.clone_from(repository_url, destination_directory)

%cd {destination_directory}
%pip install -qq -r requirements.txt # this will take about 5 min of time
%pip install -U torchvision
%pip install -U torch-tensorrt
# print(repo.git.pull('origin', 'main'))

# Train

In [2]:
# Confirm that torch-tensorrt imports cleanly and show which version is installed.
import torch_tensorrt
torch_tensorrt.__version__

'2.2.0'

In [3]:
# Change to `DEBUG=false` in the line below
# to train on a larger dataset
%env DEBUG=true
!make train

env: DEBUG=true
python -m src.defect_classification.train
Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100% 528M/528M [00:05<00:00, 108MB/s]
Downloading readme: 100% 495/495 [00:00<00:00, 3.35MB/s]
Downloading data: 100% 306M/306M [00:01<00:00, 171MB/s]
Downloading data: 100% 305M/305M [00:01<00:00, 176MB/s]
Downloading data: 100% 263M/263M [00:03<00:00, 71.0MB/s]
Generating train split: 100% 2331/2331 [00:04<00:00, 529.93 examples/s]
Generating valid split: 100% 1004/1004 [00:01<00:00, 715.76 examples/s]
Class Balance
 
```↯ AttrDict ↯
train
  non_defect - 50 (int)
  defect - 50 (int)
valid
  non_defect - 50 (int)
  defect - 50 (int)

```

Map: 100% 100/100 [00:19<00:00,  5.24 examples/s]
Map: 100% 100/100 [00:17<00:00,  5.67 examples/s]
Epoch: [1;36m1[0m [33mtrain_epoch_loss[0m=[1;36m0[0m[1

# Benchmarks

In [4]:
# Make `locale.getpreferredencoding()` report UTF-8 before shelling out.
# NOTE(review): presumably a workaround for `!` commands failing in Colab when
# the reported locale is not UTF-8 -- confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!make basic-benchmark
# See the Makefile for the actual Python command

python -m src.defect_classification.basic_benchmark
Started computing roc auc score...
Map: 100% 100/100 [00:18<00:00,  5.38 examples/s]
ROC AUC Score: 0.89
Started benchmarks...
Average batch time: 118.61 ms


```python
# Load the serialized model from model.pth (presumably written by the training
# step -- confirm) and switch it to evaluation mode.
model = torch.load('model.pth').eval()

# Input shape handed to the compiler; its leading 32 equals the batch_size
# used by the evaluation DataLoader in get_roc_auc_score.
input_shape = (32,3,224,224)
# Compile the model with Torch-TensorRT, allowing half-precision kernels.
trt_model_hp = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input(input_shape)],
    enabled_precisions= {torch_tensorrt.dtype.half} # Run with FP16
)

from sklearn.metrics import roc_auc_score
from datasets import load_dataset
from src.defect_classification.train import process_example, DefectsDataset

@torch.no_grad()
def get_roc_auc_score(model):
    """Print the ROC AUC of `model` on a slice of the validation split.

    The first 50 and last 50 rows of the `valid` split of
    'sizhkhy/kolektor_sdd2' are loaded, preprocessed with `process_example`,
    and fed through `model` on the GPU in batches of 32. The score is printed,
    not returned.

    Runs under `torch.no_grad()` (matching `benchmark`) because this is pure
    inference and no autograd graph is needed.

    Args:
        model: A callable taking a CUDA image batch. If it is not an
            `nn.Module`, its output is indexed with `[0]` before use.
    """
    print("Started computing roc auc score...")
    predictions, actuals = [], []

    val_ds = load_dataset('sizhkhy/kolektor_sdd2', split="valid[:50]+valid[-50:]")
    val_ds = val_ds.map(process_example).remove_columns(['split', 'path'])
    val_ds.set_format("pt", columns=["image", "label"], output_all_columns=True)
    val_ds = DefectsDataset(val_ds)
    # drop_last keeps every batch at exactly 32 items (presumably to match the
    # fixed input shape the compiled model expects -- confirm).
    # NOTE(review): combined with shuffle=True this discards a different
    # remainder of samples on each run, so the score can vary slightly.
    val_dl = DataLoader(val_ds, batch_size=32, shuffle=True, drop_last=True)

    for x, y in val_dl:
        if isinstance(model, nn.Module):
          prediction = model(x.cuda()).detach().cpu().numpy().tolist()
        else: # half/int8 model
          prediction = model(x.cuda())[0].detach().cpu().numpy().tolist()
        predictions.extend(prediction)
        actuals.extend(y.detach().cpu().numpy().tolist())

    actuals = flatten(actuals)
    predictions = flatten(predictions)
    print(f"ROC AUC Score: {roc_auc_score(actuals, predictions):.2f}")

# Report the ROC AUC of the half-precision TensorRT-compiled model.
get_roc_auc_score(trt_model_hp)

import time

@torch.no_grad()
def benchmark(model, input_shape=(32, 3, 224, 224), nwarmup=5, nruns=100, device="cuda"):
    """Print the average forward-pass time of `model` on one random batch.

    Args:
        model: Callable invoked as `model(input_data)`.
        input_shape: Shape of the random input tensor.
        nwarmup: Number of untimed calls made before measuring.
        nruns: Number of timed calls that are averaged.
        device: Device the input is moved to. Defaults to "cuda", which is
            what the function always used before this parameter existed.

    The average is printed in milliseconds; nothing is returned.
    """
    print("Started benchmarks...")
    input_data = torch.randn(input_shape)
    input_data = input_data.to(device)
    on_gpu = input_data.is_cuda

    def _wait_for_gpu():
        # CUDA work is queued asynchronously, so the host must block here for
        # wall-clock timings to include the actual execution.
        if on_gpu:
            torch.cuda.synchronize()

    for _ in range(nwarmup):
        model(input_data)
    _wait_for_gpu()

    timings = []
    for _ in range(nruns):
        start_time = time.perf_counter()
        model(input_data)
        # Without this the timer would stop as soon as the work is enqueued.
        _wait_for_gpu()
        end_time = time.perf_counter()
        timings.append(end_time - start_time)
    timing = np.mean(timings)*1000
    print(f'Average batch time: {timing:.2f} ms')

# Time the half-precision TensorRT-compiled model using the default shape and run counts.
benchmark(trt_model_hp)
```

In [5]:
# See the Makefile for the actual Python command; per this cell's output it
# runs `python -m src.defect_classification.fp16_benchmark`.
!make fp16-benchmark

python -m src.defect_classification.fp16_benchmark
INFO:datasets:PyTorch version 2.2.1+cu121 available.
INFO:datasets:Polars version 0.20.2 available.
INFO:datasets:TensorFlow version 2.15.0 available.
INFO:datasets:JAX version 0.4.26 available.
Loading trt model...
INFO:torch_tensorrt._compile:ir was set to default, using dynamo as ir
INFO:torch_tensorrt.dynamo._compiler:Compilation Settings: CompilationSettings(precision=torch.float16, debug=False, workspace_size=0, min_block_size=5, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_long_and_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.DEFAULT: 0>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size