# **Suryakanta Karan (M22AIE207) m22aie207@iitj.ac.in**



#Step 1: Fine-tune a ViT model with an image dataset

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms, models
import numpy as np

# Define the data transformations
transform = transforms.Compose([
    transforms.Resize(224),  # Resize the images to 224x224
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))  # Normalize dataset
])

# Load the CIFAR-10 dataset
train_dataset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

# Sub-sample the training dataset
num_samples = 1000  # Number of samples to use for fine-tuning (you can adjust this number)
indices = np.random.choice(len(train_dataset), num_samples, replace=False)
train_subset = Subset(train_dataset, indices)

# Create data loaders
train_loader = DataLoader(train_subset, batch_size=16, shuffle=True)  # Reduced batch size
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Load a pre-trained ViT model
vit_model = models.vit_b_16(pretrained=True)

# Replace the head of the ViT model with a new head for the specific dataset
vit_model.heads = nn.Sequential(
    nn.Linear(vit_model.heads[-1].in_features, 512),
    nn.ReLU(),
    nn.Dropout(0.5),  # Add dropout layer for regularization
    nn.Linear(512, 10)  # 10 classes for CIFAR-10 dataset
)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vit_model.to(device)  # Move the model to the available device

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vit_model.parameters(), lr=0.0001, weight_decay=0.0001)  # Adjusted learning rate and added weight decay

# Define learning rate scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)  # Reduce learning rate every 3 epochs

# Fine-tune the model using the smaller subset of the data
num_epochs = 15  # Increased number of epochs
vit_model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    num_batches = 0

    for images, labels in train_loader:
        # Move images and labels to the device
        images, labels = images.to(device), labels.to(device)

        # Perform a training step
        optimizer.zero_grad()
        outputs = vit_model(images)
        loss = criterion(outputs, labels)

        loss.backward()

        # Gradient clipping to avoid exploding gradients
        nn.utils.clip_grad_norm_(vit_model.parameters(), max_norm=1.0)

        optimizer.step()

        # Update running loss
        epoch_loss += loss.item()
        num_batches += 1

    # Calculate and print average loss for the epoch
    average_loss = epoch_loss / num_batches
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss:.4f}")

    # Step the learning rate scheduler
    scheduler.step()

# Save the fine-tuned model
torch.save(vit_model.state_dict(), 'vit_finetuned.pth')


Files already downloaded and verified
Files already downloaded and verified
Epoch 1/15, Loss: 1.2794
Epoch 2/15, Loss: 0.4598
Epoch 3/15, Loss: 0.3020
Epoch 4/15, Loss: 0.0773
Epoch 5/15, Loss: 0.0326
Epoch 6/15, Loss: 0.0148
Epoch 7/15, Loss: 0.0160
Epoch 8/15, Loss: 0.0121
Epoch 9/15, Loss: 0.0090
Epoch 10/15, Loss: 0.0087
Epoch 11/15, Loss: 0.0086
Epoch 12/15, Loss: 0.0086
Epoch 13/15, Loss: 0.0087
Epoch 14/15, Loss: 0.0103
Epoch 15/15, Loss: 0.0086


#2. Save the model to ONNX format

In [8]:
!pip install onnx --upgrade


Collecting onnx
  Downloading onnx-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnx
Successfully installed onnx-1.16.0


In [9]:
import torch.onnx

# Define the input shape (batch size, channels, height, width)
dummy_input = torch.randn(1, 3, 224, 224).to(device)  # Use the same device as the model

# Load the fine-tuned model
vit_model.load_state_dict(torch.load('vit_finetuned.pth'))
vit_model.to(device)  # Move the model to the device

# Convert the model to ONNX format with opset_version 13
torch.onnx.export(
    vit_model,
    dummy_input,
    'vit_finetuned.onnx',
    input_names=['input'],
    output_names=['output'],
    opset_version=14  # Updated opset version to 14
    ,dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}  # Allow dynamic batch size
)



  assert condition, message


#3. Run the ONNX Model with ONNX Runtime

In [10]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.17.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: humanfriendly, coloredlogs, onnxruntime
Successfully installed coloredlogs-15.0.1 humanfriendly-10.0 onnxruntime-1.17.3


In [11]:
import onnxruntime as ort
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Load the ONNX model
ort_session = ort.InferenceSession('vit_finetuned.onnx')

# Define input transformation
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
])

# Load the CIFAR-10 test dataset
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

# Sub-sample the training dataset
num_samples = 200  # Number of samples to use for fine-tuning (you can adjust this number)
indices = np.random.choice(len(test_dataset), num_samples, replace=False)  # Randomly select samples
test_dataset = Subset(test_dataset, indices)

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Run inference with the ONNX model
# Adjust the input shape by iterating over the test loader and running inference one batch at a time with batch size of 1
for images, labels in test_loader:
    # Iterate over each image in the batch
    for i in range(len(images)):
        # Get a single image
        single_image = images[i:i + 1]
        single_image_np = single_image.numpy()

        # Perform inference with a single image
        outputs = ort_session.run(None, {'input': single_image_np})
        prediction = np.argmax(outputs[0], axis=1)

        # Print or process the prediction as needed
        print(f"Prediction: {prediction}, Ground Truth: {labels[i].item()}")



Files already downloaded and verified
Prediction: [7], Ground Truth: 7
Prediction: [4], Ground Truth: 3
Prediction: [6], Ground Truth: 6
Prediction: [5], Ground Truth: 5
Prediction: [3], Ground Truth: 6
Prediction: [8], Ground Truth: 8
Prediction: [2], Ground Truth: 2
Prediction: [0], Ground Truth: 0
Prediction: [3], Ground Truth: 5
Prediction: [2], Ground Truth: 2
Prediction: [0], Ground Truth: 0
Prediction: [8], Ground Truth: 8
Prediction: [8], Ground Truth: 8
Prediction: [3], Ground Truth: 4
Prediction: [7], Ground Truth: 7
Prediction: [2], Ground Truth: 2
Prediction: [4], Ground Truth: 4
Prediction: [1], Ground Truth: 1
Prediction: [3], Ground Truth: 5
Prediction: [1], Ground Truth: 1
Prediction: [3], Ground Truth: 3
Prediction: [0], Ground Truth: 5
Prediction: [3], Ground Truth: 3
Prediction: [6], Ground Truth: 9
Prediction: [6], Ground Truth: 6
Prediction: [9], Ground Truth: 9
Prediction: [9], Ground Truth: 9
Prediction: [0], Ground Truth: 2
Prediction: [8], Ground Truth: 8
Predi

#4. Use TorchScript to Convert Your Code for Inference to a C++ Readable Format
To convert the model to TorchScript format:

Load the fine-tuned model.
Convert the model to TorchScript using the torch.jit.trace or torch.jit.script functions.
Save the TorchScript model.

In [12]:
# Convert the model to TorchScript
vit_model_scripted = torch.jit.script(vit_model)
vit_model_scripted.save('vit_finetuned_scripted.pt')

#5. Load the Torchscript model in C++ for the inference on the test set of your dataset

SyntaxError: invalid syntax (<ipython-input-16-1bf71f993edb>, line 4)