This notebook allows you to run inference with a DINOv2 model.

In [None]:
import GPUtil
import torch
from torchvision import transforms
import numpy as np
import requests
from PIL import Image
from io import BytesIO

# Release cached CUDA memory held by PyTorch's allocator before we start.
torch.cuda.empty_cache()

# Report id, name and memory usage (in MB) for every GPU that GPUtil can see.
gpus = GPUtil.getGPUs()
for gpu in gpus:
    report = (
        f"GPU ID: {gpu.id}, GPU Name: {gpu.name}",
        f"Total GPU memory: {gpu.memoryTotal} MB",
        f"Free GPU memory: {gpu.memoryFree} MB",
        f"Used GPU memory: {gpu.memoryUsed} MB",
    )
    print("\n".join(report))

# Load DINOv2 model

In [None]:
# Hub repository and entry point of the model to load.
# Alternative entry points that can be swapped in for `hub_entry`:
#   'dinov2_vitb14_lc', 'dinov2_vitl14_lc', 'dinov2_vitg14_lc'
hub_repo = 'facebookresearch/dinov2'
hub_entry = 'dinov2_vits14_lc'
dinov2_vit14 = torch.hub.load(hub_repo, hub_entry)

In [None]:
# move model to the GPU when CUDA is available; otherwise keep it on the CPU
# so this cell does not crash on machines without a usable GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dinov2_vit14 = dinov2_vit14.to(device)

# Load image to run inference on

In [None]:
# load image from URL; use a timeout and fail fast on HTTP errors so that an
# error page is never handed to PIL as if it were image data
response = requests.get(
    'https://m.media-amazon.com/images/M/MV5BMTM1MjQzMDA5NV5BMl5BanBnXkFtZTcwMDk5MDg3Mw@@._V1_.jpg',
    timeout=30,
)
response.raise_for_status()

# Force 3-channel RGB: grayscale, palette or RGBA files would otherwise produce
# a tensor with a channel count the model cannot consume.
pil_img = Image.open(BytesIO(response.content)).convert('RGB')

# Define the transformation to convert the PIL image to a PyTorch tensor:
# ToTensor gives a float channels-first tensor scaled to [0, 1]; Normalize then
# applies the ImageNet mean/std used by the DINOv2 preprocessing pipeline.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

# Apply the transformation to convert PIL image to a PyTorch tensor
img = transform(pil_img)  # channels_first torch tensor: (C, H, W)
img = img[None, ...]  # add batch dimension: (1, C, H, W)

In [None]:
def resize(img, size, preserve_aspect_ratio=True):
    """Resize `img` to exactly `size` after checking DINOv2's patch constraint.

    Args:
        img: Image to resize; passed straight to `transforms.Resize`
            (in this notebook it is a batched channels-first tensor).
        size: Target `(height, width)`; both entries must be multiples of 14.
        preserve_aspect_ratio: Accepted for interface compatibility but
            currently NOT used -- the output always has exactly `size`.

    Returns:
        The resized image produced by `transforms.Resize(size, antialias=True)`.

    Raises:
        AssertionError: If the height or the width is not a multiple of 14.
    """
    height, width = size[0], size[1]
    assert height % 14 == 0, "DINOv2 expects input image with shapes that are multiples of 14. Height is not a multiple of 14."
    assert width % 14 == 0, "DINOv2 expects input image with shapes that are multiples of 14. Width is not a multiple of 14."

    # Build the resize op and apply it in a single step.
    return transforms.Resize(size, antialias=True)(img)

In [None]:
# choose the image shape to do inference on
# `img` is (batch, channels, height, width): shape[2] is the height and
# shape[3] the width. To keep the aspect ratio, the new width must scale with
# width / height (the previous height / width ratio swapped the orientation).
new_height = 14 * 5
aspect_ratio = img.shape[3] / img.shape[2]  # width / height
# round to the nearest multiple of 14, but never let the width collapse to 0
new_width = max(1, round(aspect_ratio * new_height / 14)) * 14
print(new_height, new_width)

img_resized = resize(img, (new_height, new_width))

In [None]:
# run inference; the result can be used for downstream processing
# NOTE(review): the loaded hub entry point ends in `_lc` (linear classifier), so
# the output is presumably classification logits rather than raw backbone
# features -- confirm against the DINOv2 hub documentation.
dinov2_vit14.eval()  # inference behaviour for any train/eval-dependent layers

# Put the input on whatever device the model's parameters live on.
model_device = next(dinov2_vit14.parameters()).device

# no_grad: skip building the autograd graph, which is only needed for training
with torch.no_grad():
    features = dinov2_vit14(img_resized.to(model_device))