In [15]:
# Define the VOC2012 dataset directory
voc2012_dir = 'VOC2012'
voc_dataset = VOCSegmentation(root=voc2012_dir, year='2012', image_set='train', download=True)

Using downloaded and verified file: VOC2012\VOCtrainval_11-May-2012.tar
Extracting VOC2012\VOCtrainval_11-May-2012.tar to VOC2012


In [14]:
import timm
import torch
import urllib
from PIL import Image
from torchvision.datasets import VOCSegmentation
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

In [17]:
with open("voc2012_labels.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]


In [18]:
# Create the MobileNetV3 model
model = timm.create_model('mobilenetv3_large_100', pretrained=True)
model.eval()

MobileNetV3(
  (conv_stem): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn1): BatchNormAct2d(
    16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): Hardswish()
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
        (bn1): BatchNormAct2d(
          16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): ReLU(inplace=True)
        )
        (se): Identity()
        (conv_pw): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn2): BatchNormAct2d(
          16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): Identity()
        )
        (drop_path): Identity()
      )
    )
    (1): Sequential(
      (0): InvertedResidual(
 

In [20]:
# Process all images in the dataset
for image_index in range(len(voc_dataset)):
    # Load an image from the dataset
    image, _ = voc_dataset[image_index]

    # Preprocess the image
    input_image = transform(image).unsqueeze(0)  # transform and add batch dimension

    # Pass the image through the model
    with torch.no_grad():
        output = model(input_image)

    # Perform softmax on the output to get probabilities
    probabilities = torch.nn.functional.softmax(output[0], dim=0)

    # Print top categories and probabilities
    top5_prob, top5_catid = torch.topk(probabilities, 5)
    print(f"Image Index: {image_index}")
    for i in range(top5_prob.size(0)):
        print(categories[top5_catid[i]], top5_prob[i].item())

Image Index: 0
airliner 0.6553478837013245
wing 0.13278770446777344
aircraft carrier 0.04520115256309509
airship 0.030603652819991112
warplane 0.029921643435955048
Image Index: 1
screen 0.4549400210380554
desktop computer 0.23285773396492004
computer keyboard 0.07801167666912079
mouse 0.03563154861330986
monitor 0.030770277604460716
Image Index: 2
Great Dane 0.4239879250526428
Staffordshire bullterrier 0.17405590415000916
pug 0.044321272522211075
Labrador retriever 0.042057767510414124
Doberman 0.03423333168029785
Image Index: 3
crane 0.7668129205703735
little blue heron 0.02755310758948326
hornbill 0.011200114153325558
American egret 0.009406385943293571
ostrich 0.005504772998392582
Image Index: 4
desktop computer 0.2356037199497223
screen 0.22198599576950073
monitor 0.2142537385225296
desk 0.09304556995630264
computer keyboard 0.050354622304439545
Image Index: 5
beer glass 0.3517400324344635
goblet 0.23220233619213104
red wine 0.11897163838148117
wine bottle 0.04255139082670212
water

KeyboardInterrupt: 

In [21]:
# Save the model's state_dict
torch.save(model.state_dict(), 'mobilenetv3_large.pth')