# **VGG16 & Imagenette**

In [None]:
import torch.optim as optim
import torch.nn as nn
import torchvision
import torchvision.transforms as T
from torch.utils.data import DataLoader
import torch
import random
import numpy as np
from matplotlib import pyplot as plt
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset
import os
from PIL import Image
import torchvision.transforms as T

## **Imagenette**
- 13394 RGB images
- 10 classes

In [None]:
# Fetch the Imagenette archive into ./data only when it has not been extracted
# yet. With download disabled the constructor raises if the data is missing,
# and some torchvision releases raise when asked to download into an already
# existing ./data/imagenette2 directory, so the flag is derived from the disk.
_ = torchvision.datasets.Imagenette(
    root='./data',
    split="train",
    download=not os.path.isdir(os.path.join('./data', 'imagenette2')),
)

# Class directory name (as found under each split folder) -> readable label.
LABELS_MAP = {
    "n01440764": "fish",
    "n02102040": "dog",
    "n02979186": "speaker",
    "n03000684": "electric saw",
    "n03028079": "church",
    "n03394916": "trumpet",
    "n03417042": "truck",
    "n03425413": "gas pump",
    "n03445777": "golf ball",
    "n03888257": "parachute"
}

# Class directory name -> integer target, in the insertion order of LABELS_MAP
# (e.g. {"n01440764": 0, "n02102040": 1, ...}).
label2target = {label: idx for idx, label in enumerate(LABELS_MAP)}

In [None]:
class ImageFolderDataset(Dataset):
  """Dataset over a ``<root_dir>/<split>/<class_id>/<image file>`` directory tree.

  Every sub-directory of the split folder is treated as one class; its name
  must be a key of ``labels_map``. Samples are indexed in sorted
  (class directory, file name) order so that indices are reproducible
  across runs and machines.

  Attributes:
    split_dir: path of the split folder that was scanned.
    images: image file paths, one per sample.
    targets: integer class index of each sample (position of the class id
      in ``labels_map``).
    labels: human-readable label of each sample (value from ``labels_map``).
    transform: optional callable applied to the PIL image in ``__getitem__``.
  """

  def __init__(
      self,
      root_dir: str, # "./data/imagenette2"
      split: str = "train",
      labels_map: dict = LABELS_MAP,
      transform: callable = None
  ) -> None:
    """Scan ``root_dir/split`` and record path, target and label per image.

    Args:
      root_dir: directory containing the split folders.
      split: name of the split folder, e.g. "train" or "val".
      labels_map: mapping class directory name -> readable label; its key
        order defines the integer targets.
      transform: optional callable applied to each loaded RGB image.

    Raises:
      KeyError: if a class directory is not a key of ``labels_map``.
    """
    self.split_dir = os.path.join(root_dir, split) # e.g. "./data/imagenette2/train"
    self.transform = transform

    # every label must be associated with a number,
    # e.g. { "n01440764": 0, "n02102040": 1}
    label2target = {label: idx for idx, label in enumerate(labels_map)}

    self.images = []
    self.targets = []
    self.labels = []
    # os.listdir returns entries in arbitrary order: sort for reproducibility.
    for class_id in sorted(os.listdir(self.split_dir)):
      class_dir_path = os.path.join(self.split_dir, class_id)
      if not os.path.isdir(class_dir_path):
        # stray files next to the class folders (e.g. .DS_Store) are not classes
        continue
      image_names = sorted(os.listdir(class_dir_path))
      self.images += [
          os.path.join(class_dir_path, name) for name in image_names
      ]
      self.targets += [label2target[class_id]] * len(image_names)
      self.labels += [labels_map[class_id]] * len(image_names)

  def __getitem__(self, index):
    """Return ``(image, target)`` for sample ``index``.

    The image is loaded from disk, converted to RGB and passed through
    ``self.transform`` when one was given.
    """
    # convert() returns a new image, so the file handle can be released here
    with Image.open(self.images[index]) as raw:
      image = raw.convert('RGB')
    if self.transform is not None:
      image = self.transform(image)

    return image, self.targets[index]

  def __len__(self):
    """Return the number of images found in the split folder."""
    return len(self.images)


In [None]:
# Mini-batch size used by the data loaders.
batch_size = 64

# Validation preprocessing: resize and tensor conversion only, no augmentation.
val_transform = T.Compose([
    T.Resize(size=(224, 224)),  # VGG input image size
    T.ToTensor(),
    # ImageNet mean/std normalisation is currently disabled:
    # T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Training preprocessing: same resize, plus random augmentation.
train_transform = T.Compose([
    T.Resize(size=(224, 224)),  # VGG input image size
    # with probability 0.5 apply blur followed by colour jitter
    T.RandomApply(
        [
            T.GaussianBlur(kernel_size=3, sigma=(0.5, 5)),
            T.ColorJitter(brightness=0.5, contrast=0.1, saturation=0.5, hue=0.1),
        ],
        p=.5,
    ),
    T.RandomHorizontalFlip(p=0.5),
    T.ToTensor(),
    # ImageNet mean/std normalisation is currently disabled:
    # T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [None]:
# Datasets for the two splits found under ./data/imagenette2, each with its
# own preprocessing pipeline.
train_dataset = ImageFolderDataset(
    "./data/imagenette2", "train", transform=train_transform
)
val_dataset = ImageFolderDataset(
    "./data/imagenette2", "val", transform=val_transform
)

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)

## **VGG16 from scratch**

In [None]:
class VGG16(nn.Module):
    """VGG16-style classifier with batch normalisation after each convolution.

    The network stacks 13 convolutional stages (``layer1`` .. ``layer13``),
    five of which end with a 2x2 max-pool, followed by three fully connected
    stages (``fc``, ``fc1``, ``fc2``).

    The five max-pools reduce the spatial size by a factor of 32. An adaptive
    average pool then brings the feature map to 7x7 so that the first linear
    layer always receives ``7*7*512`` features. For 224x224 inputs the
    feature map is already 7x7 and the pool is an identity; other input
    sizes (at least 32x32) are accepted as well. The pool has no parameters,
    so the set of parameter names is unaffected by it.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.layer1 = self._conv_block(3, 64)
        self.layer2 = self._conv_block(64, 64, pool=True)
        self.layer3 = self._conv_block(64, 128)
        self.layer4 = self._conv_block(128, 128, pool=True)
        self.layer5 = self._conv_block(128, 256)
        self.layer6 = self._conv_block(256, 256)
        self.layer7 = self._conv_block(256, 256, pool=True)
        self.layer8 = self._conv_block(256, 512)
        self.layer9 = self._conv_block(512, 512)
        self.layer10 = self._conv_block(512, 512, pool=True)
        self.layer11 = self._conv_block(512, 512)
        self.layer12 = self._conv_block(512, 512)
        self.layer13 = self._conv_block(512, 512, pool=True)

        # Fixes the spatial size expected by the classifier head below.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(7*7*512, 4096),
            nn.ReLU()
        )
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(4096, num_classes)
        )

        self.flatten = nn.Flatten()

    @staticmethod
    def _conv_block(in_channels, out_channels, pool=False):
        """Build one 3x3 conv -> batch norm -> ReLU stage, optionally max-pooled."""
        modules = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]
        if pool:
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        return nn.Sequential(*modules)

    def forward(self, x):
        """Return the ``(batch, num_classes)`` logits for an image batch ``x``.

        ``x`` is a float tensor of shape ``(batch, 3, H, W)``.
        """
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        x = self.layer8(x)
        x = self.layer9(x)
        x = self.layer10(x)
        x = self.layer11(x)
        x = self.layer12(x)
        x = self.layer13(x)
        x = self.avgpool(x)
        x = self.flatten(x)
        x = self.fc(x)
        x = self.fc1(x)
        logits = self.fc2(x)
        return logits

In [None]:
# One logit per dataset class (LABELS_MAP has one entry per class).
model = VGG16(num_classes=len(LABELS_MAP))
# check if everything is fine: push a dummy batch at the training resolution
# (224x224, as produced by the transforms) through the network.
x = torch.rand((2, 3, 224, 224))
model.eval()  # keep the random batch out of the batch-norm running statistics
with torch.no_grad():
    logits = model(x)
print(logits.shape)

torch.Size([2, 100])


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Move the model first so the optimizer is built on the parameters that are
# actually trained on `device`.
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [None]:
# Train the neural network
num_epochs = 5
log_every_n_iter = 100
for epoch in range(num_epochs):
  # ---- training pass ----
  model.train()
  running_loss = 0.0  # summed over the current logging window only
  print(f"Epoch: {epoch}")
  # each batch is an (images, targets) pair, as returned by the dataset
  for i, (x, targets) in enumerate(train_loader):
    x = x.to(device)
    targets = targets.to(device)
    # reset the gradients accumulated by the previous step
    optimizer.zero_grad()
    # forward pass
    logits = model(x)
    # loss computation
    loss = criterion(logits, targets)
    # backward pass
    loss.backward()
    # optimizer step --> update weights
    optimizer.step()
    running_loss += loss.item()
    if (i+1) % log_every_n_iter == 0:    # Print every log_every_n_iter mini-batches
        print(f"> iter [{i+1}/{len(train_loader)}] - train_loss={running_loss/log_every_n_iter:.4f}")
        running_loss = 0.0

  # ---- validation pass ----
  model.eval()
  val_loss_sum = 0.0
  total, correct = 0, 0
  with torch.no_grad():
    for x, targets in val_loader:
      x = x.to(device)
      targets = targets.to(device)
      logits = model(x)
      loss = criterion(logits, targets)
      n_samples = targets.size(0)
      # criterion returns the batch mean: weight it by the batch size so a
      # smaller final batch does not skew the reported average
      val_loss_sum += loss.item() * n_samples
      # index of the highest logit = predicted class
      preds = logits.argmax(dim=1)
      total += n_samples
      correct += (preds == targets).sum().item()

  print("val report:")
  # max(total, 1) avoids a ZeroDivisionError when the validation set is empty
  print(f"\t val_loss={val_loss_sum/max(total, 1):.4f} - val_accuracy={correct/max(total, 1):.4f}")

  print(" \n *************** \n")

print('Finished Training')

### Inference

In [None]:
# randrange excludes the upper bound, so idx is always a valid index;
# randint(0, len(val_dataset)) could return len(val_dataset) and raise IndexError.
idx = random.randrange(len(val_dataset))
img, target = val_dataset[idx]
image_path = val_dataset.images[idx]
label = val_dataset.labels[idx]
print(img.shape)
# the model expects a batch: add a leading dimension, (C, H, W) -> (B, C, H, W)
x = img.unsqueeze(0)
print(x.shape)

In [None]:
import torch.nn.functional as F

# Switch to evaluation mode, then run a gradient-free forward pass.
model.eval()
with torch.no_grad():
  logits = model(x.to(device))
  # softmax over the class dimension turns the logits into probabilities
  probs = F.softmax(logits, dim=1)
  print(probs.data)
  # highest probability and the class index where it occurs
  pred_prob, pred_class = probs.data.max(dim=1)
  print(pred_class, pred_prob)

tensor([[3.0209e-05, 9.6808e-01, 2.5137e-03, 6.9704e-03, 8.4398e-04, 1.5312e-03,
         1.1032e-03, 1.0808e-02, 3.0216e-03, 5.0998e-03]], device='cuda:0')
tensor([1], device='cuda:0') tensor([0.9681], device='cuda:0')


In [None]:
# Reload the file without the resize/tensor transform for display. Convert to
# RGB, as the dataset does, so single-channel files are not drawn by
# plt.imshow with a false-colour colormap.
image = Image.open(image_path).convert('RGB')

(28, 28, 1)


In [None]:
# Map the predicted class index back to its LABELS_MAP key, then to its name.
pred_idx = pred_class[0].item()
pred_label_key = list(LABELS_MAP)[pred_idx]
pred_label_name = LABELS_MAP[pred_label_key]

# `label` holds the ground-truth name read from val_dataset.labels for this sample.
print(f"Predicted class is {pred_label_name} with prob={pred_prob[0]:.4f} - Ground Truth class is {label}")
plt.imshow(image)