## Import packages

In [None]:
!pip install datasets --user

In [1]:
from datasets import load_dataset
from transformers import AutoFeatureExtractor
from functools import partial

import numpy as np
import torch
import torch.nn as nn

import timm.models.vision_transformer
import os

from tqdm import tqdm
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from timm.models.vision_transformer import PatchEmbed, Block

## Food101 Dataset 

original train set: 75,750

original validation set: 25,250

total: 101,000

In [2]:
food101 = load_dataset('food101')

In [3]:
food101['train'][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512>,
 'label': 6}

In [4]:
from tqdm import tqdm
import numpy as np

total_r = 0
total_g = 0
total_b = 0
total_pixels = 0

# Variables for standard deviation
sum_squared_r = 0
sum_squared_g = 0
sum_squared_b = 0

indices_to_drop = []

for i in tqdm(range(len(food101['train']))):
    img = np.array(food101['train'][i]['image'].resize((224, 224))) / 255.
    
    if img.shape == (224, 224, 3):
        total_r += img[:, :, 0].sum()
        total_g += img[:, :, 1].sum()
        total_b += img[:, :, 2].sum()
        
        # For standard deviation
        sum_squared_r += np.sum(np.square(img[:, :, 0]))
        sum_squared_g += np.sum(np.square(img[:, :, 1]))
        sum_squared_b += np.sum(np.square(img[:, :, 2]))

        total_pixels += 224 * 224
    else:
        indices_to_drop.append(i)

mean_r = total_r / total_pixels
mean_g = total_g / total_pixels
mean_b = total_b / total_pixels

# Compute std for each channel
std_r = np.sqrt((sum_squared_r / total_pixels) - (mean_r ** 2))
std_g = np.sqrt((sum_squared_g / total_pixels) - (mean_g ** 2))
std_b = np.sqrt((sum_squared_b / total_pixels) - (mean_b ** 2))

print(f"Mean RGB: {mean_r}, {mean_g}, {mean_b}")
print(f"Std RGB: {std_r}, {std_g}, {std_b}")

100%|██████████| 75750/75750 [05:05<00:00, 248.34it/s]

Mean RGB: 0.5449871888617703, 0.4434935563380693, 0.34361316599832514
Std RGB: 0.27093838406970966, 0.2734508551865403, 0.2780531622290323





In [5]:
len(indices_to_drop)

3

In [6]:
food101_mean = np.array([mean_r, mean_g, mean_b])
food101_std = np.array([std_r, std_g, std_b])

In [7]:
def transform(example_batch):
    """
    reshape the images into 224 * 224
    """
    inputs = {}
    
    pixel_values = []
    labels = []
    for i in range(len(example_batch['image'])):
        x = example_batch['image'][i]
        y = example_batch['label'][i]
        if np.array(x.resize((224, 224))).shape == (224, 224, 3):
            pixel_values.append(torch.tensor(((np.array(x.resize((224, 224))) / 255. - food101_mean) / food101_std), dtype = torch.float).permute(2, 0, 1))
            labels.append(y)

    inputs['pixel_values'] = pixel_values
    inputs['label'] = labels
    return inputs

In [8]:
np.array(food101['train'][0]['image'].resize((224, 224))).shape

(224, 224, 3)

In [9]:
processed_food101 = food101.with_transform(transform)

In [10]:
train_dataset = processed_food101['train']

In [11]:
indices_to_choose = list(set(range(len(train_dataset))) - set(indices_to_drop))
filtered_train_dataset = train_dataset.select(indices_to_choose)

In [12]:
validation_dataset = processed_food101['validation']

In [13]:
indices_to_choose = list(set(range(len(validation_dataset))) - set(indices_to_drop))
filtered_valid_dataset = validation_dataset.select(indices_to_choose)

## ResNet-50

In [14]:
class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes * self.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes * self.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * self.expansion)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out

In [15]:
class ResNet50(nn.Module):
    def __init__(self, block, num_blocks, num_classes=101):
        super(ResNet50, self).__init__()
        self.in_planes = 128

        self.conv1 = nn.Conv2d(3, 128, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(128)

        self.layer1 = self._make_layer(block, 128, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 256, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 512, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 1024, num_blocks[3], stride=2)

        self.linear = nn.Linear(1024 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        # Global average pooling
        out = F.avg_pool2d(out, kernel_size=out.size()[2:])
        
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return F.log_softmax(out, dim=1)

## Fine-tune

In [16]:
from torch.optim import Adam, AdamW
from torch.utils.data import DataLoader
import torchvision
import numpy as np
from tqdm import tqdm
import torchvision.transforms as T
import matplotlib.pyplot as plt
from PIL import Image
from torchvision.transforms import ToTensor, Compose, Normalize

In [17]:
def finetune_epoch(clf,
                   train_dataset,
                   batch_size=128,
                   lr=5e-5,
                   device="cuda:0"):
    clf.train()
    loader = DataLoader(train_dataset, batch_size, drop_last=True, shuffle = True)
    
    params_to_update = []
    for name, param in clf.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)

    optimizer = torch.optim.AdamW(params_to_update, lr=lr)
    criterion = nn.CrossEntropyLoss().to(device)

    loss_list = []
    avg_loss = 0
    for batch in tqdm(loader, desc="Train"):
        # Train loop.
        optimizer.zero_grad()
        cls_logit = clf(batch['pixel_values'].to(device))
        
        loss_cls = criterion(cls_logit, batch['label'].to(device).squeeze())
        
        loss = loss_cls

        loss.backward()
        
        # Have gradients at this point.
        nn.utils.clip_grad_norm_(clf.parameters(), max_norm=5.0, norm_type=2)
        optimizer.step()
        
        avg_loss += loss.item()
        
    return loss_list

In [18]:
@torch.no_grad()
def evaluate(clf, eval_dataset, batch_size):
    clf.eval()
    loader = DataLoader(eval_dataset, batch_size=batch_size, drop_last=True)

    n_right_classes = 0
    n_total = 0

    for batch in tqdm(loader, desc="Eval"):
        # Compute accuracy.
        cls_logit = clf(batch['pixel_values'].to(device))
        
        pred = cls_logit.argmax(dim=1)
        
        n_right_classes_batch = sum(pred == batch['label'].to(device)).item()
        
        n_right_classes += n_right_classes_batch
        
        n_total += pred.numel()

    print("  Acc_cls:", n_right_classes / n_total)

    return n_right_classes / n_total

In [19]:
def finetune(clf, train_dataset, test_dataset, n_epochs: int = 1, model_name=None, **args):
    print("Using device:", args["device"])
    train = train_dataset

    valid = test_dataset
    loss = []
    acc = []
    for epoch in range(n_epochs):
        print(f"Starting epoch {epoch+1}...")
        loss_list = finetune_epoch(clf, train, **args)
        loss += loss_list

        # Save the final checkpoints of the model
        if model_name is not None:
            torch.save(clf, model_path + model_name + 'epoch_' + str(epoch+1) + '.pt')

        acc_i = evaluate(clf, valid, 32)
        acc.append(acc_i)
    
    return loss, acc

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 20
model = ResNet50(Bottleneck, [3, 4, 6, 3]).to(device)
classifier = model
classifier.to(device)
optimizer = AdamW(model.parameters(), lr = 5e-5, betas=(0.9, 0.95), weight_decay=0.05)

loss = finetune(classifier, filtered_train_dataset, filtered_valid_dataset, EPOCHS, batch_size=16, device=device)

Using device: cuda
Starting epoch 1...


Train: 100%|██████████| 4734/4734 [1:02:45<00:00,  1.26it/s]
Eval: 100%|██████████| 789/789 [09:29<00:00,  1.39it/s]


  Acc_cls: 0.1806479721166033
Starting epoch 2...


Train: 100%|██████████| 4734/4734 [57:42<00:00,  1.37it/s] 
Eval: 100%|██████████| 789/789 [08:00<00:00,  1.64it/s]


  Acc_cls: 0.3320659062103929
Starting epoch 3...


Train: 100%|██████████| 4734/4734 [57:38<00:00,  1.37it/s] 
Eval: 100%|██████████| 789/789 [07:17<00:00,  1.81it/s]


  Acc_cls: 0.37828738910012677
Starting epoch 4...


Train: 100%|██████████| 4734/4734 [53:03<00:00,  1.49it/s] 
Eval: 100%|██████████| 789/789 [07:21<00:00,  1.79it/s]


  Acc_cls: 0.49081115335868186
Starting epoch 5...


Train: 100%|██████████| 4734/4734 [52:59<00:00,  1.49it/s]
Eval: 100%|██████████| 789/789 [07:43<00:00,  1.70it/s]


  Acc_cls: 0.5135852344740177
Starting epoch 6...


Train: 100%|██████████| 4734/4734 [52:50<00:00,  1.49it/s]
Eval: 100%|██████████| 789/789 [07:17<00:00,  1.81it/s]


  Acc_cls: 0.5878089353612167
Starting epoch 7...


Train: 100%|██████████| 4734/4734 [53:25<00:00,  1.48it/s]
Eval: 100%|██████████| 789/789 [07:37<00:00,  1.72it/s]


  Acc_cls: 0.5878485424588086
Starting epoch 8...


Train: 100%|██████████| 4734/4734 [1:01:33<00:00,  1.28it/s]
Eval: 100%|██████████| 789/789 [09:58<00:00,  1.32it/s]


  Acc_cls: 0.597671102661597
Starting epoch 9...


Train: 100%|██████████| 4734/4734 [1:06:38<00:00,  1.18it/s]
Eval: 100%|██████████| 789/789 [11:01<00:00,  1.19it/s]


  Acc_cls: 0.6082461977186312
Starting epoch 10...


Train: 100%|██████████| 4734/4734 [1:01:25<00:00,  1.28it/s]
Eval: 100%|██████████| 789/789 [09:20<00:00,  1.41it/s]


  Acc_cls: 0.641318124207858
Starting epoch 11...


Train: 100%|██████████| 4734/4734 [1:04:16<00:00,  1.23it/s]
Eval: 100%|██████████| 789/789 [09:36<00:00,  1.37it/s]


  Acc_cls: 0.6550617870722434
Starting epoch 12...


Train: 100%|██████████| 4734/4734 [57:22<00:00,  1.38it/s] 
Eval: 100%|██████████| 789/789 [09:40<00:00,  1.36it/s]


  Acc_cls: 0.6824302915082383
Starting epoch 13...


Train: 100%|██████████| 4734/4734 [1:01:30<00:00,  1.28it/s]
Eval: 100%|██████████| 789/789 [09:10<00:00,  1.43it/s]


  Acc_cls: 0.6826679340937896
Starting epoch 14...


Train: 100%|██████████| 4734/4734 [1:02:42<00:00,  1.26it/s]
Eval: 100%|██████████| 789/789 [10:33<00:00,  1.25it/s]


  Acc_cls: 0.6680529150823827
Starting epoch 15...


Train: 100%|██████████| 4734/4734 [1:02:16<00:00,  1.27it/s]
Eval: 100%|██████████| 789/789 [10:21<00:00,  1.27it/s]


  Acc_cls: 0.653041825095057
Starting epoch 16...


Train:  20%|█▉        | 932/4734 [11:58<43:17,  1.46it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Train: 100%|██████████| 4734/4734 [1:00:50<00:00,  1.30it/s]
Eval: 100%|██████████| 789/789 [10:39<00:00,  1.23it/s]


  Acc_cls: 0.680410329531052
Starting epoch 19...


Train:  39%|███▉      | 1864/4734 [24:43<41:37,  1.15it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [21]:
loss[1]

[0.1806479721166033,
 0.3320659062103929,
 0.37828738910012677,
 0.49081115335868186,
 0.5135852344740177,
 0.5878089353612167,
 0.5878485424588086,
 0.597671102661597,
 0.6082461977186312,
 0.641318124207858,
 0.6550617870722434,
 0.6824302915082383,
 0.6826679340937896,
 0.6680529150823827,
 0.653041825095057,
 0.6863513941698353,
 0.7082145120405576,
 0.680410329531052,
 0.6879356780735107,
 0.6656764892268695]