# Finetuning
**Race prediction**
- Reference: HuggingFace's finetuning tutorial

## Install/Import libraries

In [None]:
%%capture

! pip install transformers pytorch-lightning --quiet

In [None]:
import math
import pandas as pd
import ast
import os
from PIL import Image
from pathlib import Path
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
from torchmetrics import Accuracy
from torchvision.datasets import ImageFolder
from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch.nn as nn

In [None]:
# Uncomment the lines below when running on Google Colab

# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Paths used throughout the notebook; adjust them to your environment.

# CSV file with the image labels.
label_dir = './labels_for_more_training_data.csv'

# Directory that contains the image files.
data_dir = './drive/MyDrive/private_test/data'

# Saved weights of the skintone model.
skintone_model_dir = '/content/drive/MyDrive/skintone.pth'

In [None]:
# Load the label table and preview its first five rows.
df = pd.read_csv(label_dir)
df.head(5)

Unnamed: 0,file_name,height,width,image_id,bbox,skintone,age,race,emotion,gender,masked
0,10003832.jpg,2000,1459,1,"[584.1895944369563, 301.32785213219023, 265.74...",mid-light,20-30s,Mongoloid,Anger,Male,unmasked
1,10005259.jpg,1395,2000,2,"[1131.1132364709713, 312.5498771883628, 285.10...",light,20-30s,Mongoloid,Neutral,Male,unmasked
2,10005527.jpg,1507,2000,3,"[548.0171526364226, 265.9999999999995, 246.980...",mid-light,20-30s,Mongoloid,Sadness,Female,unmasked
3,100086002.jpg,1334,2000,4,"[900.5677208085174, 57.13482704531668, 163.848...",light,20-30s,Mongoloid,Neutral,Female,unmasked
4,100148503.jpg,1561,2000,5,"[862.5207825161339, 478.9999999999999, 210.264...",light,20-30s,Caucasian,Happiness,Female,unmasked


## Init Dataset and Split into Training and Validation Sets
- We create a custom dataset that loads each image, crops it to its bounding box (`bbox`) and resizes it. Each item is returned together with its label.
- Then we split the dataset into a training set and a validation set with an 85/15 ratio.

In [None]:
  # Creating a custom dataset class
class ImageDataset(Dataset):
    """Dataset of cropped, resized images paired with one attribute label.

    Each item is read from ``dir`` using the ``file_name`` column of the labels
    CSV, cropped to the row's ``bbox`` and resized to ``self.new_size``.

    Args:
        dir: Directory that contains the image files.
        labels_dir: Path of the labels CSV file. The class reads its
            ``file_name``, ``bbox`` and ``race`` columns plus ``target_attr``.
        target_attr: Name of the CSV column returned as the label.
        transform: Optional callable applied to the resized PIL image.
    """

    def __init__(self, dir, labels_dir, target_attr, transform=None):
        self.data_dir = dir
        self.target_attr = target_attr
        labels = pd.read_csv(labels_dir)
        # NOTE(review): 'Mongolid' does not match the 'Mongoloid' spelling used by
        # label2id in this notebook, so this filter most likely keeps every row.
        # Left untouched because correcting the spelling would drop a whole class
        # from training -- confirm the intent before changing it.
        self.labels = labels[labels['race'] != 'Mongolid']
        self.images = os.listdir(dir)  # not used by this class; kept for compatibility
        self.transform = transform
        self.new_size = (128, 128)  # size handed to Image.resize

    def __len__(self):
        """Return the number of label rows kept after filtering."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``.

        The ``bbox`` cell is parsed as a Python literal and read as
        ``[left, top, width, height]``.
        """
        row = self.labels.iloc[index]  # one positional lookup instead of three
        image_path = os.path.join(self.data_dir, row['file_name'])
        label = row[self.target_attr]

        bbox = ast.literal_eval(row['bbox'])
        left, top = bbox[0], bbox[1]
        crop_box = (left, top, left + bbox[2], top + bbox[3])

        # The context manager closes the file handle deterministically;
        # convert() returns an independent, fully loaded image.
        with Image.open(image_path) as source:
            image = source.convert('RGB')

        image = image.crop(crop_box).resize(self.new_size)

        # Applying the transform
        if self.transform:
            image = self.transform(image)

        return (image, label)


In [None]:
ds = ImageDataset(data_dir, label_dir, 'race')

# Random 85/15 train/validation split.
# A dedicated seeded generator keeps the split reproducible across kernel
# restarts; the global seed is only set later, right before training.
split_rng = torch.Generator().manual_seed(42)
indices = torch.randperm(len(ds), generator=split_rng).tolist()
n_val = math.floor(len(indices) * .15)
# Slice from the front. Slicing with a negated count (`indices[:-n_val]`) breaks
# when n_val == 0: the training slice becomes empty and the validation slice
# becomes the whole list.
n_train = len(indices) - n_val
train_ds = torch.utils.data.Subset(ds, indices[:n_train])
val_ds = torch.utils.data.Subset(ds, indices[n_train:])

## Preparing Labels for Our Model's Config

By adding `label2id` + `id2label` to our model's config, we'll get friendlier labels in the inference API.

In [None]:
# Label <-> id mappings for the skintone model used below.
# Ids are kept as strings; id2label is obtained by inverting label2id.
skintone_label2id = {
    'dark': '0',
    'light': '1',
    'mid-dark': '2',
    'mid-light': '3',
}
skintone_id2label = {idx: name for name, idx in skintone_label2id.items()}

In [None]:
# Label <-> id mappings for the race classifier.
# Ids are kept as strings; id2label is obtained by inverting label2id.
label2id = {
    'Caucasian': '0',
    'Mongoloid': '1',
    'Negroid': '2',
}
id2label = {idx: name for name, idx in label2id.items()}

## Image Classification Collator

To apply our transforms to images, we'll use a custom collator class. We'll initialize it using an instance of `ViTFeatureExtractor` and pass the collator instance to `torch.utils.data.DataLoader`'s `collate_fn` kwarg.

In [None]:
class ImageClassificationCollator:
    """Collate ``(image, label)`` samples into a single batch.

    The wrapped feature extractor is called on the list of images with
    ``return_tensors='pt'``; the labels are mapped to integer class ids through
    the notebook-level ``label2id`` dict.
    """

    def __init__(self, feature_extractor):
        self.feature_extractor = feature_extractor

    def __call__(self, batch):
        """Return the extractor's encodings with a ``labels`` tensor added."""
        images = [sample[0] for sample in batch]
        batch_encoding = self.feature_extractor(images, return_tensors='pt')

        class_ids = []
        for sample in batch:
            class_ids.append(int(label2id[sample[1]]))
        batch_encoding['labels'] = torch.tensor(class_ids, dtype=torch.long)

        return batch_encoding

## Init Feature Extractor, Model, Data Loaders
- The pretrained model we'll be using is `google/vit-base-patch16-224-in21k`
- We'll init **2** instances of the pretrained model:
  - A new instance
  - One that has already been fine-tuned to predict the **skintone** and is loaded from its saved weights (the two are combined in our model structure).
- We include the skintone model because there is a significant correlation between **skintone** and **race**.
- We'll also init `DataLoaders`.

In [None]:
# Name of the pretrained checkpoint shared by the feature extractor and both models.
pretrained_name = 'google/vit-base-patch16-224-in21k'

feature_extractor = ViTFeatureExtractor.from_pretrained(pretrained_name)

# Skintone model: same checkpoint with a head sized for the skintone labels,
# then overwritten with the fine-tuned weights stored at skintone_model_dir.
skintone_model = ViTForImageClassification.from_pretrained(
    pretrained_name,
    num_labels=len(skintone_label2id),
    label2id=skintone_label2id,
    id2label=skintone_id2label,
)
# map_location='cpu' lets the checkpoint load on machines without CUDA even if
# it was saved from a GPU; the Trainer moves the modules to the target device.
skintone_model.load_state_dict(torch.load(skintone_model_dir, map_location='cpu'))
# The skintone model only supplies features and is never updated: freeze it so
# no gradient bookkeeping is spent on it.
skintone_model.requires_grad_(False)
skintone_model.eval();

# Fresh model that will be fine-tuned for race prediction.
model = ViTForImageClassification.from_pretrained(
    pretrained_name,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

collator = ImageClassificationCollator(feature_extractor)
train_loader = DataLoader(train_ds, batch_size=8, collate_fn=collator, num_workers=2, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=8, collate_fn=collator, num_workers=2)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

We'll use [PyTorch Lightning](https://pytorchlightning.ai/) to fine-tune our model.

The classifier wraps the newly initialized pretrained model and the **skintone prediction** model. The outputs of the two models are ***concatenated*** and passed through a **Linear** layer to produce the final output.

The loss function is `CrossEntropyLoss`

In [None]:
class Classifier(pl.LightningModule):
    """Lightning module that fuses a trainable model with a skintone model.

    The logits of both models are concatenated and mapped by a linear layer to
    ``model.config.num_labels`` output logits.

    Args:
        skintone_model: Model whose ``.logits`` are used as extra input
            features; no gradient flows into it.
        model: Model being fine-tuned.
        lr: Learning rate handed to Adam.
        **kwargs: Extra hyperparameters; their names are passed to
            ``save_hyperparameters`` together with ``lr``.
    """

    def __init__(self, skintone_model, model, lr: float = 2e-5, **kwargs):
        super().__init__()
        self.save_hyperparameters('lr', *list(kwargs))

        self.model = model
        self.skintone_model = skintone_model
        n_classes = model.config.num_labels
        # Fusion layer: [skintone logits | model logits] -> class logits.
        self.output = nn.Linear(skintone_model.config.num_labels + n_classes, n_classes)
        self.val_acc = Accuracy(
            task='multiclass' if n_classes > 2 else 'binary',
            num_classes=n_classes,
        )

    def forward(self, **batch):
        """Return the fused class logits for a collated batch.

        A ``labels`` entry, if present, is not forwarded to the wrapped models:
        only their logits are used, so their internal loss is not needed.
        """
        inputs = {name: value for name, value in batch.items() if name != 'labels'}

        # The skintone branch only provides features; running it under no_grad
        # keeps it out of the autograd graph (replaces the former .detach()).
        with torch.no_grad():
            skintone = self.skintone_model(**inputs).logits
        outputs = self.model(**inputs).logits

        fused = torch.cat((skintone, outputs), dim=1)
        return self.output(fused)

    def _compute_loss(self, logits, labels):
        """Cross-entropy between raw logits and integer class labels.

        ``nn.CrossEntropyLoss`` applies log-softmax internally, so the logits
        must NOT be passed through softmax first; doing so flattens the loss
        and its gradients.
        """
        criterion = nn.CrossEntropyLoss()
        return criterion(logits, labels)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        logits = self(**batch)
        loss = self._compute_loss(logits, batch['labels'])

        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss and accuracy for one batch."""
        logits = self(**batch)
        labels = batch['labels']
        loss = self._compute_loss(logits, labels)
        self.log("val_loss", loss)
        acc = self.val_acc(logits.argmax(1), labels)
        self.log("val_acc", acc, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over this module's parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

If resuming training from a previous model, load it here.

In [None]:
# classifier = torch.load('/content/drive/MyDrive/race_model 2024-01-14 10_03_44.805772.pth')
# classifier.eval();

In [None]:
pl.seed_everything(42)
classifier = Classifier(skintone_model, model, lr=2e-5)
# '16-mixed' is the current spelling of 16-bit automatic mixed precision; the
# bare `16` is accepted only for historical reasons and triggers a warning.
trainer = pl.Trainer(accelerator='gpu', devices=1, precision='16-mixed', max_epochs=2)
trainer.fit(classifier, train_loader, val_loader)

INFO:lightning_fabric.utilities.seed:Seed set to 42
/usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:558: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type                      | Params
-------------------------------------------------------------
0 | model          | ViTForImageClassification | 85.8 M
1 | skintone_model | ViTForIm

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

## Save model

In [None]:
import datetime

# Colon-free timestamp: ':' is not a valid filename character on every
# filesystem, and the checkpoint name in the resume cell already uses '_'.
timestamp = datetime.datetime.today().strftime('%Y-%m-%d %H_%M_%S.%f')

# NOTE(review): this saves `model` (the fine-tuned ViT branch) only; the fusion
# layer `classifier.output` is not part of the file -- confirm that is intended.
torch.save(model, 'race_model ' + timestamp + '.pth')