### Vision Transformer, Swin Transformer, ConvNeXt (basic usage with the Hugging Face library)

In [None]:

## Install the required packages
#%pip install --upgrade transformers
#%pip install datasets

### Task 1: Inference with a pre-trained ViT (1000 ImageNet classes)
- Load your own image
- Run the model in inference mode. Try different models, namely ViT, Swin, Swinv2, and ConvNeXt

In [None]:
from transformers import ViTForImageClassification
from transformers import ViTImageProcessor
import torch

## Choose the compute device: use the GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Load the model from the given checkpoint.
## The checkpoint is the model name on the Hugging Face model hub:
## https://huggingface.co/models
## TODO: Experiment with different models. To use Swin, you will need to import SwinForImageClassification and so on
model_checkpoint = "google/vit-base-patch16-224"
model = ViTForImageClassification.from_pretrained(model_checkpoint).to(device)

from PIL import Image
## TODO: Load the image (replace None with your own image, e.g. one opened with PIL's Image.open)
image = None


## Preprocess the image with the processor that belongs to the checkpoint,
## then move the resulting tensors to the same device as the model
processor = ViTImageProcessor.from_pretrained(model_checkpoint)
inputs = processor(images=image, return_tensors="pt")
inputs = inputs.to(device)
pixel_values = inputs["pixel_values"]
print(pixel_values.shape)
     
import torch

## Forward pass; gradient tracking is switched off because we only run inference
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)
logits = outputs.logits
logits.shape


## The index of the largest logit is the predicted class; map it to its label name
prediction = torch.argmax(logits, dim=-1)
print("Predicted class:", model.config.id2label[prediction.item()])


### Task 2: Fine-tuning the models using HuggingFace library

In [None]:
from datasets import load_dataset 
# Load CIFAR-10 (only a small portion, for demonstration purposes)
train_ds, test_ds = load_dataset('cifar10', split=['train[:5000]', 'test[:2000]'])
# Split the training subset into training (90%) and validation (10%).
# A fixed seed keeps the split identical across kernel restarts, so that
# validation results of different runs/models are comparable.
splits = train_ds.train_test_split(test_size=0.1, seed=42)
train_ds = splits['train']
val_ds = splits['test']

# Class index <-> class name mappings, later passed to the models.
# `idx` is used instead of `id` to avoid shadowing the Python built-in.
id2label = dict(enumerate(train_ds.features['label'].names))
label2id = {label: idx for idx, label in id2label.items()}
print(id2label)

In [None]:
## Define the transforms

from torchvision.transforms import (CenterCrop, 
                                    Compose, 
                                    Normalize, 
                                    RandomHorizontalFlip,
                                    RandomResizedCrop, 
                                    Resize, 
                                    ToTensor)

# Reuse the normalisation statistics and the input resolution of the image processor
image_mean = processor.image_mean
image_std = processor.image_std
size = processor.size["height"]

normalize = Normalize(mean=image_mean, std=image_std)

# Training pipeline: random crop and horizontal flip, then tensor conversion and normalisation
_train_transforms = Compose([
    RandomResizedCrop(size),
    RandomHorizontalFlip(),
    ToTensor(),
    normalize,
])

# Validation / test pipeline: resize and centre crop, then tensor conversion and normalisation
_val_transforms = Compose([
    Resize(size),
    CenterCrop(size),
    ToTensor(),
    normalize,
])

def train_transforms(examples):
    """Apply the training transforms to a batch of examples.

    Every image in ``examples['img']`` is converted to RGB and passed through
    ``_train_transforms``; the results are stored under
    ``examples['pixel_values']``. The batch is modified in place and returned.
    """
    transformed = []
    for picture in examples['img']:
        transformed.append(_train_transforms(picture.convert("RGB")))
    examples['pixel_values'] = transformed
    return examples

def val_transforms(examples):
    """Apply the validation transforms to a batch of examples.

    Every image in ``examples['img']`` is converted to RGB and passed through
    ``_val_transforms``; the results are stored under
    ``examples['pixel_values']``. The batch is modified in place and returned.
    """
    transformed = []
    for picture in examples['img']:
        transformed.append(_val_transforms(picture.convert("RGB")))
    examples['pixel_values'] = transformed
    return examples
     

# Attach the transforms: the training pipeline for the training split,
# the validation pipeline for both the validation and the test split
for _dataset, _transform in (
    (train_ds, train_transforms),
    (val_ds, val_transforms),
    (test_ds, val_transforms),
):
    _dataset.set_transform(_transform)


from torch.utils.data import DataLoader
import torch

def collate_fn(examples):
    """Combine a list of examples into one batch.

    Args:
        examples: sequence of dicts, each with a ``"pixel_values"`` tensor and
            a ``"label"`` value.

    Returns:
        dict with ``"pixel_values"`` (the per-example tensors stacked along a
        new first dimension) and ``"labels"`` (a tensor built from the labels).
    """
    images, targets = [], []
    for sample in examples:
        images.append(sample["pixel_values"])
        targets.append(sample["label"])
    return {
        "pixel_values": torch.stack(images),
        "labels": torch.tensor(targets),
    }

# Batches for the manual PyTorch training loop. shuffle=True reshuffles the
# training samples at every epoch so the model does not see the batches in the
# same fixed order each time.
train_dataloader = DataLoader(train_ds, collate_fn=collate_fn, batch_size=4, shuffle=True)


In [None]:

## TODO: Experiment with different models. To use Swin, you will need to import SwinForImageClassification and set ignore_mismatched_sizes=True to adjust the last layer
## Use Huggingface hub to find the model names: https://huggingface.co/models
## Most likely you will have to use the tiny versions only and relatively small batch sizes
## You can also use AutoModelForImageClassification to automatically load the correct model
from transformers import ViTForImageClassification, SwinForImageClassification, Swinv2ForImageClassification, ConvNextForImageClassification

# Checkpoints to fine-tune
model_checkpoint_swin = "microsoft/swin-tiny-patch4-window7-224"
model_checkpoint_vit = "google/vit-base-patch16-224-in21k"

# Label mappings of the fine-tuning dataset, shared by all models below
_label_maps = {"id2label": id2label, "label2id": label2id}

model_vit = ViTForImageClassification.from_pretrained(model_checkpoint_vit, **_label_maps)

# ignore_mismatched_sizes=True lets the last layer be adjusted to the new labels
model_swin = SwinForImageClassification.from_pretrained(
    model_checkpoint_swin,
    ignore_mismatched_sizes=True,
    **_label_maps,
)

model_swinv2 = Swinv2ForImageClassification.from_pretrained(
    "microsoft/swinv2-tiny-patch4-window8-256",
    ignore_mismatched_sizes=True,
    **_label_maps,
)


In [None]:
from transformers import TrainingArguments, Trainer

import inspect

metric_name = "accuracy"

# Name the output directory after the checkpoint that is being fine-tuned
model_checkpoint = model_checkpoint_vit
model_name = model_checkpoint.split("/")[-1]

# Newer transformers releases call this option `eval_strategy`; older ones only
# accept `evaluation_strategy`. Pick whichever keyword the installed version
# knows, so the cell runs both on older installs and after `pip install --upgrade`.
_training_arg_names = inspect.signature(TrainingArguments).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

args = TrainingArguments(
    f"{model_name}-finetuned-cifar-10", #output directory
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    num_train_epochs=1,                 # for demonstration purposes, adjust as needed
    weight_decay=0.01,
    load_best_model_at_end=True,        # evaluation and save strategy are both "epoch"
    metric_for_best_model=metric_name,
    logging_dir='logs',
    report_to='tensorboard',
    remove_unused_columns=False,        # keep the 'img' column that the transforms read
    **{_eval_strategy_key: "epoch"},
)
     
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute the classification accuracy for the Trainer.

    Args:
        eval_pred: pair ``(predictions, labels)``; the predicted class of each
            sample is the argmax of ``predictions`` along axis 1.

    Returns:
        dict with a single key ``"accuracy"``.
    """
    predictions, labels = eval_pred
    predicted_classes = np.argmax(predictions, axis=1)
    # scikit-learn expects accuracy_score(y_true, y_pred): ground truth first
    return dict(accuracy=accuracy_score(labels, predicted_classes))


import torch

import inspect

# Newer transformers releases take the image processor through the
# `processing_class` argument; older ones only know `tokenizer`. Use whichever
# keyword the installed Trainer accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_processor_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model_vit,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    **{_processor_key: processor},
)
train_results = trainer.train()

# Log and persist the training metrics and the trainer state
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

# Evaluate on the validation split, then log and persist those metrics too
metrics = trainer.evaluate(val_ds)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
## Use your fine-tuned model to predict
## TODO: Load the image and the preprocessor
from transformers import ViTImageProcessor

# Convert to RGB, as in the training/validation transforms
img = Image.open("path/to/your/image.jpg").convert("RGB")
# NOTE(review): replace the placeholder with the directory (or hub id) that holds the
# saved preprocessor config, e.g. vit-swin-test-cifar-10/checkpoint-4500
processor = ViTImageProcessor.from_pretrained ("path/to/your/Preprocessor.json")

# Predict with the model that was just fine-tuned by the Trainer. The global
# `model` is still the ImageNet classifier from Task 1, whose class indices do
# not match the `id2label` of the fine-tuning dataset.
finetuned_model = trainer.model
finetuned_model.eval()

# Put the inputs on the device the fine-tuned model lives on
inputs = processor(images=img, return_tensors="pt").to(finetuned_model.device)
pixel_values = inputs.pixel_values

# Inference only: no gradients needed
with torch.no_grad():
    outputs = finetuned_model(**inputs)
print (outputs)
predicted_class_idx = torch.argmax(outputs.logits[0]).item()
print (predicted_class_idx)
print (id2label[predicted_class_idx])

### Another example: fine-tuning a Hugging Face model with a standard PyTorch training loop
### Task 3: Create the validation set and change the training loop so that it computes the training and validation losses and accuracies after each epoch. Experiment with the fine-tuning options of your model.

In [None]:



from transformers import AutoModelForImageClassification

# Load ConvNeXt-tiny with the label mappings of the fine-tuning dataset;
# ignore_mismatched_sizes=True allows layers whose sizes differ from the checkpoint
convnext_checkpoint = "facebook/convnext-tiny-224"
model = AutoModelForImageClassification.from_pretrained(
    convnext_checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)


from tqdm.notebook import tqdm
import torch


###########################################
## TODO: Experiment with fine-tuning the model: 
## 1. Freeze all layers except the classifier
## 2. Unfreeze the last few layers
## 3. Unfreeze all layers
###########################################
# Freeze all layers except the classifier
print (model)
for param in model.parameters():
    param.requires_grad = False
for param in model.classifier.parameters():
    param.requires_grad = True

# Report what is trainable: the names of the trainable tensors plus a count
# summary, instead of one unlabeled True/False line per parameter tensor
for name, param in model.named_parameters():
    if param.requires_grad:
        print("trainable:", name)
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {n_trainable:,} / {n_total:,}")



# Optimise only the parameters that are still trainable
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.AdamW(params, lr=5e-5)

# Move the model to the GPU when one is available, otherwise keep it on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)



model.train()
for epoch in range(10):
    # Running statistics for the current epoch
    correct, total, loss_in_epoch = 0, 0, 0
    for idx, batch in enumerate(tqdm(train_dataloader)):
        # Move every tensor of the batch to the training device
        batch = {key: tensor.to(device) for key, tensor in batch.items()}

        optimizer.zero_grad()

        # Forward pass; the batch holds exactly "pixel_values" and "labels"
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Update the running metrics
        predicted = logits.argmax(-1)
        total += batch["labels"].shape[0]
        correct += (predicted == batch["labels"]).sum().item()
        accuracy = correct / total
        loss_in_epoch += loss.item()

        # Periodic progress report
        if idx % 100 == 0:
            print(f"Loss after {idx} steps:", loss.item())
            print(f"Accuracy after {idx} steps:", accuracy)
    # Mean batch loss over the epoch
    print(f"Loss in epoch {epoch}:", loss_in_epoch / len(train_dataloader))
