# Efficient Fine-tuning of BLIP-2 on Food Captions with Hugging Face PEFT

#### Reference:
https://github.com/huggingface/notebooks/blob/main/peft/Fine_tune_BLIP2_on_an_image_captioning_dataset_PEFT.ipynb

https://github.com/huggingface/peft/blob/main/examples/int8_training/fine_tune_blip2_int8.py

https://discuss.huggingface.co/t/finetune-blip-on-customer-dataset-20893/28446/13

https://github.com/salesforce/LAVIS/blob/main/lavis/common/optims.py


## Setup

In [None]:
!pip install -q git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets

Note: At the time of our experiments, installing from the PEFT GitHub repository gave PEFT 0.7.0.dev0. We have since found that newer versions of PEFT may produce NaN losses during training. If that happens, run `!pip install peft==0.6.1` instead of installing PEFT directly from the GitHub repository.

## Load dataset

In [None]:
from datasets import load_dataset
# Load only the "train" split of the advancedcv/Food500Cap dataset.
dataset = load_dataset("advancedcv/Food500Cap",split="train")

In [None]:
# Show how many examples the loaded split contains.
len(dataset)

In [None]:
# Display the "image" field of the first example.
dataset[0]["image"]

In [None]:
# Display the "caption" field of the first example.
dataset[0]["caption"]

## Create PyTorch Dataset

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2

class ImageCaptioningDataset(Dataset):
    """Wrap an image-caption dataset so each item is ready for batching.

    Every item is a dict holding the processor's image outputs (with the
    leading batch dimension removed) plus the raw caption string under
    ``"caption"``. The caption is not tokenized here.

    Args:
        dataset: Indexable collection whose items expose ``"image"`` and
            ``"caption"`` entries.
        processor: Callable that turns an augmented image into model inputs;
            it is invoked with ``images=...`` and ``return_tensors="pt"``.
    """

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor
        # Build the augmentation pipeline once rather than on every item
        # access; the pipeline itself is identical for all items.
        self.transforms = v2.Compose([
            v2.RandomResizedCrop(size=(364, 364), scale=(0.75, 1.0), antialias=True),
            v2.RandomHorizontalFlip(p=0.5),
        ])

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processed image tensors and raw caption for ``idx``."""
        item = self.dataset[idx]
        image = self.transforms(item["image"])
        encoding = self.processor(images=image, padding=True, return_tensors="pt",
                                  do_resize=True, size=(364,364),do_normalize = True)
        # Remove only the leading batch dimension. A bare squeeze() would also
        # collapse any other size-1 axis (e.g. a single-channel image).
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["caption"] = item["caption"]
        return encoding

def collate_fn(batch):
    """Merge a list of dataset items into a single batch dict.

    Tensor fields are stacked along a new leading dimension. The "caption"
    field is tokenized with the module-level ``processor`` (padding enabled)
    and contributes "input_ids" and "attention_mask" to the result instead
    of the raw strings.
    """
    collated = {}
    for field in batch[0].keys():
        if field == "caption":
            captions = [sample["caption"] for sample in batch]
            tokenized = processor.tokenizer(
                captions, padding=True, return_tensors="pt"
            )
            collated["input_ids"] = tokenized["input_ids"]
            collated["attention_mask"] = tokenized["attention_mask"]
            continue
        # Every non-caption field is stacked across the samples.
        collated[field] = torch.stack([sample[field] for sample in batch])
    return collated

## Load Model

In [None]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration

# Load the processor and the model from the same pretrained checkpoint.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
# torch_dtype=torch.float16 requests the weights in half precision.
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b",torch_dtype = torch.float16)

## Convert loaded model to PEFT for LoRA fine-tuning

In [None]:
from peft import LoraConfig, get_peft_model

# Configuration for LoRA
config = LoraConfig(
    r=16,  # rank of the LoRA update matrices
    lora_alpha=32,  # LoRA scaling parameter
    lora_dropout=0.05,  # dropout applied inside the LoRA layers
    bias="none",
    # Names of the submodules that receive LoRA adapters.
    target_modules=["q_proj", "k_proj","v_proj","fc2","lm_head"]
)

# Wrap the loaded model with the LoRA configuration, then print how many
# parameters are trainable.
model = get_peft_model(model, config)
model.print_trainable_parameters()

## Fine-tuning

In [None]:
import random
import numpy as np
# Seed the NumPy, Python and PyTorch random number generators with the same
# fixed value for reproducibility.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

In [None]:
import math
# Define cosine annealing learning rate scheduler with linear warmup
class LinearWarmupCosineLRSchedulerWithRestart:
    """Learning-rate schedule: linear warmup, then cosine decay with restarts.

    During epoch 0 the rate is ramped per step from ``warmup_start_lr`` towards
    ``init_lr``. In every later epoch the rate follows a cosine curve between
    ``init_lr`` and ``min_lr``, evaluated at ``cur_epoch % restart_step`` so the
    curve starts over every ``restart_step`` epochs.

    Args:
        optimizer: Object whose ``param_groups`` receive the learning rate.
        max_epoch: Epoch count over which the cosine curve reaches ``min_lr``.
        min_lr: Lowest learning rate of the cosine curve.
        init_lr: Peak learning rate (end of warmup, start of each cosine cycle).
        restart_step: Number of epochs between cosine restarts.
        warmup_steps: Number of steps the linear warmup spans.
        warmup_start_lr: Rate at warmup step 0; a negative value means
            "use ``init_lr``".
    """

    def __init__(self, optimizer, max_epoch, min_lr, init_lr, restart_step, warmup_steps=0, warmup_start_lr=-1):
        # A start rate that is not >= 0 falls back to the peak rate.
        if not (warmup_start_lr >= 0):
            warmup_start_lr = init_lr

        self.optimizer = optimizer
        self.max_epoch = max_epoch
        self.restart_step = restart_step
        self.init_lr = init_lr
        self.min_lr = min_lr
        self.warmup_steps = warmup_steps
        self.warmup_start_lr = warmup_start_lr

    def step(self, cur_epoch, cur_step):
        """Set the optimizer's learning rate for the given epoch and step."""
        if cur_epoch != 0:
            # Past the first epoch: cosine decay, wrapped every restart_step epochs.
            cosine_lr_schedule(
                optimizer=self.optimizer,
                epoch=cur_epoch % self.restart_step,
                max_epoch=self.max_epoch,
                init_lr=self.init_lr,
                min_lr=self.min_lr,
            )
            return

        # First epoch: warmup (assumes the warmup fits inside one epoch).
        warmup_lr_schedule(
            optimizer=self.optimizer,
            step=cur_step,
            max_step=self.warmup_steps,
            init_lr=self.warmup_start_lr,
            max_lr=self.init_lr,
        )

def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
    """Set every param group's learning rate from a half-cosine curve.

    The rate equals ``init_lr`` at ``epoch == 0`` and reaches ``min_lr`` at
    ``epoch == max_epoch``.
    """
    half_range = (init_lr - min_lr) * 0.5
    cosine_term = 1.0 + math.cos(math.pi * epoch / max_epoch)
    new_lr = half_range * cosine_term + min_lr
    for group in optimizer.param_groups:
        group["lr"] = new_lr


def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
    """Set every param group's learning rate from a linear warmup ramp.

    The rate rises linearly with ``step`` from ``init_lr`` and is capped at
    ``max_lr``. A ``max_step`` below 1 is treated as 1 to avoid dividing by
    zero.
    """
    span = max(max_step, 1)
    ramped = init_lr + (max_lr - init_lr) * step / span
    new_lr = min(max_lr, ramped)
    for group in optimizer.param_groups:
        group["lr"] = new_lr

In [None]:
train_dataset = ImageCaptioningDataset(dataset, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16, collate_fn=collate_fn)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
# Linear warmup over the first 1000 steps of epoch 0, then a cosine schedule
# that restarts every 10 epochs.
scheduler = LinearWarmupCosineLRSchedulerWithRestart(optimizer,max_epoch = 10, min_lr = 0,
                                          init_lr = 3e-5,warmup_steps=1000,warmup_start_lr=1e-8,
                                                     restart_step=10)
device = "cuda" if torch.cuda.is_available() else "cpu"
num_epochs = 30

# Gradient accumulation to accommodate small GPU VRAM
gradient_accumulation_steps = 2

model.to(device)
for epoch in range(num_epochs):
  # Training phase
  model.train()
  train_loss = 0
  for idx, batch in enumerate(train_dataloader):
    # The scheduler writes the learning rate for this step into the optimizer.
    scheduler.step(cur_epoch = epoch, cur_step = idx)
    input_ids = batch.pop("input_ids").to(device)
    pixel_values = batch.pop("pixel_values").to(device)
    # NOTE(review): the batch's "attention_mask" is not passed to the model;
    # confirm this is intended.
    with torch.autocast(device_type='cuda', dtype=torch.float16):
      # The caption tokens are used both as text input and as labels.
      outputs = model(input_ids=input_ids,
                      pixel_values=pixel_values,
                      labels=input_ids)
    # Accumulate the unscaled loss so the logged running average is the true
    # per-batch loss, not the loss divided by the accumulation factor.
    train_loss += outputs.loss.item()
    # Scale the loss so gradients summed over the accumulation window are averaged.
    loss = outputs.loss / gradient_accumulation_steps
    loss.backward()
    # Update the weights once per accumulation window, and on the last batch
    # of the epoch so no accumulated gradients are left over.
    if ((idx + 1) % gradient_accumulation_steps == 0) or ((idx + 1)== len(train_dataloader)):
      optimizer.step()
      optimizer.zero_grad()
    if idx % 100 == 0:
      print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{idx + 1}/{len(train_dataloader)}],\
      Training Loss: {train_loss/(idx+1):.7f}, Learning rate:{optimizer.param_groups[0]['lr']:.7f}")
  torch.cuda.empty_cache()

In [None]:
# Check training results
# Switch to evaluation mode first: the training loop leaves the model in
# train mode, where dropout (including the LoRA dropout) stays active and
# makes the generated caption non-deterministic.
model.eval()
inputs = processor(images=dataset[0]['image'], return_tensors="pt").to(device, torch.float16)
pixel_values = inputs.pixel_values
generated_ids = model.generate(pixel_values=pixel_values, max_new_tokens=30)
# Decode the generated token ids back to text, dropping special tokens.
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_caption)

## Upload fine-tuned model to Hugging Face

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from within the notebook.
notebook_login()
# Push the fine-tuned model to the named Hub repository.
model.push_to_hub("advancedcv/blip2-opt-2.7b_Food500Cap_finetuned")