In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained BLIP model and processor
loc = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(loc)
model = BlipForConditionalGeneration.from_pretrained(loc)
model = model.to(device)

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [2]:
# Freezing the image encoder and unfreezing the decorder for training
for name, param in model.named_parameters():
    if "vision_model" in name:  
        param.requires_grad = False
    else:
        param.requires_grad = True

In [3]:
# Creating a Dataset class 
class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.processor(
            images=item["image"], 
            text=item["text"], 
            padding="max_length", 
            return_tensors="pt"
        )
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        return encoding

In [4]:
from datasets import load_dataset
ds = load_dataset("parquet", data_files='/kaggle/input/luna-facad-custom/train_dataset-LUNA-FACAD.parquet', split='train')

# Creating the dataset and dataloader instances
train_dataset = ImageCaptioningDataset(ds, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Optimizer for fine-tuning 
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Training loop
model.train()
for epoch in range(2):
    print(f"Epoch: {epoch}")
    for idx, batch in enumerate(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss

        print(f"Loss at step {idx}: {loss.item()}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Saving the fine-tuned model
model.save_pretrained("path_to_save_finetuned_model")
processor.save_pretrained("path_to_save_finetuned_model")

Epoch: 0


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Loss at step 0: 12.893091201782227
Loss at step 1: 10.13398551940918
Loss at step 2: 10.159795761108398
Loss at step 3: 10.10242748260498
Loss at step 4: 10.073626518249512
Loss at step 5: 10.0761137008667
Loss at step 6: 10.115594863891602
Loss at step 7: 10.036064147949219
Loss at step 8: 9.9949369430542
Loss at step 9: 9.992846488952637
Loss at step 10: 9.966035842895508
Loss at step 11: 9.92912769317627
Loss at step 12: 9.60749626159668
Loss at step 13: 9.111699104309082
Loss at step 14: 8.657183647155762
Loss at step 15: 8.353034973144531
Loss at step 16: 7.991502285003662
Loss at step 17: 7.556130409240723
Loss at step 18: 7.180778980255127
Loss at step 19: 6.840671539306641
Loss at step 20: 6.534666061401367
Loss at step 21: 6.251461982727051
Loss at step 22: 5.993522644042969
Loss at step 23: 5.713780403137207
Loss at step 24: 5.463138103485107
Loss at step 25: 5.237843990325928
Loss at step 26: 4.939887523651123
Loss at step 27: 4.674023151397705
Loss at step 28: 4.43981790542

[]

In [6]:
# Logging into HuggingFace
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
# Creating a HuggingFace repository for my model
from huggingface_hub import create_repo

repo_id = "sagniksengupta/blip-finetuned-facad-v2"
create_repo(repo_id)

RepoUrl('https://huggingface.co/sagniksengupta/blip-finetuned-facad-v2', endpoint='https://huggingface.co', repo_type='model', repo_id='sagniksengupta/blip-finetuned-facad-v2')

In [8]:
from transformers import BlipForConditionalGeneration, BlipProcessor

# Loading my locally saved model and processor
model = BlipForConditionalGeneration.from_pretrained("path_to_save_finetuned_model")
processor = BlipProcessor.from_pretrained("path_to_save_finetuned_model")

# Pushing the model to the HuggingFace Hub
model.push_to_hub(repo_id)
processor.push_to_hub(repo_id)

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sagniksengupta/blip-finetuned-facad-v2/commit/551b506bf0838c16e85fd7eff0d13621fec28c93', commit_message='Upload processor', commit_description='', oid='551b506bf0838c16e85fd7eff0d13621fec28c93', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sagniksengupta/blip-finetuned-facad-v2', endpoint='https://huggingface.co', repo_type='model', repo_id='sagniksengupta/blip-finetuned-facad-v2'), pr_revision=None, pr_num=None)