In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForCausalLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Load pre-trained GIT model and processor
processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
model = model.to(device)

# Creating a Dataset class 
class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.processor(
            images=item["image"], 
            text=item["text"], 
            padding="max_length", 
            return_tensors="pt"
        )
        encoding = {k: v.squeeze() for k, v in encoding.items()}
        return encoding

preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [3]:
from datasets import load_dataset
ds = load_dataset("parquet", data_files='/kaggle/input/luna-facad-custom/train_dataset-LUNA-FACAD.parquet', split='train')

# Creating the dataset and dataloader instances
train_dataset = ImageCaptioningDataset(ds, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)

# Optimizer for fine-tuning 
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Optimizer for fine-tuning 
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Training loop
model.train()
for epoch in range(2):
    print(f"Epoch: {epoch}")
    for idx, batch in enumerate(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss

        print(f"Loss at step {idx}: {loss.item()}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Saving the fine-tuned model
model.save_pretrained("path_to_save_finetuned_model")
processor.save_pretrained("path_to_save_finetuned_model")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch: 0
Loss at step 0: 11.87518310546875
Loss at step 1: 10.422612190246582
Loss at step 2: 9.83010482788086
Loss at step 3: 9.62332820892334
Loss at step 4: 9.204504013061523
Loss at step 5: 8.874359130859375
Loss at step 6: 8.58261775970459
Loss at step 7: 8.393952369689941
Loss at step 8: 8.177132606506348
Loss at step 9: 7.965576171875
Loss at step 10: 7.771745204925537
Loss at step 11: 7.5962748527526855
Loss at step 12: 7.408730983734131
Loss at step 13: 7.223132133483887
Loss at step 14: 7.037720203399658
Loss at step 15: 6.8451433181762695
Loss at step 16: 6.651550769805908
Loss at step 17: 6.468019485473633
Loss at step 18: 6.297552108764648
Loss at step 19: 6.092267990112305
Loss at step 20: 5.890442848205566
Loss at step 21: 5.700406551361084
Loss at step 22: 5.499206066131592
Loss at step 23: 5.301516056060791
Loss at step 24: 5.081824779510498
Loss at step 25: 4.875389099121094
Loss at step 26: 4.70508337020874
Loss at step 27: 4.497386455535889
Loss at step 28: 4.296796

[]

In [5]:
# Logging into HuggingFace
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Creating a HuggingFace repository for my model
from huggingface_hub import create_repo

repo_id = "sagniksengupta/git-finetuned-facad-v2"
create_repo(repo_id)

RepoUrl('https://huggingface.co/sagniksengupta/git-finetuned-facad-v2', endpoint='https://huggingface.co', repo_type='model', repo_id='sagniksengupta/git-finetuned-facad-v2')

In [7]:
# Loading my locally saved model and processor
model = AutoModelForCausalLM.from_pretrained("path_to_save_finetuned_model")
processor = AutoProcessor.from_pretrained("path_to_save_finetuned_model")

# Pushing the model to the HuggingFace Hub
model.push_to_hub(repo_id)
processor.push_to_hub(repo_id)

model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sagniksengupta/git-finetuned-facad-v2/commit/ca5ae04bcd7aa76d75e59ffd33ff4b06035b7be1', commit_message='Upload processor', commit_description='', oid='ca5ae04bcd7aa76d75e59ffd33ff4b06035b7be1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sagniksengupta/git-finetuned-facad-v2', endpoint='https://huggingface.co', repo_type='model', repo_id='sagniksengupta/git-finetuned-facad-v2'), pr_revision=None, pr_num=None)