In [3]:
!nvidia-smi

Thu Jul 11 15:11:04 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
!pip install -q transformers flash_attn timm einops peft tensorboard
!pip install -q roboflow git+https://github.com/roboflow/supervision.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [5]:
# @title Imports

import io
import os
import re
import json
import torch
import html
import base64
import itertools

import numpy as np
import supervision as sv
import pandas as pd

from google.colab import userdata
from IPython.core.display import display, HTML
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    get_scheduler
)
from tqdm import tqdm
from typing import List, Dict, Any, Tuple, Generator
from peft import LoraConfig, get_peft_model
from PIL import Image
from roboflow import Roboflow

In [None]:
!pwd

/content


In [7]:
from sklearn.model_selection import train_test_split

# Split configuration (kept in one place so the ratios and seed are easy to tune).
ANNOTATIONS_CSV = 'annotations.csv'
SPLIT_SEED = 42
HOLDOUT_FRACTION = 0.3        # share of rows held out of training (validation + test)
TEST_SHARE_OF_HOLDOUT = 0.5   # share of the held-out rows that becomes the test set

# Load the dataset
df = pd.read_csv(ANNOTATIONS_CSV)

# Fail early with a readable message if the columns used below are absent.
missing_columns = {'image_id', 'description'} - set(df.columns)
if missing_columns:
    raise ValueError(f"{ANNOTATIONS_CSV} is missing required columns: {sorted(missing_columns)}")

# Split into train and temp (validation + test)
train_df, temp_df = train_test_split(df, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED)

# Split temp into validation and test
val_df, test_df = train_test_split(temp_df, test_size=TEST_SHARE_OF_HOLDOUT, random_state=SPLIT_SEED)

# Every annotated row must land in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

# Function to load images
def load_image(image_id):
    """Read the image stored at path `image_id` and return it as an RGB PIL image.

    `Image.open` is lazy: it keeps the underlying file open until the pixel data
    is actually read. Loading a whole split that way holds one open file handle
    per image. Converting inside the `with` block forces the pixel data to be
    read, and the file is closed as soon as the block exits.

    Args:
        image_id: Path (or path-like value) of the image file.

    Returns:
        A fully loaded `PIL.Image.Image` in RGB mode.
    """
    with Image.open(f'{image_id}') as img:
        # convert() returns a new, loaded image, so it stays valid after the file closes.
        return img.convert("RGB")

# Materialise images and captions for each split (same row order as the split frames).

# -- training split
train_images = list(map(load_image, train_df['image_id']))
train_captions = train_df['description'].tolist()

# -- validation split
val_images = list(map(load_image, val_df['image_id']))
val_captions = val_df['description'].tolist()

# -- test split
test_images = list(map(load_image, test_df['image_id']))
test_captions = test_df['description'].tolist()

In [9]:
# @title Loading Pre-trained Model

# Hub checkpoint and the exact revision to pull for both model and processor.
CHECKPOINT = "microsoft/Florence-2-base-ft"
REVISION = 'refs/pr/6'

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# The checkpoint ships custom modelling code, hence trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT,
    revision=REVISION,
    trust_remote_code=True,
)
model = model.to(DEVICE)

processor = AutoProcessor.from_pretrained(
    CHECKPOINT,
    revision=REVISION,
    trust_remote_code=True,
)

In [None]:
# Reference list of other task-prompt tokens (region/detection-style tasks); none of them is used in the cells shown here.
# ['<OD>', '<CAPTION_TO_PHRASE_GROUNDING>', '<DENSE_REGION_CAPTION>', '<REGION_PROPOSAL>', '<OCR_WITH_REGION>', '<REFERRING_EXPRESSION_SEGMENTATION>', '<REGION_TO_SEGMENTATION>', '<OPEN_VOCABULARY_DETECTION>', '<REGION_TO_CATEGORY>', '<REGION_TO_DESCRIPTION>']


In [10]:
from PIL import Image

# Load a sample image
sample_image = load_image(df['image_id'].iloc[0])

# Make sure dropout is disabled, even if this cell is re-run after a training cell.
model.eval()

# Perform inference using the pre-trained model
inputs = processor(text="<DETAILED_CAPTION>", images=sample_image, return_tensors="pt").to(DEVICE)
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    num_beams=3
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Display the result
print("Sample Inference Before Fine-Tuning:")
print(generated_text)

# Render the sample image inline. `Image.show()` tries to launch an external
# viewer, which shows nothing in a hosted notebook; `display` uses the rich repr.
display(sample_image)


Sample Inference Before Fine-Tuning:
This is a collage image. In this image we can see a person walking on the footpath, vehicles on the road, buildings, trees, poles and sky.


In [11]:
# @title Loading Dataset

class CaptionDataset(Dataset):
    """Image-captioning dataset yielding encoder inputs and decoder targets.

    Each item holds the tokenised *task prompt* (the same prompt used at
    inference time) as `input_ids`, the preprocessed image as `pixel_values`,
    and the tokenised ground-truth caption as `labels`. Feeding the caption as
    the encoder text would show the model the answer and leave it with no
    training target.

    Args:
        images: Sequence of images accepted by `processor`.
        captions: Sequence of caption strings, aligned with `images`.
        processor: Callable processor exposing a `.tokenizer` attribute.
        task_prompt: Text prompt given to the model for every sample.

    Raises:
        ValueError: If `images` and `captions` differ in length.
    """

    def __init__(self, images, captions, processor, task_prompt="<DETAILED_CAPTION>"):
        if len(images) != len(captions):
            raise ValueError(
                f"images and captions must have the same length, got {len(images)} and {len(captions)}"
            )
        self.images = images
        self.captions = captions
        self.processor = processor
        self.task_prompt = task_prompt

    def __len__(self):
        """Number of (image, caption) pairs."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the un-batched tensors for sample `idx`."""
        image = self.images[idx]
        caption = self.captions[idx]

        # Encoder side: task prompt + image.
        inputs = self.processor(text=self.task_prompt, images=image, return_tensors="pt")

        # Decoder side: the caption becomes the training target.
        labels = self.processor.tokenizer(
            text=caption,
            return_tensors="pt",
            truncation=True,
            return_token_type_ids=False,
        )["input_ids"]

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse any other axis that happens to have size 1.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'pixel_values': inputs['pixel_values'].squeeze(0),
            'labels': labels.squeeze(0),
        }


def collate_fn(batch, pad_token_id=None):
    """Pad variable-length token sequences and stack images into one batch.

    Args:
        batch: List of per-sample dicts with `input_ids` and `pixel_values`,
            and optionally `labels`.
        pad_token_id: Padding id for `input_ids`. Defaults to the pad token of
            the module-level `processor`'s tokenizer.

    Returns:
        Dict with `input_ids`, `attention_mask`, `pixel_values`, plus `labels`
        when every sample provides them. Label padding uses -100 so those
        positions can be ignored by a loss that skips that index.
    """
    if pad_token_id is None:
        pad_token_id = processor.tokenizer.pad_token_id

    pad_sequence = torch.nn.utils.rnn.pad_sequence

    # Pad input_ids
    input_ids_padded = pad_sequence(
        [item['input_ids'] for item in batch], batch_first=True, padding_value=pad_token_id
    )
    pixel_values = torch.stack([item['pixel_values'] for item in batch])

    # Create attention masks
    attention_masks = (input_ids_padded != pad_token_id).long()

    collated = {
        'input_ids': input_ids_padded,
        'attention_mask': attention_masks,
        'pixel_values': pixel_values
    }

    if all('labels' in item for item in batch):
        collated['labels'] = pad_sequence(
            [item['labels'] for item in batch], batch_first=True, padding_value=-100
        )

    return collated

# Wrap each split's images and captions in a CaptionDataset.
train_dataset, val_dataset, test_dataset = (
    CaptionDataset(split_images, split_captions, processor)
    for split_images, split_captions in (
        (train_images, train_captions),
        (val_images, val_captions),
        (test_images, test_captions),
    )
)

# Batch the splits: only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset=eval_dataset, batch_size=2, collate_fn=collate_fn, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)


In [12]:
# @title Finetune Configuration

from peft import LoraConfig, get_peft_model

# Module-name patterns that receive a LoRA adapter.
TARGET_MODULES = [
    "q_proj",
    "o_proj",
    "k_proj",
    "v_proj",
    "linear",
    "Conv2d",
    "lm_head",
    "fc2",
]

# Adapter hyper-parameters: rank 8 with alpha 8, rank-stabilised scaling,
# Gaussian initialisation, light dropout, and no bias training.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    use_rslora=True,
    init_lora_weights="gaussian",
    inference_mode=False,
    revision=REVISION,
)

# Wrap the loaded base model with the adapters described above.
peft_model = get_peft_model(model, config)


In [15]:
!pip install tensorboard



In [17]:
# @title Train loop
from torch.optim import AdamW  # Update import statement

EPOCHS = 3
LEARNING_RATE = 5e-5

# Optimise only parameters that require gradients; frozen weights need no optimizer state.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=LEARNING_RATE)  # Use torch.optim.AdamW
num_training_steps = EPOCHS * len(train_loader)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()
for epoch in range(EPOCHS):
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        pixel_values = batch["pixel_values"].to(DEVICE)

        # The model only returns a loss when it is given `labels`. Passing the
        # tokens as `decoder_input_ids` alone leaves `outputs.loss` as None, so
        # no optimisation step would ever run.
        labels = batch.get("labels")
        if labels is None:
            # Fall back to the padded token ids already in the batch as targets,
            # masking padding with -100 so it does not contribute to the loss.
            # NOTE(review): in this fallback the same tokens also feed the
            # encoder; a dataset that supplies the task prompt as `input_ids`
            # and the caption as `labels` avoids that overlap.
            labels = input_ids.masked_fill(attention_mask == 0, -100)
        else:
            labels = labels.to(DEVICE)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            labels=labels,
        )

        # A missing loss means nothing can be learned; stop loudly instead of
        # silently skipping every step.
        loss = outputs.loss
        if loss is None:
            raise RuntimeError("Model returned no loss even though labels were provided.")

        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Average Training Loss: {avg_train_loss}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 36.00 MiB. GPU 

In [None]:
!pip install rouge_score
!pip install nltk

In [None]:
# @title Model Eval

import nltk
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

# METEOR relies on WordNet data, which is not bundled with the nltk package.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)


def _mean(values):
    """Arithmetic mean of `values`; 0.0 for an empty list (e.g. empty validation set)."""
    return sum(values) / len(values) if values else 0.0


# Generation must run in eval mode: a preceding training cell leaves dropout enabled.
model.eval()

# Generate captions for the validation dataset
generated_captions = []

for image in tqdm(val_images, desc="Generating captions"):
    inputs = processor(text="<DETAILED_CAPTION>", images=image, return_tensors="pt").to(DEVICE)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    generated_captions.append(generated_text)

# Initialize scorers
rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
# Without smoothing, sentence-level BLEU collapses to ~0 whenever a short
# caption has no matching higher-order n-gram.
bleu_smoothing = SmoothingFunction().method1

# Initialize lists to store scores
bleu_scores = []
rouge1_scores = []
rougeL_scores = []
meteor_scores = []

# Calculate scores for each pair of generated and ground truth captions
for generated, ground_truth in zip(generated_captions, val_captions):
    # Whitespace tokenisation, shared by BLEU and METEOR.
    reference_tokens = ground_truth.split()
    candidate_tokens = generated.split()

    # BLEU score
    bleu_scores.append(
        sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=bleu_smoothing)
    )

    # ROUGE scores
    scores = rouge.score(ground_truth, generated)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

    # METEOR score: recent NLTK versions require pre-tokenised input and raise
    # a TypeError when handed raw strings.
    meteor_scores.append(meteor_score([reference_tokens], candidate_tokens))

# Calculate average scores
average_bleu = _mean(bleu_scores)
average_rouge1 = _mean(rouge1_scores)
average_rougeL = _mean(rougeL_scores)
average_meteor = _mean(meteor_scores)

print(f"Average BLEU Score: {average_bleu:.4f}")
print(f"Average ROUGE-1 Score: {average_rouge1:.4f}")
print(f"Average ROUGE-L Score: {average_rougeL:.4f}")
print(f"Average METEOR Score: {average_meteor:.4f}")


In [None]:
!huggingface-cli login


In [None]:
from transformers import AutoModelForCausalLM, AutoProcessor
from huggingface_hub import HfApi

# Save the model locally
path_to_save_directory = "./fine_tuned_model"

# `model` was wrapped by get_peft_model, so save through the PEFT wrapper: it
# writes the adapter weights and adapter config.
# NOTE(review): the result is an adapter, to be loaded on top of the base
# checkpoint; merge the adapter first if a standalone checkpoint is wanted.
peft_model.save_pretrained(path_to_save_directory)
processor.save_pretrained(path_to_save_directory)

# Upload the model to Hugging Face
model_name = "florence2-gdr"  # Replace with your desired model name

# Initialize the HfApi (uses the token stored by the login cell)
api = HfApi()

# Resolve the account name from the stored token instead of relying on an
# undefined placeholder variable.
your_username = api.whoami()["name"]
repo_id = f"{your_username}/{model_name}"

# Create a repository (exist_ok makes re-running this cell safe)
api.create_repo(repo_id=repo_id, private=False, exist_ok=True)

# Upload the model: push the saved folder in a single commit. This avoids
# cloning the repository into a directory that already contains files.
api.upload_folder(
    folder_path=path_to_save_directory,
    repo_id=repo_id,
    commit_message="Initial commit",
)
