In [1]:
from datasets import load_dataset

dataset = load_dataset("arampacha/rsicd") # HF dataset for rsicd

dataset_infos.json: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/55.1M [00:00<?, ?B/s]

data/valid-00000-of-00001.parquet:   0%|          | 0.00/51.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8734 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1093 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/1094 [00:00<?, ? examples/s]

In [2]:
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from accelerate import Accelerator, notebook_launcher
from tqdm import tqdm

In [3]:
class CustomDataset(Dataset):
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        # remove batch dimension
        encodings = []
        for caption in item["captions"]:
            encoding = self.processor(images=item["image"], text=caption, padding="max_length", return_tensors="pt")
            encoding = {k:v.squeeze() for k,v in encoding.items()}
            encodings.append(encoding)
        return encodings

In [4]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from torch.optim import AdamW 
from transformers.optimization import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import ReduceLROnPlateau

2025-11-16 16:34:44.063744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763310884.481403      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763310884.596539      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [5]:
# Initialize the tokenizer, processor, and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
train_dataset = CustomDataset(dataset["train"], processor)
val_dataset = CustomDataset(dataset["valid"], processor)

In [7]:
def training_loop(mixed_precision="fp16", num_epochs=3, learning_rate=5e-5):
    # Initialize accelerator
    accelerator = Accelerator(mixed_precision=mixed_precision)
    
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    
    # Use DataLoader for efficient batching
    train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=5, shuffle=False)
    
    # Set up the optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)
    
    model, optimizer, train_loader, val_loader = accelerator.prepare(model, optimizer, train_loader, val_loader)
    
    model.train()
    for epoch in range(num_epochs):
        epoch_losses = []  # To store losses for each batch in the epoch

        for idx, encodings in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1}", unit="batch")):
            for encoding in encodings:
                input_ids = encoding.pop("input_ids")
                pixel_values = encoding.pop("pixel_values")

                outputs = model(input_ids=input_ids,
                                pixel_values=pixel_values,
                                labels=input_ids)

                loss = outputs.loss
                epoch_losses.append(loss.item())  # Store the loss for this batch

                accelerator.backward(loss)

                optimizer.step()
                optimizer.zero_grad()

        # Calculate and print the average loss for the epoch
        average_loss = sum(epoch_losses) / len(epoch_losses)
        accelerator.print(f"Average Training Loss for Epoch {epoch + 1}: {average_loss}")

        # Validation phase
        model.eval()
        val_losses = []

        with torch.no_grad():
            for val_encodings in tqdm(val_loader, desc="Validation", unit="batch"):
                for val_encoding in val_encodings:
                    val_input_ids = val_encoding.pop("input_ids")
                    val_pixel_values = val_encoding.pop("pixel_values")

                    val_outputs = model(input_ids=val_input_ids,
                                        pixel_values=val_pixel_values,
                                        labels=val_input_ids)

                    val_loss = val_outputs.loss
                    val_losses.append(val_loss.item())

        average_val_loss = sum(val_losses) / len(val_losses)
        accelerator.print(f"Average Validation Loss for Epoch {epoch + 1}: {average_val_loss}")

        # Update learning rate based on validation loss
        scheduler.step(average_val_loss)

        model.train()
        
        save_path = f"kaggle/working/new_model_epoch_{epoch + 1}"
        
    # Save the fine-tuned model
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            save_path,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save,
        )

In [8]:
# import torch

# # This is the critical line to resolve the CUDA re-initialization error
# try:
#     torch.multiprocessing.set_start_method('spawn', force=True)
# except RuntimeError as e:
#     print(f"Start method already set or error: {e}")

In [None]:
args = ("fp16", 5, 5e-7)
notebook_launcher(training_loop, args, num_processes=1)

Launching training on CPU.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]


Epoch 1:   0%|          | 0/1747 [00:00<?, ?batch/s][AWe strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
