# Fine-Tuning PaliGemma with QVLA

#### Author: nisan yildiz

----

PaliGemma is a pre-trained vision-language model (VLM) designed to be an efficient base model for various fine-tuning applications in the vision-language (VL) domain. Here, we fine-tune the pre-trained PaliGemma model for an image annotation task using quantization and adapters. Adapters are small layers that are "plugged in" to the larger model and trained during fine-tuning while the rest of the architecture remains frozen. This allows efficient fine-tuning of base models without the need to train the entire network.

In [2]:
!git clone https://github.com/adapter-hub/adapters.git
%cd adapters
!pip install .
!pip install -U bitsandbytes
!pip install -U datasets

Cloning into 'adapters'...
remote: Enumerating objects: 126942, done.[K
remote: Counting objects: 100% (589/589), done.[K
remote: Compressing objects: 100% (431/431), done.[K
remote: Total 126942 (delta 389), reused 205 (delta 157), pack-reused 126353 (from 2)[K
Receiving objects: 100% (126942/126942), 99.40 MiB | 16.80 MiB/s, done.
Resolving deltas: 100% (96632/96632), done.
/content/adapters
Processing /content/adapters
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers~=4.50.3 (from adapters==1.2.0.dev0)
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.50.3-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m126.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: adapters
  Building wheel for adapters (pyproj

In [3]:
#Connect to drive
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/DI725/DI725-project


Mounted at /content/drive
/content/drive/MyDrive/DI725/DI725-project


In [4]:
import adapters
from adapters import AdapterModelInterface

In [5]:
import torch
from torch import nn

from transformers import BitsAndBytesConfig
from transformers import AutoProcessor, AutoModel, PaliGemmaForConditionalGeneration, AutoConfig

from huggingface_hub import notebook_login

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "google/paligemma-3b-pt-224" # pt for pre-trained, needs fine-tuning

## Fine-tuning without quantization

In [9]:
#We need to log-in before using the PaliGemma model, as it is subject to agreement

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
base_model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
config = AutoConfig.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [47]:
from transformers.modeling_outputs import BaseModelOutputWithPooling
from adapters.heads import PredictionHead
# Create a custom model class by inheriting from the original model class

class IdentityHead(PredictionHead):
    """Prediction head that returns its input unchanged.

    The head is created under the fixed name ``"identity_head"`` and wraps a
    single ``nn.Identity`` module, so it adds no parameters and performs no
    computation on the tensor it receives.

    NOTE(review): this looks like a placeholder so that the model exposes a
    ``heads`` entry for the adapters tooling -- confirm against what
    ``PredictionHead`` / ``AdapterTrainer`` actually require.
    """

    def __init__(self):
        """Create the head and register its single identity submodule."""
        super().__init__(name="identity_head")
        # Head configuration dictionary.
        # NOTE(review): the keys presumably mirror what the adapters library
        # expects from a head config -- verify against PredictionHead.
        self.config = {
            "layers": 1,
            "activation_function": None,
            "use_pooler": False,
            "dropout_prob": 0.0
        }
        self.identity = nn.Identity()
        # Register the same identity module a second time under the name "0"
        # (it is already registered as "identity" by the assignment above).
        self.add_module("0", self.identity)

    def build(self, model):
        """Skip layer construction; only mirror ``model``'s train/eval mode.

        Args:
            model: The model this head is attached to; only its ``training``
                flag is read.
        """
        # No layers are built here; the head only copies the training flag so
        # that it is in the same mode as the model.
        self.train(model.training)  # make sure training mode is consistent

    def forward(self, x, **kwargs):
        """Return ``x`` unchanged; any extra keyword arguments are ignored."""
        # Simple identity forward pass
        return self.identity(x)

    def get_label_names(self):
        """Return the label names used with this head: ``["labels"]``."""
        # Override to return the expected label names
        return ["labels"]

# Add our custom identity head
base_model.heads = nn.ModuleDict({"identity_head": IdentityHead()})



In [48]:
base_model

PaliGemmaForConditionalGeneration(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(256, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (self_attn): SiglipSdpaAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features

### Adding VL-Adapter

The PaliGemma model is not officially supported by the adapters library, so we need to create a model interface object to be able to use it with adapters.

In [53]:
# Interface description that tells the adapters library where the relevant
# submodules live inside the model. Every path below points into
# `language_model`, so only the language-model part is described here; the
# vision tower is not referenced by this interface.
bottleneck_interface_lm = AdapterModelInterface(
    adapter_methods=["bottleneck"], # the vanilla Adapter a.k.a bottleneck adapter
    # Dotted attribute paths, relative to the model root.
    model_embeddings="language_model.model.embed_tokens",
    model_layers="language_model.model.layers",
    # Names below are relative to a single layer / its attention module.
    layer_self_attn="self_attn",
    layer_cross_attn=None,  # no cross-attention module is mapped
    attn_k_proj="k_proj",
    attn_q_proj="q_proj",
    attn_v_proj="v_proj",
    attn_o_proj="o_proj",
    # Feed-forward projections of each layer.
    layer_intermediate_proj="mlp.up_proj",
    layer_output_proj="mlp.down_proj",
)

In [54]:
adapters.init(base_model, interface=bottleneck_interface_lm)
base_model.add_adapter("adapter_lm", config="double_seq_bn")
base_model.set_active_adapters("adapter_lm")
print(base_model.adapter_summary())

#moving to device
#base_model.to(device)
#base_model.adapter_to("adapter_lm", device=device)



### Quantization

We will use 4-bit quantization for our model, with the NF4 data type. Computations will be performed in 16-bit bfloat16, and double quantization is enabled as well.

In [None]:
from transformers import BitsAndBytesConfig

# Quantization settings: 4-bit NF4 weights, bfloat16 compute, double quantization.
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_compute_dtype=torch.bfloat16,
   bnb_4bit_use_double_quant=True)

# Load a second, quantized copy of the same checkpoint (`model_id`).
base_NF4_model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=nf4_config)

# Add the adapter, reusing the language-model interface defined earlier.
# NOTE: the adapter is only added here; this cell does not call
# set_active_adapters() on the quantized model.
adapters.init(base_NF4_model, interface=bottleneck_interface_lm)
base_NF4_model.add_adapter("adapter_lm", config="double_seq_bn")
print(base_NF4_model.adapter_summary())

# Cast some parameters to full precision.
for param in base_NF4_model.parameters():
    # Only 1-D parameters are touched; higher-rank tensors keep their dtype.
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

# Enable gradient checkpointing to reduce required memory
base_NF4_model.gradient_checkpointing_enable()
# NOTE(review): presumably needed so gradients can flow through the frozen
# input embeddings when checkpointing is on -- confirm against the
# Transformers documentation for enable_input_require_grads().
base_NF4_model.enable_input_require_grads()

class CastOutputToFloat(torch.nn.Sequential):
    """``nn.Sequential`` variant whose final output is converted to float32."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and return the result as float32."""
        result = super().forward(x)
        return result.to(torch.float32)
base_NF4_model.language_model.lm_head = CastOutputToFloat(base_NF4_model.language_model.lm_head)

#moving to device
base_NF4_model.to(device)
base_NF4_model.adapter_to("adapter_lm", device=device)

prompt = "caption en"

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
adapter_lm               bottleneck       18,952,704       1.098       0       1
--------------------------------------------------------------------------------
Full model                              1,725,847,280     100.000               1


In [None]:
#Verifying the datatypes.
# Count how many elements are stored in each parameter dtype.
dtypes = {}
for _, p in base_NF4_model.named_parameters():
    dtype = p.dtype
    dtypes[dtype] = dtypes.get(dtype, 0) + p.numel()

# Print, per dtype, the element count and its share of all elements.
total = sum(dtypes.values())
for k, v in dtypes.items():
    print(k, v, v / total)

torch.float16 527750656 0.3024705759052781
torch.float32 19430128 0.011136020276350484
torch.uint8 1197619200 0.6863934038183714


## Preparing and exploring the dataset

The RISCM dataset consists of captioned satellite imagery, with 5 captions provided per image. Our captions table includes all captions, as well as information about the training/test/validation splits and the original source of each image.

### Example of an image with a caption

In [13]:
from datasets import load_dataset

In [15]:
dataset = load_dataset("json", data_files={'train': 'RISCM/resized/train_data.jsonl', 'test':'RISCM/resized/test_data.jsonl', 'validation':"RISCM/resized/val_data.jsonl"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [19]:
image_dir = "RISCM/resized/"
from PIL import Image
import PIL
def collate_fn(examples):
    """Turn a list of dataset rows into one processor batch on ``device``.

    Args:
        examples: Iterable of rows, each providing the keys ``"prefix"``
            (prompt text), ``"suffix"`` (target caption) and ``"file_name"``
            (image file name relative to ``image_dir``).

    Returns:
        The processor output for the whole batch, with floating-point tensors
        cast to bfloat16 and everything moved to ``device``.
    """
    texts = [f"<image> <bos> {example['prefix']}" for example in examples]
    labels = [example['suffix'] for example in examples]

    images = []
    for example in examples:
        # Open each file in a context manager so its handle is closed
        # deterministically; convert() returns a fully loaded copy that stays
        # valid after the file is closed.
        with PIL.Image.open(image_dir + example["file_name"]) as img:
            images.append(img.convert("RGB"))

    tokens = processor(text=texts, images=images, suffix=labels,
                       return_tensors="pt", padding="longest")
    # The dtype cast applies to floating-point tensors only (see
    # BatchFeature.to); the result is then moved to the target device.
    # NOTE(review): moving tensors to the device inside the collate function
    # assumes the DataLoader runs without worker processes -- confirm.
    tokens = tokens.to(torch.bfloat16).to(device)
    return tokens

In [18]:
input_image = PIL.Image.open(image_dir + dataset["test"][0]["file_name"])

'NWPU_31430.jpg'

In [26]:
input_text = f"<image> <bos> {dataset['test'][0]['prefix']}"
input_image = PIL.Image.open(image_dir + dataset["test"][0]["file_name"])

In [36]:
torch.cuda.empty_cache()

In [56]:
inputs = processor(text=input_text, images=input_image,
                  padding="longest", do_convert_rgb=True, return_tensors="pt").to("cpu")
inputs = inputs.to(dtype=base_model.dtype)
base_model.to("cpu")

PaliGemmaForConditionalGeneration(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(256, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (self_attn): SiglipSdpaAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features

In [60]:
with torch.no_grad():
  output = base_model(**inputs, max_length=496)

#print(processor.decode(output[0], skip_special_tokens=False))

In [64]:
tokenizer = processor.tokenizer
def decode_full_sequence(model_output):
    """Decode the highest-scoring token at every position into a string.

    Args:
        model_output: Forward-pass output exposing a ``logits`` attribute.

    Returns:
        The decoded text of the first sequence in the batch, with special
        tokens skipped.
    """
    # Pick, per position, the index with the largest logit along the last axis.
    best_ids = model_output.logits.argmax(dim=-1)

    # Only the first element of the batch is turned into text.
    return tokenizer.decode(best_ids[0], skip_special_tokens=True)

# Example usage
full_text = decode_full_sequence(output)

In [65]:
full_text

" imagery the niger above  runaway trials london'   strip view the zoom city mud airport and plane vegetation  soil on asway track   ' a imagery imagery thailand airports area vegetation russian cargo of russian along car on on on countrya the and cargo and aerialman cargo by us strip car crane on pre aerial city capital y funny- perspective london the humanitarian blocking parked a arm us aircraft on view imagery central this the coordinate satellite  crashes  runway of on heavy on on view country china images island  on on on strandeds on taking  bo on st runway   to march spy soil sleeping view  view blocking a car median drone  russian orth cargo runway us the the low on  runway seen onside view  blue medical the on tyres southeast spot chin   bir  a over imagery  airport russian as lot runway shaped earth russian drone west for view bir imagery person c dog as runway as transport  the land the grass turning way runway  airport this green a aviation business the aircraft  turn unab

In [50]:
processor.decode(output[0],skip_special_tokens=True)

'  caption en\nthe plane was seen on the runway'

### Fine-tuning

In [18]:
from transformers import TrainingArguments
# Training configuration shared by the Trainer / AdapterTrainer cells below.
args=TrainingArguments(
            num_train_epochs=2,
            # Keep every dataset column: collate_fn reads the "prefix",
            # "suffix" and "file_name" fields of each row.
            remove_unused_columns=False,
            # 1 sample per step x 4 accumulation steps = 4 samples per
            # optimizer update on each device.
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            learning_rate=5e-4,
            weight_decay=1e-6,
            adam_beta2=0.999,
            logging_steps=100,
            optim="paged_adamw_8bit", # paged 8-bit AdamW for the quantized model; adamw_hf is an alternative
            # Checkpoint every 1000 steps.
            save_strategy="steps",
            save_steps=1000,
            # NOTE(review): presumably keeps only the latest checkpoint on
            # disk -- confirm against the TrainingArguments documentation.
            save_total_limit=1,
            output_dir="paligemma_qvla",
            bf16=True,
            report_to=["tensorboard"],
            # NOTE(review): pinning is presumably disabled because collate_fn
            # already returns tensors on the target device -- confirm.
            dataloader_pin_memory=False
        )


In [19]:
from datasets import Dataset, Image
from adapters import AdapterTrainer

In [25]:
base_model.active_head = "identity_head"
base_model._active_heads = [base_model.active_head]

In [None]:
base_model

In [None]:
base_NF4_model.active_adapter = "adapter_lm"
base_NF4_model.active_head = "language_model.lm_head"
base_NF4_model._active_heads = [base_NF4_model.active_head]
base_NF4_model.train_adapter("adapter_lm")

In [27]:
base_model.train_adapter("adapter_lm")
trainer = AdapterTrainer(
    model=base_model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=collate_fn,
    args=args
)

trained = trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [None]:
trainer = AdapterTrainer(
      model=base_model,
      train_dataset=dataset["train"],
      eval_dataset=dataset["validation"],
      data_collator=collate_fn,
      args=args
)
trainer.train()

AttributeError: 'PaliGemmaForConditionalGeneration' object has no attribute 'heads'

In [None]:
from transformers import Trainer

base_NF4_model.train_adapter("adapter_lm")
trainer = Trainer(
    model=base_NF4_model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=collate_fn,
    args=args
)



trainer.train()

ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details

In [None]:
from adapters import AutoAdapterModel
model = AutoAdapterModel.from_pretrained("google-bert/bert-base-cased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertAdapterModel were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.heads

ModuleDict(
  (default): BertStyleMaskedLMHead(
    (0): Linear(in_features=768, out_features=768, bias=True)
    (1): Activation_Function_Class(
      (f): GELUActivation()
    )
    (2): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (3): Linear(in_features=768, out_features=28996, bias=True)
  )
)

ModuleNotFoundError: No module named 'pytorch'

In [None]:
torch.ModuleDict(base_model_lm_head)

TypeError: __init__(): incompatible constructor arguments. The following argument types are supported:
    1. torch._C.ModuleDict(arg0: torch._C.ScriptModule)

Invoked with: Linear(in_features=2048, out_features=257216, bias=False)

In [None]:
from transformers import RobertaConfig
from adapters import AutoAdapterModel

config = RobertaConfig.from_pretrained(
    "roberta-base",
    num_labels=2,
)
test_model = AutoAdapterModel.from_pretrained(
    "roberta-base",
    config=config,
)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
test_model.active_head

'default'

In [None]:
from transformers import EncoderDecoderModel
test_model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e