# Finetuning ViLT

### Setup

In [1]:
!pip install transformers datasets accelerate peft bitsandbytes torch torchvision pillow scikit-learn tqdm bert-score timeout-decorator




In [18]:
!pip install csv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: Could not find a version that satisfies the requirement csv (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for csv[0m[31m
[0m

In [19]:
import os, glob, time
import pandas as pd
import torch
from PIL import Image
from tqdm import tqdm

from transformers import (
    ViltProcessor,
    ViltForQuestionAnswering,
    TrainingArguments,
    Trainer
)
from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score
from bert_score import score as bertscore_score
from timeout_decorator import timeout, TimeoutError
import csv

# Paths for checkpointing predictions & metrics
pred_path        = "/kaggle/working/vilt_vqa_predictions_baseline.csv"
metrics_path     = "/kaggle/working/vilt_vqa_metrics_baseline.csv"
finetune_outdir  = "/kaggle/working/vilt_lora_finetuned"

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [20]:
os.environ["WANDB_API_KEY"] = "ce38015d72ee80819dabf703a40fbf6e26023d69"

### Load & Concatenate Headerless CSVs (S1–S6)

In [21]:
base_path = "/kaggle/input/vr-dataset/dataset_curated"
subfolders = [f"S{i}" for i in range(1, 7)]
all_dfs = []

for folder in subfolders:
    csv_path = os.path.join(base_path, folder, f"{folder}_qa_data.csv")
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path, header=None, names=["image_path", "question", "answer"])
        df["image_path"] = df["image_path"].apply(
            lambda p: os.path.join(base_path, os.path.normpath(p).split("dataset_curated/")[-1])
        )
        df = df[df["image_path"].apply(os.path.exists)].reset_index(drop=True)
        all_dfs.append(df)
        print(f"Loaded {len(df)} examples from {csv_path}")
    else:
        print(f"CSV not found: {csv_path}")

df_all = pd.concat(all_dfs).reset_index(drop=True)
print(f"\nTotal QA pairs loaded: {len(df_all)}")

Loaded 14366 examples from /kaggle/input/vr-dataset/dataset_curated/S1/S1_qa_data.csv
Loaded 14358 examples from /kaggle/input/vr-dataset/dataset_curated/S2/S2_qa_data.csv
Loaded 14367 examples from /kaggle/input/vr-dataset/dataset_curated/S3/S3_qa_data.csv
Loaded 14366 examples from /kaggle/input/vr-dataset/dataset_curated/S4/S4_qa_data.csv
Loaded 14387 examples from /kaggle/input/vr-dataset/dataset_curated/S5/S5_qa_data.csv
Loaded 14376 examples from /kaggle/input/vr-dataset/dataset_curated/S6/S6_qa_data.csv

Total QA pairs loaded: 86220


### Split into Baseline(first 30%) & Finetune(last 70%)

In [22]:
# Compute split indices
val_split_idx = int(0.2 * len(df_all))
train_split_idx = int(0.8 * len(df_all))  # 60% training, remaining 20% for testing

# Apply the split
val_df    = df_all.iloc[:val_split_idx].reset_index(drop=True)        # First 20% → Validation
train_df  = df_all.iloc[val_split_idx:train_split_idx].reset_index(drop=True)  # Next 60% → Training
test_df   = df_all.iloc[train_split_idx:].reset_index(drop=True)      # Last 20% → Test

# Display the new splits
print(f"Baseline eval samples (20%): {len(val_df)}")
print(f"LoRA train samples (60%):    {len(train_df)}")
print(f"Test samples (20%):          {len(test_df)}")


Baseline eval samples (20%): 17244
LoRA train samples (60%):    51732
Test samples (20%):          17244


### Initialize Processor & Model, wrap with LoRA on the two Linear layers

In [23]:
pip install --upgrade peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [24]:
from peft import TaskType
print([t for t in TaskType])

[<TaskType.SEQ_CLS: 'SEQ_CLS'>, <TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>, <TaskType.CAUSAL_LM: 'CAUSAL_LM'>, <TaskType.TOKEN_CLS: 'TOKEN_CLS'>, <TaskType.QUESTION_ANS: 'QUESTION_ANS'>, <TaskType.FEATURE_EXTRACTION: 'FEATURE_EXTRACTION'>]


In [25]:
# Init Processor & Model, Wrap with LoRA, then Monkey‑Patch forward
processor  = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
base_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
base_model.to(device)
base_model.eval()

lora_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["classifier.0", "classifier.3"]
)

model = get_peft_model(base_model, lora_config)
model.to(device)

# Monkey‑patch to drop SQuAD args from every forward call
def _patched_forward(self, *args, **kwargs):
    kwargs.pop("start_positions", None)
    kwargs.pop("end_positions",   None)
    return super(self.__class__, self).forward(*args, **kwargs)

model.forward = _patched_forward.__get__(model, model.__class__)

print("LoRA trainable params:",
      sum(p.numel() for p in model.parameters() if p.requires_grad),
      "/", sum(p.numel() for p in model.parameters()))


LoRA trainable params: 55752 / 117644289


### Training Arguments & Trainer (with step‐wise checkpoints)

In [26]:
# Build label2id mapping from the model config
label2id = {lbl.lower(): idx for idx, lbl in base_model.config.id2label.items()}
num_labels = len(label2id)
print(f"{num_labels} labels. Example: {list(label2id.items())[:5]}")

3129 labels. Example: [('net', 0), ('pitcher', 1), ('orange', 2), ('yes', 3), ('white', 4)]


### Prepare HuggingFace Datasets & Collator/Metric Functions

In [27]:
# Prepare HF Datasets & One‑Hot DataCollator
from datasets import Dataset, DatasetDict

train_hf = Dataset.from_pandas(train_df)
val_hf   = Dataset.from_pandas(val_df)
ds       = DatasetDict({"train": train_hf, "validation": val_hf})

num_labels = len(base_model.config.id2label)

class DataCollatorForVilt:
    def __init__(self, processor, label2id, num_labels):
        self.processor = processor
        self.label2id  = label2id
        self.num_labels = num_labels

    def __call__(self, features):
        images    = [Image.open(f["image_path"]).convert("RGB") for f in features]
        questions = [f["question"] for f in features]
        answers   = [str(f["answer"]).strip().lower() for f in features]

        # 1) Tokenize → CPU tensors
        batch = self.processor(
            images,
            questions,
            return_tensors="pt",
            padding=True,
            truncation=True
        )

        # 2) One‑hot encode labels
        labels = torch.zeros((len(answers), self.num_labels), dtype=torch.float)
        for i, ans in enumerate(answers):
            idx = self.label2id.get(ans, -100)
            if idx >= 0:
                labels[i, idx] = 1.0

        batch["labels"] = labels
        return batch

collator = DataCollatorForVilt(processor, label2id, num_labels)

### Manual evaluation on validation split + save metrics CSV

In [28]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Convert multi-hot vectors to class indices
    if len(labels.shape) > 1 and labels.shape[1] > 1:
        labels = labels.argmax(axis=1)   # Converts from one-hot to class IDs

    y_true = labels.tolist()
    y_pred = preds.argmax(axis=1).tolist()
    
    # Calculate accuracy and F1
    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average="macro", zero_division=0)

    # BERTScore
    _, _, F1b = bertscore_score(
        [base_model.config.id2label[i] for i in y_pred],
        [base_model.config.id2label[i] for i in y_true],
        lang="en"
    )
    bert = F1b.mean().item()

    return {
        "eval_accuracy": acc,
        "eval_f1_macro": f1m,
        "eval_bert_score": bert
    }


In [29]:
# Custom Callback to force evaluation every 1000 steps
# Enhanced Callback for Evaluation & Checkpoint Saving
from transformers import TrainerCallback

class EvaluationCallback(TrainerCallback):
    def __init__(self, metrics_csv, pred_csv, processor, model):
        self.metrics_csv = metrics_csv
        self.pred_csv    = pred_csv
        self.processor   = processor
        self.model       = model
        
        # write headers if not exist
        if not os.path.exists(self.metrics_csv):
            with open(self.metrics_csv, "w", newline="") as f:
                csv.writer(f).writerow(
                    ["step","eval_loss","eval_accuracy","eval_f1_macro","eval_bert_score"]
                )
        if not os.path.exists(self.pred_csv):
            with open(self.pred_csv, "w", newline="") as f:
                csv.writer(f).writerow(
                    ["image_path","question","true_answer","predicted_answer"]
                )

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 1000 == 0 and state.global_step > 0:
            print(f"\n[Checkpoint] Running evaluation at step {state.global_step}...")
            control.should_evaluate = True

            # === Save Metrics ===
            metrics = kwargs.get("metrics", {}) or {}
            row = [
                state.global_step,
                metrics.get("eval_loss"),
                metrics.get("eval_accuracy"),
                metrics.get("eval_f1_macro"),
                metrics.get("eval_bert_score")
            ]
            with open(self.metrics_csv, "a", newline="") as f:
                csv.writer(f).writerow(row)

            # === Save Predictions ===
            predictions = []
            for example in val_df.itertuples(index=False):
                image = Image.open(example.image_path).convert("RGB")
                inputs = self.processor(
                    image,
                    example.question,
                    return_tensors="pt",
                    padding=True,
                    truncation=True
                ).to(device)

                with torch.no_grad():
                    logits = self.model(**inputs).logits
                    pred_id = logits.argmax(-1).item()
                    pred_label = base_model.config.id2label[pred_id].split()[0].lower()

                true = str(example.answer).strip().lower()
                predictions.append([
                    example.image_path,
                    example.question,
                    true,
                    pred_label
                ])

            with open(self.pred_csv, "a", newline="") as f:
                csv.writer(f).writerows(predictions)

            print(f"[Checkpoint] Completed save for step {state.global_step}")

In [30]:
# Define TrainingArguments (eval/log/save every 1000 steps)
training_args = TrainingArguments(
    output_dir=finetune_outdir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    remove_unused_columns=False,
    label_names=["labels"],
    run_name="vilt_lora_finetune_experiment"
)

#print("Label Names:", trainer.label_names)   # should output: ['labels']

In [31]:
# Initialize Trainer with Enhanced Callback
eval_callback = EvaluationCallback(
    metrics_csv=metrics_path,
    pred_csv=pred_path,
    processor=processor,
    model=model
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[eval_callback]
)

In [32]:
# Start Fine-Tuning
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,F1 Macro,Bert Score
1000,2.659,2.40653,0.349165,0.105772,0.976107
2000,2.3774,2.258263,0.359487,0.109688,0.976732
3000,2.257,2.192552,0.363779,0.115421,0.977216
4000,2.1703,2.156758,0.366272,0.118086,0.977448
5000,2.1501,2.12871,0.369694,0.120422,0.977403
6000,2.0985,2.106807,0.371723,0.124004,0.977421
7000,2.063,2.08978,0.373347,0.126154,0.977208
8000,2.0394,2.068769,0.377175,0.128645,0.977526
9000,2.0346,2.049999,0.38106,0.132565,0.977688
10000,2.0254,2.031389,0.384134,0.130654,0.977693



[Checkpoint] Running evaluation at step 1000...
[Checkpoint] Completed save for step 1000


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 2000...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 3000...
[Checkpoint] Completed save for step 3000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 4000...
[Checkpoint] Completed save for step 4000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 5000...
[Checkpoint] Completed save for step 5000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 6000...
[Checkpoint] Completed save for step 6000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 7000...
[Checkpoint] Completed save for step 7000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 8000...
[Checkpoint] Completed save for step 8000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 9000...
[Checkpoint] Completed save for step 9000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 10000...
[Checkpoint] Completed save for step 10000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 11000...
[Checkpoint] Completed save for step 11000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 12000...
[Checkpoint] Completed save for step 12000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 13000...
[Checkpoint] Completed save for step 13000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 14000...
[Checkpoint] Completed save for step 14000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 15000...
[Checkpoint] Completed save for step 15000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 16000...
[Checkpoint] Completed save for step 16000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 17000...
[Checkpoint] Completed save for step 17000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 18000...
[Checkpoint] Completed save for step 18000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[Checkpoint] Running evaluation at step 19000...
[Checkpoint] Completed save for step 19000


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TrainOutput(global_step=19398, training_loss=2.0845783020076167, metrics={'train_runtime': 18350.2523, 'train_samples_per_second': 8.457, 'train_steps_per_second': 1.057, 'total_flos': 1002824888114736.0, 'train_loss': 2.0845783020076167, 'epoch': 2.9996133920977344})

### Save LoRA adapters & processor

In [37]:
test_hf  = Dataset.from_pandas(test_df)
ds = DatasetDict({
    "train": train_hf,
    "validation": val_hf,
    "test": test_hf
})

In [38]:
# 1. Save the fine-tuned model and processor
model.save_pretrained(finetune_outdir)
processor.save_pretrained(finetune_outdir)
print("Fine-tuned adapters and processor saved.")

# 2. Evaluate on the Test Set explicitly
print("\nRunning Final Test Evaluation...")
test_metrics = trainer.evaluate(eval_dataset=ds["test"])

# 3. Save the test metrics (now separate from validation metrics)
test_metrics_path = os.path.join(finetune_outdir, "vilt_vqa_metrics_test.csv")
pd.DataFrame([test_metrics]).to_csv(test_metrics_path, index=False)
print(f"Test evaluation completed and saved to {test_metrics_path}")

Fine-tuned adapters and processor saved.

Running Final Test Evaluation...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test evaluation completed and saved to /kaggle/working/vilt_lora_finetuned/vilt_vqa_metrics_test.csv
