### Installation

In [1]:
%%capture
# Dependency installation cell; %%capture keeps the pip output out of the notebook.
import os

# The environment is treated as Colab when "COLAB_" appears in the joined
# environment-variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    # Outside Colab: install pinned unsloth / unsloth_zoo releases and let pip
    # resolve their dependencies.
    !pip install unsloth==2025.5.11
    !pip install unsloth_zoo==2025.5.1

else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Packages on the --no-deps lines are installed without dependency
    # resolution; the remaining lines install normally.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install transformers==4.51.3
    !pip install --no-deps unsloth



### Unsloth

In [1]:
# Load the base vision-language model and its tokenizer/processor through Unsloth.
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE: this list is for reference only; it is not passed to from_pretrained below.
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",

    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model

    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",

    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# The model actually used in this notebook is Qwen2.5-VL-7B-Instruct, loaded
# without 4bit quantization (load_in_4bit = False).
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2.5-VL-7B-Instruct",
    load_in_4bit = False,
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


  if () is not None or () != ():


==((====))==  Unsloth 2025.5.1: Fast Qwen2_5_Vl patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A40. Num GPUs = 1. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


We now add LoRA adapters for parameter-efficient finetuning - this lets us efficiently train only about 1% of all parameters.

**[NEW]** We also support finetuning ONLY the vision part of the model, or ONLY the language part. Or you can select both! You can also select to finetune the attention or the MLP layers!

In [2]:
# Wrap the base model with LoRA adapters. With the flags below, adapters are
# requested for the language layers (attention and MLP modules) and not for
# the vision layers.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

Unsloth: Making `model.base_model.model.model` require gradients


To format the dataset, all vision finetuning tasks should be formatted as follows:

```python
[
{ "role": "user",
  "content": [{"type": "text",  "text": instruction}, 
              {"type": "image", "image": image} ]
},
{ "role": "assistant",
  "content": [{"type": "text",  "text": answer} ]
},
]
```


In [None]:
import json
import random
import argparse
from PIL import Image


def build_prompt_for_caption_pair(caption_a_text, caption_b_text, contest_no):
    """Build the text instruction for one caption-comparison example.

    Args:
        caption_a_text: Caption presented as option A.
        caption_b_text: Caption presented as option B.
        contest_no: Contest number interpolated into the prompt.

    Returns:
        A single string made of three parts: the cartoon/contest header, the
        question listing both captions, and the required
        ``<think>``/``<answer>`` output format.
    """
    # The exact newline layout matters: the same text is used to build the
    # training examples, so every blank line is reproduced here.
    header = (
        "The image is a cartoon from the New Yorker Cartoon Caption Contest.\n\n"
        f"Contest number: {contest_no}.\n\n"
    )
    question = (
        "I will provide you with two captions; one of them is deemed funnier by people.\n"
        f"Which of the following captions is funnier: A: {caption_a_text} B: {caption_b_text}?\n\n"
        f"A) {caption_a_text}\n\n"
        f"B) {caption_b_text}\n\n\n"
    )
    output_format = (
        "Think step by step.\n"
        "Please write your reasoning between <think> </think> tags,\n"
        "and your final answer between <answer> </answer> tags.\n\n"
        "Your answer should look like: <think> your reasoning here </think> <answer> your answer here </answer>"
    )
    return header + question + output_format

def load_data(file_path):
    """Read a JSON file and return its decoded content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding, so
    # non-ASCII caption text loads the same way on every machine.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def load_pil_image(image_path):
    """Open an image file and return it as an RGB PIL image.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The image converted to mode ``"RGB"``.
    """
    # The context manager closes the opened file on exit; convert() produces a
    # separate image object, so the returned image stays usable afterwards.
    with Image.open(image_path) as img:
        return img.convert("RGB")


def convert_to_conversation(sample):
    """Turn one raw dataset record into a two-turn chat example.

    The record is read through ``sample['caption_choices']['0']`` (a pair of
    captions followed by the answer), ``sample["contest_number"]``,
    ``sample["responses"]["0"]`` and ``sample["image"]`` (a path handed to
    ``load_pil_image``).

    Args:
        sample: One record of the loaded dataset.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}`` where the user turn holds
        the text instruction followed by the image, and the assistant turn
        holds the response wrapped in ``<think>`` tags plus the answer wrapped
        in ``<answer>`` tags.
    """
    choice = sample['caption_choices']['0']
    captions = choice[0]
    caption_a_text, caption_b_text = captions[0], captions[1]
    answer = choice[1]
    instruction = build_prompt_for_caption_pair(
        caption_a_text, caption_b_text, sample["contest_number"]
    )

    reasoning = sample["responses"]["0"]
    assistant_text = f"<think>{reasoning}</think> <answer>{answer}</answer>"

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": load_pil_image(sample["image"])},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": assistant_text}],
    }
    return {"messages": [user_turn, assistant_turn]}


# Load the raw records and convert each one into a chat-style example.
dataset_path = "./sft_data.json"
dataset = load_data(dataset_path)
converted_dataset = list(map(convert_to_conversation, dataset))

# Positional hold-out (no shuffling): the first 10 examples are used for
# validation and everything after them for training.
val_dataset, train_dataset = converted_dataset[:10], converted_dataset[10:]

### Inference

In [7]:
# Baseline generation on the first raw record, run before any training.
FastVisionModel.for_inference(model) # Enable for inference!
sample = dataset[0]
image = load_pil_image(sample["image"])
caption_a_text = sample['caption_choices']['0'][0][0]
caption_b_text = sample['caption_choices']['0'][0][1]
contest_number = sample["contest_number"]
instruction = build_prompt_for_caption_pair(caption_a_text, caption_b_text, contest_number)


# {"type": "image"} is only a placeholder in the chat message; the actual
# image object is passed to the tokenizer call below.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
# NOTE(review): add_special_tokens = False presumably avoids adding special
# tokens a second time after the chat template - confirm.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# The streamer prints generated text as it is produced; the return value of
# generate() is discarded.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 512,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

<think>
Let's analyze both captions in the context of humor and their potential to elicit laughter from readers.

Caption A: "Don't laugh . Her hot flashes have completely stopped."
This caption suggests that the snowman, who looks disappointed, has been left outside by its creators, causing it not to suffer from hot flashes, an experience associated with menopause in humans. While this introduces an unusual situation involving a snowman, the humor is somewhat predictable and may not appeal to a wide audience as it lacks a fresh or unexpected twist.

Caption B: "It's all the Witness Protection Program would offer him."
This caption is a play on words where "him" refers to the snowman. The humor comes from the implication that the snowman is being offered something so meager or absurd that even the Witness Protection Program (which typically offers more substantial protection) seems better. The juxtaposition of the雪man needing protection and the seriousness of a Witness Protection Progr

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We train for 200 steps here (`max_steps = 200`); for a single full pass over the data, set `num_train_epochs=1` and remove the `max_steps` setting. We also support TRL's `DPOTrainer`!

We use our new `UnslothVisionDataCollator` which will help in our vision finetuning setup.

In [None]:
# Build the SFT training configuration and the trainer for vision finetuning.
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

args = SFTConfig(
    # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step on one GPU.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 200,
    learning_rate = 2e-4,
    # Exactly one of fp16 / bf16 is enabled, depending on bf16 support.
    fp16 = not is_bf16_supported(),
    bf16 = is_bf16_supported(),
    logging_steps = 1,
    # Evaluate on eval_dataset every 10 steps; no checkpoints are written.
    eval_strategy = "steps",
    eval_steps = 10,
    save_strategy = "no",
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "wandb",              # Enable wandb
    run_name = "qwen2_5_7b_vl_sft",  # Optional, custom name

    # The datasets are already lists of {"messages": ...} examples, so dataset
    # preparation is skipped and all columns are kept for the data collator.
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    dataset_num_proc = 4,
    max_seq_length = 2048,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    args = args
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Unsloth: Model does not have a default image size - using 512


In [11]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A40. Max memory = 44.448 GB.
15.785 GB of memory reserved.


In [12]:
# Run fine-tuning; trainer_stats.metrics is read later to report the training runtime.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 369 | Num Epochs = 5 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 40,370,176/8,332,536,832 (0.48% trained)
[34m[1mwandb[0m: Currently logged in as: [33mhavural24[0m ([33mhavural24-ko-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,2.1734,1.987587
20,1.5182,1.483255
30,1.2777,1.356676
40,1.3252,1.311687
50,1.3009,1.284522
60,1.2154,1.267319
70,1.1811,1.245915
80,1.2014,1.228984
90,1.1781,1.223435
100,1.1037,1.211096


Unsloth: Not an error, but Qwen2_5_VLForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [13]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

896.4811 seconds used for training.
14.94 minutes used for training.
Peak reserved memory = 16.67 GB.
Peak reserved memory for training = 0.885 GB.
Peak reserved memory % of max memory = 37.504 %.
Peak reserved memory for training % of max memory = 1.991 %.


<a name="Inference"></a>
### Inference


In [16]:
# Generation with the fine-tuned model on the first raw record, followed by the
# reference answer for comparison. dataset[0] falls inside the first-10 slice
# that was held out as val_dataset.
FastVisionModel.for_inference(model) # Enable for inference!

sample = dataset[0]
image = load_pil_image(sample["image"])
caption_a_text = sample['caption_choices']['0'][0][0]
caption_b_text = sample['caption_choices']['0'][0][1]
answer = sample['caption_choices']['0'][1]
contest_number = sample["contest_number"]
# build_prompt_for_caption_pair requires the contest number as its third
# argument; passing it also keeps this prompt identical to the one used to
# build the training examples.
instruction = build_prompt_for_caption_pair(caption_a_text, caption_b_text, contest_number)

# {"type": "image"} is only a placeholder in the chat message; the actual
# image object is passed to the tokenizer call below.
# NOTE(review): the training conversations list the text before the image,
# while this message lists the image first - confirm the ordering is intended.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# The streamer prints generated text as it is produced; the return value of
# generate() is discarded.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 800,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

print(answer)

<think>Alright, let's tackle this step by step. First, when I look at the cartoon, I see the scene outside someone's living room window. There's a man standing in a top hat and overcoat making a bow gesture toward his snowman friend. The man seems content with his creation, but the woman inside appears confused or disapproving, which sets up the classic "snowman as friend" trope while adding a modern twist by including her present.

Next, who's speaking? The captions come from different characters. Caption A could be said by the man to his wife, commenting on why she doesn't want him building a snowman anymore. Caption B likely comes from an off-panel narrator, possibly hinting darkly at the source of their mannequin-like figure.

Now, reconstructing the situation: It's winter, so they're in a snowy setting. The man feels supported by his snowman buddy, treating it as a friendship despite its inability to reciprocate. His wife looks unimpressed inside, creating conflict between their p

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Select ONLY 1 to save! (Both not needed!)
# Each option is switched on or off by editing the literal `if True:` / `if False:`.

# Save locally to 16bit
# Writes to the local path "unsloth_sft", the same name the Evaluate cell
# passes as --model_name_or_path.
if True: model.save_pretrained_merged("unsloth_sft", tokenizer, save_method="merged_16bit")

# To export and save to your Hugging Face account
# "YOUR_USERNAME" and "PUT_HERE" are placeholders; do not commit a real token in the notebook.
if False: model.push_to_hub_merged("YOUR_USERNAME/unsloth_finetune", tokenizer, token = "PUT_HERE")

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, see the other saving sections of this notebook (16bit saving is covered in "Saving to float16 for VLLM" above).

In [None]:
#model.save_pretrained("lora_model")  # Local saving
#tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

### Evaluate

In [None]:
# Run the external evaluation script on the "unsloth_sft" model path (the name
# used by the merged 16bit save cell); predictions go to predictions_ranking_a.json.
!python evaluation_upd.py \
    --model_name_or_path unsloth_sft \
    --dataset_name jmhessel/newyorker_caption_contest \
    --split test \
    --output_path predictions_ranking_a.json \
    --config_name ranking_from_pixels \
    --task_type ranking \
    --prompt_style alternative