In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.57.0
!pip install --no-deps trl==0.22.2

In [3]:
from unsloth import FastVisionModel
import torch
max_seq_length = 16384
lora_rank = 16
model = FastVisionModel.from_pretrained(
    model_name="unsloth/Qwen3-VL-8B-Instruct-unsloth-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    fast_inference=False,
    gpu_memory_utilization=0.8,
)

==((====))==  Unsloth 2025.11.4: Fast Qwen3_Vl patching. Transformers: 4.57.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/782 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/817 [00:00<?, ?B/s]

In [10]:
model, processor = model  # Unpack the tuple first

model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,

    r = lora_rank,
    lora_alpha = lora_rank,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
    use_gradient_checkpointing = "unsloth",
)

In [11]:
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

dataset = load_dataset("AI4Math/MathVista", split = "testmini")

README.md: 0.00B [00:00, ?B/s]

data/testmini-00000-of-00001-725687bf7a1(…):   0%|          | 0.00/142M [00:00<?, ?B/s]

data/test-00000-of-00002-6b81bd7f7e2065e(…):   0%|          | 0.00/358M [00:00<?, ?B/s]

data/test-00001-of-00002-6a611c71596db30(…):   0%|          | 0.00/386M [00:00<?, ?B/s]

Generating testmini split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5141 [00:00<?, ? examples/s]

In [12]:
def is_numeric_answer(example):
    try:
        float(example["answer"])
        return True
    except:
        return False

dataset = dataset.filter(is_numeric_answer)

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
def resize_image(example):
    image=example["decoded_image"]
    image=image.resize((512,512))
    example["decoded_image"] = image
    return example
dataset = dataset.map(resize_image)

def convert_to_rgb(example):
    image=example["decoded_image"]
    if image.mode != "RGB":
        image=image.convert("RGB")
    example["decoded_image"] = image
    return example
dataset = dataset.map(convert_to_rgb)

Map:   0%|          | 0/566 [00:00<?, ? examples/s]

Map:   0%|          | 0/566 [00:00<?, ? examples/s]

In [14]:
REASONING_START='<REASONING>'
REASONING_END='</REASONING>'
SOLUTION_START='<SOLUTION>'
SOLUTION_END='</SOLUTION>'
def make_conversation(example):
    text_content=(
        f"{example['question']}. Also first provide your reasoning or working out"\
        f" on how you would go about solving the question between {REASONING_START} and {REASONING_END}"
        f" and then your final answer between {SOLUTION_START} and (put a single float here) {SOLUTION_END}"
    )
    prompt=[{"role":"user","content":[{"type":"image"},{"type":"text","text":text_content}]}]
    return {"prompt":prompt,"image":example["decoded_image"], "answer":example["answer"]}
train_dataset = dataset.map(make_conversation)
train_dataset=train_dataset.remove_columns('image')
train_dataset=train_dataset.rename_column('decoded_image','image')

Map:   0%|          | 0/566 [00:00<?, ? examples/s]

In [17]:
train_dataset = train_dataset.map(
    lambda example: {
        "prompt": processor.tokenizer.apply_chat_template(
            example["prompt"],
            tokenize = False,
            add_generation_prompt = True,
        )
    }
)

Map:   0%|          | 0/566 [00:00<?, ? examples/s]

In [28]:
import re
def formatting_reward_func(completions,**kwargs):
    thinking_pattern = re.compile(f"{REASONING_START}(.*?){REASONING_END}", re.DOTALL)
    solution_pattern = re.compile(f"{SOLUTION_START}(.*?){SOLUTION_END}", re.DOTALL)
    scores=[]
    for completion in completions:
        score=0
        thinking_match = re.findall(thinking_pattern, completion)
        solution_match = re.findall(solution_pattern, completion)
        if len(thinking_match)==1:
            score+=1.0
        if len(solution_match)==1:
            score+=1.0
        if len(completion)!=0:
            removal=completion.replace("addCriterion","").replace("\n","")
            if(len(completion)-len(removal))/len(completion) >= 0.5:
                score-=2.0
        scores.append(score)
    return scores
                
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    answer_pattern = f'{SOLUTION_START}(.*?){SOLUTION_END}'

    responses = [re.findall(answer_pattern, completion, re.DOTALL) for completion in completions]
    q = prompts[0]
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:{completions[0]}")
    return [
        2.0 if len(r)==1 and a == r[0].replace('\n','') else 0.0
        for r, a in zip(responses, answer)
    ]

In [19]:
train_dataset[0]["prompt"]

"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>When a spring does work on an object, we cannot find the work by simply multiplying the spring force by the object's displacement. The reason is that there is no one value for the force-it changes. However, we can split the displacement up into an infinite number of tiny parts and then approximate the force in each as being constant. Integration sums the work done in all those parts. Here we use the generic result of the integration.\r\n\r\nIn Figure, a cumin canister of mass $m=0.40 \\mathrm{~kg}$ slides across a horizontal frictionless counter with speed $v=0.50 \\mathrm{~m} / \\mathrm{s}$. It then runs into and compresses a spring of spring constant $k=750 \\mathrm{~N} / \\mathrm{m}$. When the canister is momentarily stopped by the spring, by what distance $d$ is the spring compressed?. Also first provide your reasoning or working out on how you would go about solving the question between <REASONING> and </REASONING> and 

In [20]:
train_dataset[100]["prompt"]

'<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Move the ruler to measure the length of the nail to the nearest inch. The nail is about (_) inches long.. Also first provide your reasoning or working out on how you would go about solving the question between <REASONING> and </REASONING> and then your final answer between <SOLUTION> and (put a single float here) </SOLUTION><|im_end|>\n<|im_start|>assistant\n'

In [24]:
image = train_dataset[100]["image"]
prompt = train_dataset[100]["prompt"]

inputs = processor(
    images=image,
    text=prompt,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(processor.tokenizer, skip_prompt=True)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=1024,
                   use_cache=True, temperature=1.0, min_p=0.1)

  return torch._C._get_cublas_allow_tf32()


<REASONING>
To measure the length of the nail to the nearest inch, I need to determine how long the nail is by comparing it to the ruler.

Step 1: Identify the starting point of the nail.
The head of the nail (the circular part) is aligned with the 0-inch mark on the ruler.

Step 2: Identify the ending point of the nail.
The sharp tip of the nail extends past the 3-inch mark but does not reach the 4-inch mark.

Step 3: Determine the length to the nearest inch.
Since the tip is between 3 and 4 inches, and we are rounding to the nearest inch, I need to see which whole number it is closer to. The tip is clearly past 3 and before 4, so I need to estimate if it's closer to 3 or 4.

Looking at the visual: the tip appears to be just shy of the 4-inch mark. It’s about 3.5 inches long (since it’s roughly halfway between 3 and 4), but since we need to round to the nearest inch, 3.5 rounds up to 4. However, let’s be precise. The nail's tip looks to be at approximately 3.2 to 3.4 inches. Since thi

In [None]:
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    log_completions = False,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 2, # Decrease if out of memory
    max_prompt_length = 1024,
    max_completion_length = 1024,
    num_train_epochs = 0.1, # Set to 1 for a full training run
    max_steps = 60,
    save_steps = 60,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",

    # Below enables GSPO:
    importance_sampling_level = "sequence",
    mask_truncated_completions = False,
    loss_type = "dr_grpo",
)

Unsloth: We now expect `per_device_train_batch_size` * `gradient_accumulation_steps` * `world_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 2


In [None]:
trainer = GRPOTrainer(
    model = model,
    args = training_args,
    # Pass the processor to handle multimodal inputs
    processing_class = processor,
    reward_funcs = [
        formatting_reward_func,
        correctness_reward_func,
    ],
    train_dataset = train_dataset,
)

trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 566 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 43,646,976 of 8,810,770,672 (0.50% trained)


-------------------- Question:
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>What is the highest value on the X axis?. Also first provide your reasoning or working out on how you would go about solving the question between <REASONING> and </REASONING> and then your final answer between <SOLUTION> and (put a single float here) </SOLUTION><|im_end|>
<|im_start|>assistant
 
Answer:
30 
Response:<REASONING>
To determine the highest value on the X-axis of the given graph, I need to examine the horizontal axis (X-axis) which is labeled "MICROGRAMS/ml-E-DNP-LYSINE-HCL". The X-axis has tick marks with labeled values: 0, 5, 10, 15, 20, 25, and 30. The last tick mark on the right side of the axis is labeled "30". Therefore, the highest value shown on the X-axis is 30.

</REASONING>
<SOLUTION>30.0</SOLUTION>
Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / formatting_reward_func / mean,rewards / formatting_reward_func / std,rewards / correctness_reward_func / mean,rewards / correctness_reward_func / std
1,0.0,2.0,0.0,155.5,134.0,177.0,0.0,155.5,134.0,177.0,0.0,2.0,0.0,0.0,0.0


-------------------- Question:
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>What is the age gap between these two people in image?. Also first provide your reasoning or working out on how you would go about solving the question between <REASONING> and </REASONING> and then your final answer between <SOLUTION> and (put a single float here) </SOLUTION><|im_end|>
<|im_start|>assistant
 
Answer:
6 
Response:<REASONING>
To determine the age gap between the two people in the image, I would need to analyze visual cues such as facial features, body proportions, hairstyle, clothing style, and context (e.g., era of the photo). However, this is a black-and-white photograph, and without clear identifiers like birth dates, names, or other contextual information, it is impossible to determine their exact ages or the difference between them with any degree of certainty.

In such cases, we must consider whether there is any other information provided in the image that might help. The im