In [None]:
import torch
import requests

from PIL import Image
from transformers import AutoProcessor, AutoModel
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

In [None]:
# System prompt asking the model to put its reasoning inside <think> tags and its
# final answer inside <answer> tags. The fragments are joined with single spaces.
SYSTEM_PROMPT = " ".join(
    [
        "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant",
        "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning",
        "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e.,",
        "<think> reasoning process here </think><answer> answer here </answer>",
    ]
)

In [None]:
def make_eagle_multiview_conversation(example, processor):
    """Build a single-turn chat prompt from an example's multi-view prompt blocks.

    Dict blocks whose ``type`` is ``"image_url"`` are handed to the chat template
    with ``type`` set to ``"image"``; every other block is forwarded unchanged.
    The blocks stored in ``example`` are not modified: retagged blocks are
    shallow copies.

    Args:
        example: Mapping with a ``"prompt_blocks"`` entry holding the content blocks.
        processor: Object exposing
            ``apply_chat_template(conversation, add_generation_prompt=...)``.

    Returns:
        dict: ``{"prompt": <value returned by apply_chat_template>}``.
    """
    content = []
    for block in example["prompt_blocks"]:
        if isinstance(block, dict) and block.get("type") == "image_url":
            # Copy before retagging so the caller's data is not mutated in place.
            block = {**block, "type": "image"}
        content.append(block)

    conversation = [{"role": "user", "content": content}]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    return {"prompt": prompt}

In [None]:
def make_conversation(example, processor):
    """Convert one dataset example into a chat-formatted prompt plus companion fields.

    The conversation has a system turn carrying ``SYSTEM_PROMPT`` and a user turn
    with an image placeholder followed by the example's problem text.

    Args:
        example: Mapping providing ``"problem"``, ``"image"`` and ``"solution"``.
        processor: Object exposing
            ``apply_chat_template(conversation, add_generation_prompt=...)``.

    Returns:
        dict: ``"prompt"`` (the chat template's output), ``"image"`` and
        ``"solution"`` (both copied from ``example``).
    """
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": example["problem"]},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    return {
        "prompt": prompt,
        "image": example["image"],
        # Keep the ground truth next to the prompt; accuracy_reward takes a `solution` argument.
        "solution": example["solution"],
    }

In [None]:
# Load the first 5% of the train split, then hold out 20% of that slice for testing.
dataset_id = "lmms-lab/multimodal-open-r1-8k-verified"
dataset = load_dataset(dataset_id, split="train[:5%]")

# A fixed seed is passed so the train/test partition is the same on every run.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = split_dataset["train"], split_dataset["test"]

In [None]:
model_id = "nvidia/Eagle2-2B"

# Load the processor for the same checkpoint id that the model cell uses.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
processor.tokenizer.padding_side = "left"

# Dataset.map has no `processor` keyword; extra arguments for the mapped function
# are supplied through `fn_kwargs`.
train_dataset = train_dataset.map(make_conversation, fn_kwargs={"processor": processor})

In [None]:
# Inspect the chat-formatted prompt stored for the first training example.
print(train_dataset[0]["prompt"])

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
target_device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base checkpoint in bfloat16 on the selected device.
model = AutoModel.from_pretrained(
    model_id,
    device_map=target_device,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings, applied to the modules named q_proj and v_proj.
lora_config = LoraConfig(
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapters, then report the trainable parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
import re


def format_reward(completions, **kwargs):
    """Score each completion 1.0 if it follows the think/answer layout, otherwise 0.0.

    The expected layout is a ``<think>`` block followed by an ``<answer>`` block,
    with a newline after each opening tag and before each closing tag.

    NOTE(review): SYSTEM_PROMPT illustrates the tags separated by spaces rather
    than newlines — confirm that requiring newlines here is intended.

    Args:
        completions: Iterable of completion strings.
        **kwargs: Accepted and ignored.

    Returns:
        list[float]: One score per completion, in order.
    """
    layout = re.compile(
        r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$",
        re.DOTALL | re.MULTILINE,
    )
    scores = []
    for text in completions:
        scores.append(1.0 if layout.match(text) else 0.0)
    return scores

In [None]:
from math_verify import LatexExtractionConfig, parse, verify
from latex2sympy2_extended import NormalizationConfig
from typing import Optional


def accuracy_reward(completions: list[str], solution: list[str], **kwargs) -> list[Optional[float]]:
    """Score each completion against its ground-truth solution.

    For every ``(completion, solution)`` pair:

    - If ``parse`` extracts something from the solution, the completion is parsed
      with a LaTeX extraction config and the reward is ``float(verify(...))``.
      If that parsing or verification raises, the failure is printed and the
      reward is ``None``.
    - If nothing is extracted from the solution (or parsing it raises), the reward
      is 1.0 when the stripped, lower-cased completion equals the stripped,
      lower-cased solution, else 0.0.

    Args:
        completions: Completion strings (each is used with ``str`` methods and
            passed to ``parse`` directly).
        solution: Ground-truth strings, aligned with ``completions``.
        **kwargs: Accepted and ignored.

    Returns:
        list[Optional[float]]: One reward per pair, in order.
    """
    rewards: list[Optional[float]] = []

    for completion, sol in zip(completions, solution):
        try:
            gold_parsed = parse(sol, extraction_mode="first_match")
        except Exception:
            # Treat a gold answer that cannot be parsed like one with no match.
            gold_parsed = []

        if len(gold_parsed) == 0:
            # Nothing to verify mathematically: fall back to a normalized text match.
            rewards.append(float(completion.strip().lower() == sol.strip().lower()))
            continue

        try:
            answer_parsed = parse(
                completion,
                extraction_config=[
                    LatexExtractionConfig(
                        normalization_config=NormalizationConfig(
                            nits=False,
                            malformed_operators=False,
                            basic_latex=True,
                            boxed="all",
                            units=True,
                        ),
                        boxed_match_priority=0,
                        try_extract_without_anchor=False,
                    )
                ],
                extraction_mode="first_match",
            )
            reward = float(verify(gold_parsed, answer_parsed))
        except Exception as e:
            print(f"verify failed: {e}, answer: {completion}, gold: {sol}")
            reward = None

        rewards.append(reward)

    return rewards

In [None]:
from trl import GRPOConfig

# GRPO training configuration.
# NOTE(review): output_dir still names Qwen2.5-VL-3B-Instruct while model_id is
# nvidia/Eagle2-2B — confirm the intended run/repository name.
training_args = GRPOConfig(
    # Checkpointing and upload
    output_dir="Qwen2.5-VL-3B-Instruct-Thinking",
    save_strategy="steps",
    save_steps=10,
    push_to_hub=True,
    # Optimisation
    learning_rate=1e-5,
    num_train_epochs=1,
    bf16=True,
    per_device_train_batch_size=2,
    # Prompt / completion handling
    max_prompt_length=2048,
    max_completion_length=1024,  # default: 256
    num_generations=2,  # default: 8
    remove_unused_columns=False,  # to access the solution column in accuracy_reward
    # Reporting
    report_to=["wandb"],
    logging_steps=10,
)

In [None]:
from trl import GRPOTrainer

# Assemble the GRPO trainer from the LoRA-wrapped model, the processor, the
# prepared training split and the two reward functions.
trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=processor,
    reward_funcs=[format_reward, accuracy_reward],
)

In [None]:
# Run the training loop; as the cell's last expression, its return value is displayed.
trainer.train()

In [None]:
# Save the trained model under training_args.output_dir, then push it to the Hub,
# passing the training dataset id as `dataset_name`.
trainer.save_model(training_args.output_dir)
trainer.push_to_hub(dataset_name=dataset_id)

# Testing

In [None]:
import gc
import time


def clear_memory(pause: float = 2.0):
    """Drop the notebook's large globals and release cached GPU memory.

    Removes ``inputs``, ``model``, ``processor``, ``trainer``, ``peft_model`` and
    ``bnb_config`` from this module's globals when they exist, then runs garbage
    collection. When CUDA is available it also empties the CUDA cache,
    synchronizes, and prints the allocated and reserved GPU memory. Without CUDA
    those GPU steps are skipped, so the function also works on CPU-only machines.

    Args:
        pause: Seconds to sleep between clean-up steps. Defaults to 2.0; pass 0
            to skip the waits.
    """
    module_globals = globals()
    for name in ("inputs", "model", "processor", "trainer", "peft_model", "bnb_config"):
        # pop() with a default removes the name only if it is currently defined.
        module_globals.pop(name, None)
    time.sleep(pause)

    gc.collect()
    time.sleep(pause)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        time.sleep(pause)

    gc.collect()
    time.sleep(pause)

    if cuda_ready:
        print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    else:
        print("CUDA is not available; skipped GPU cache clean-up.")


# Release the training-time objects before the fine-tuned checkpoint is loaded for inference.
clear_memory()

In [None]:
# Hub repository id of the fine-tuned checkpoint that the following cells load for inference.
trained_model_id = "sergiopaniego/Qwen2.5-VL-3B-Instruct-Thinking"

In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

# Processor for the fine-tuned checkpoint, configured for left padding.
trained_processor = AutoProcessor.from_pretrained(trained_model_id, use_fast=True, padding_side="left")

# Fine-tuned model; dtype and device placement are both left on "auto".
# NOTE(review): this loads a Qwen2.5-VL class while the training cells use
# nvidia/Eagle2-2B — confirm the checkpoint id matches the model being trained.
trained_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    trained_model_id,
    device_map="auto",
    torch_dtype="auto",
)

In [None]:
import time
import torch
from qwen_vl_utils import process_vision_info


def generate_with_reasoning(problem, image):
    """Run the fine-tuned model on one image/question pair and time the generation.

    Args:
        problem: Question text placed in the user turn.
        image: Image placed in the user turn; it reaches the model through
            ``process_vision_info``.

    Returns:
        tuple: ``(generated_text, inference_duration, num_generated_tokens)``.
        ``generated_text`` is the decoding of ``output_ids[0]``,
        ``inference_duration`` is the wall-clock seconds spent in ``generate``,
        and ``num_generated_tokens`` is the output length minus the input length.
    """
    # Conversation setting for sending to the model
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": problem},
            ],
        },
    ]
    prompt = trained_processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

    # Process images using the process_vision_info from qwen_vl_utils
    image_inputs, video_inputs = process_vision_info(conversation)

    # Use the processor loaded for the fine-tuned checkpoint: the training-time
    # `processor` global is removed by clear_memory() before this function runs.
    inputs = trained_processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(trained_model.device)

    # Generate text without gradients, timing only the generate() call
    start_time = time.time()
    with torch.no_grad():
        output_ids = trained_model.generate(**inputs, max_new_tokens=500)
    inference_duration = time.time() - start_time

    # Decode the first (only) sequence of the batch
    generated_text = trained_processor.decode(output_ids[0], skip_special_tokens=True)

    # Number of newly generated tokens = output length minus prompt length
    num_input_tokens = inputs["input_ids"].shape[1]
    num_generated_tokens = output_ids.shape[1] - num_input_tokens

    return generated_text, inference_duration, num_generated_tokens

In [None]:
# Run the fine-tuned model on the first held-out test example and show the decoded output.
generated_text, inference_duration, num_generated_tokens = generate_with_reasoning(
    test_dataset[0]["problem"], test_dataset[0]["image"]
)
print(generated_text)

In [None]:
# Display the reference solution of the same test example for comparison with the output above.
test_dataset[0]["solution"]