## Packages Installation

In [None]:
# Dependency setup for the Unsloth + TRL GRPO stack used in this notebook.
# The first and third commands pass --no-deps, so pip installs only the named
# packages and does not pull in their transitive requirements.
# NOTE(review): the import output recorded further down reports that this
# xformers build targets PyTorch 2.6.0+cu124 while the runtime has 2.8.0+cu126,
# and that memory-efficient attention is therefore unavailable — confirm whether
# the xformers pin should be moved to a build matching the installed torch.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --no-deps unsloth
# gdown is used in the Data section to download the training folder from Google Drive.
!pip install gdown

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting trl
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.8.8-py3-none-any.whl.metadata (9.4 kB)
Downloading xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.21.0-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import json
import os
import re
from typing import Dict, Any

from unsloth import FastVisionModel

import torch
from PIL import Image
from datasets import Dataset
from trl import GRPOTrainer, GRPOConfig

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.8.0+cu126)
    Python  3.12.9 (you have 3.12.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


## Model

In [None]:
# Checkpoint to start from. The alternative noted by the author is
# "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit".
base_checkpoint = "lmq1909/Qwen2.5-VL-7B-LQA-1e"

# Load the vision-language model in 4-bit together with its tokenizer object,
# using Unsloth's gradient-checkpointing mode.
model, tokenizer = FastVisionModel.from_pretrained(
    base_checkpoint,
    use_gradient_checkpointing = "unsloth",
    load_in_4bit = True,
)

==((====))==  Unsloth 2025.8.9: Fast Qwen2_5_Vl patching. Transformers: 4.55.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

video_preprocessor_config.json: 0.00B [00:00, ?B/s]

In [None]:
lora_rank = 4

# All LoRA settings in one mapping so they can be read (and tuned) at a glance.
lora_settings = dict(
    # Which parts of the network are fine-tuned.
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = False,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    # Adapter hyper-parameters; alpha is kept at twice the rank.
    r = lora_rank,
    lora_alpha = 2 * lora_rank,
    lora_dropout = 0,
    bias = "none",
    use_rslora = False,
    loftq_config = None,
    # Fixed seed for reproducible adapter initialisation.
    random_state = 3407,
)

# Rebinds `model` to the adapter-wrapped model returned by Unsloth.
model = FastVisionModel.get_peft_model(model, **lora_settings)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


## Data

In [None]:
!gdown --folder https://drive.google.com/drive/u/1/folders/1rNk1xV03yMhiSdP4zpa9vLZ-TchZNld1
!unzip '/content/vlsp-train/train_images.zip'

Retrieving folder contents
Processing file 15NXOSpWhJ51zCqrW89vXOtfyJaiYVlDR train_images.zip
Processing file 1cJbA7-W9h4hfQcziFZXbu_4atVLoJxG8 vlsp_2025_train.json
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=15NXOSpWhJ51zCqrW89vXOtfyJaiYVlDR
From (redirected): https://drive.google.com/uc?id=15NXOSpWhJ51zCqrW89vXOtfyJaiYVlDR&confirm=t&uuid=5b5f32cc-0751-409a-a49e-ef0cb01f2b36
To: /content/vlsp-train/train_images.zip
100% 27.0M/27.0M [00:00<00:00, 119MB/s]
Downloading...
From: https://drive.google.com/uc?id=1cJbA7-W9h4hfQcziFZXbu_4atVLoJxG8
To: /content/vlsp-train/vlsp_2025_train.json
100% 1.86M/1.86M [00:00<00:00, 137MB/s]
Download completed
Archive:  /content/vlsp-train/train_images.zip
   creating: train_images/
  inflating: train_images/train_1_1.jpg  
  inflating: train_images/train_1_10.jpg  
  inflating: train_images/train_1_11.jpg  
  inflating: train_image

In [None]:
# Location of the annotation file downloaded by the previous cell.
train_path = "/content/vlsp-train/vlsp_2025_train.json"

# Read the whole file as UTF-8 text and parse it; `train` holds the parsed
# samples that the following cells iterate over.
with open(train_path, mode="r", encoding="utf-8") as train_file:
    train = json.loads(train_file.read())

In [None]:
# System message placed at the start of every conversation built by
# create_conversation(). It is written in Vietnamese: it sets the assistant up as
# a legal assistant for road-traffic law and traffic signs, requires answers in
# Vietnamese, and describes the expected layout: reasoning inside
# <think>...</think>, immediately followed by the answer.
# The adjacent literals are concatenated as-is, so there is no newline between
# the <think> tags and the bracketed placeholders.
system_prompt=(
    "Bạn là một trợ lý pháp lý tiếng Việt chuyên phân tích câu hỏi liên quan đến luật giao thông đường bộ và biển báo. "
    "Luôn trả lời hoàn toàn bằng tiếng Việt.\n\n"
    "Trả lời theo định dạng sau:\n"
    "<think>"
    "[Suy nghĩ, phân tích của bạn]"
    "</think>"
    "[Câu trả lời của bạn]"
)

def create_conversation(input):
    """Build the system / user / assistant chat messages for one training sample.

    Args:
        input: Sample mapping. Reads "image_id", "question_type", "question",
            "answer_think", and for multiple-choice samples "answer" and
            "choices" (keys "A" to "D").

    Returns:
        A three-element list of message dicts: the system prompt, a user turn
        holding the question text plus the opened image, and an assistant turn
        holding ``input["answer_think"]``.

    Side effects:
        For a multiple-choice sample whose "answer" equals 40, the sample is
        modified in place so that "answer" becomes "A".
    """
    # The image is opened first, before any other field of the sample is read.
    img = Image.open("/content/train_images/" + input["image_id"] + ".jpg")

    is_multiple_choice = input["question_type"] == "Multiple choice"

    if not is_multiple_choice:
        # Any other question type is prompted as a True/False ("Đúng"/"Sai") question.
        user_prompt = (
            "Dựa vào bức ảnh, hãy phân tích kỹ trước khi trả lời câu hỏi.\n\n"
            "Loại câu hỏi: Đúng/Sai (kết luận cuối cùng chỉ là một từ: 'Đúng' hoặc 'Sai'. Không được giải thích gì thêm.)\n\n"
            f"Câu hỏi: {input['question']}\n\n"
            "Hãy trả lời theo định dạng:\n"
            "<think>…phân tích chi tiết…</think>[Câu trả lời: Đúng/Sai]"
        )
    else:
        # Edge case in the data: an answer stored as the number 40 is rewritten to "A".
        if input["answer"] == 40:
            input["answer"] = "A"

        intro = (
            "Dựa vào bối cảnh bên dưới, hãy phân tích kỹ trước khi trả lời câu hỏi.\n\n"
            "Loại câu hỏi: Trắc nghiệm (kết luận cuối cùng sau khi suy luận là 1 trong 4 lựa chọn: ‘A’, ‘B’, ‘C’, ‘D’. Không được giải thích gì thêm.)\n\n"
            f"Câu hỏi: {input['question']}\n\n"
            "4 lựa chọn:\n"
        )
        # One "<letter>: <text>" line per option, in A-D order.
        option_lines = "".join(
            f"{letter}: {input['choices'][letter]}\n" for letter in "ABCD"
        )
        outro = (
            "\n"
            "Hãy trả lời theo định dạng:\n"
            "<think>…phân tích chi tiết…</think>[Câu trả lời: A/B/C/D]"
        )
        user_prompt = intro + option_lines + outro

    messages = [{"role": "system", "content": system_prompt}]
    messages.append(
        {
            "role": "user",
            "content": [
                {"type": "text",  "text": user_prompt},
                {"type": "image", "image": img},
            ],
        }
    )
    messages.append(
        {
            "role": "assistant",
            "content": [{"type": "text", "text": input["answer_think"]}],
        }
    )
    return messages

In [None]:
def convert_input_to_grpo(input):
    """Convert one raw sample into a row of the GRPO training dataset.

    Args:
        input: Sample mapping as accepted by ``create_conversation``; "answer"
            and "question_type" are also read here.

    Returns:
        dict with:
            "prompt": chat-template text of the system and user turns, ending
                with the generation prompt for the assistant;
            "answer": the reference answer as a string;
            "question_type": the question type as a string.
    """
    # Must run before input["answer"] is read below: create_conversation()
    # normalises an edge-case answer value in place.
    conversation = create_conversation(input)

    # create_conversation() also returns the reference assistant turn
    # (input["answer_think"]). Rendering it into the prompt would show the policy
    # the gold reasoning/answer before it generates, so only the non-assistant
    # turns are kept for the GRPO prompt.
    prompt_messages = [msg for msg in conversation if msg["role"] != "assistant"]

    rendered_prompt = tokenizer.apply_chat_template(
        prompt_messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # NOTE(review): the returned row carries only text fields; the PIL image put
    # into the user turn is not part of it — confirm whether the trainer is
    # expected to receive the image through a separate dataset column.
    return {
        "prompt": rendered_prompt,
        "answer": str(input["answer"]),
        "question_type": str(input["question_type"])
    }

# Convert every raw sample into a GRPO row, then wrap the rows in a Dataset.
train_grpo = list(map(convert_input_to_grpo, train))
train_ds = Dataset.from_list(train_grpo)
print("The number of train samples: ", len(train_ds))

The number of train samples:  530


## Reward Function

In [None]:
# <think>[reasoning...]</think>[answer]
def _extract_reasoning(text: str) -> str:
    if not text: return ""
    m = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
    return m.group(1).strip().lower() if m else ""

def _extract_answer(text: str) -> str:
    if not text: return ""
    m = re.search(r'<think>.*?</think>(.*)', text, re.DOTALL)
    return m.group(1).strip().lower() if m else ""

def reward_func(prompts, completions, answer, question_type, **kwargs):
    """Score each completion on formatting, reasoning and factual correctness.

    The maximum score is 10: 1 for formatting, 3 for reasoning and 6 for a
    correct final answer.

    Args:
        prompts: Prompts, iterated in lockstep with the other sequences (their
            content is not inspected).
        completions: Generated texts, expected as ``<think>...</think>answer``.
        answer: Reference answers, one per completion.
        question_type: Question type per completion. "Multiple choice" checks the
            reasoning for the options a-d; any other value checks for
            "đúng"/"sai".
        **kwargs: Accepted and ignored.

    Returns:
        A list with one numeric score per scored completion.
    """
    # Weights of the three top-level components.
    w_formatting = 1
    w_reasoning = 3
    w_factual = 6

    # How the reasoning component is split between its three sub-criteria.
    sw_length = 0.3
    sw_comprehensive = 0.5
    sw_keyword = 0.2

    reasoning_keywords = [
        "phân tích", "giải thích", "so sánh", "đối chiếu",
        "nguyên nhân", "kết luận", "lý do", "bước", "giả thiết"
    ]

    rewards = []
    for _, completion, gold, qtype in zip(prompts, completions, answer, question_type):
        reasoning = _extract_reasoning(completion)
        extracted_answer = _extract_answer(completion)
        reasoning_lc = reasoning.lower()

        score = 0.0

        # Formatting: half the weight for a non-empty reasoning block, half for
        # a non-empty answer after it.
        if reasoning:
            score += w_formatting / 2
        if extracted_answer:
            score += w_formatting / 2

        # Reasoning (1): length bonus, growing linearly up to 2048 words.
        word_count = len(reasoning.split())
        score += w_reasoning * sw_length * min(word_count / 2048, 1.0)

        # Reasoning (2): fraction of the candidate answers that the reasoning
        # mentions as whole words.
        if qtype == "Multiple choice":
            options = ["a", "b", "c", "d"]
        else:
            options = ["đúng", "sai"]
        share = 1 / len(options)
        coverage = 0
        for option in options:
            if re.search(rf"\b{option}\b", reasoning_lc):
                coverage += share
        score += w_reasoning * sw_comprehensive * min(coverage, 1.0)

        # Reasoning (3): fraction of the reasoning keywords that occur as substrings.
        keyword_hits = 0
        for keyword in reasoning_keywords:
            if keyword in reasoning_lc:
                keyword_hits += 1
        if keyword_hits > 0:
            score += w_reasoning * sw_keyword * min(keyword_hits / len(reasoning_keywords), 1.0)

        # Factual: exact match between the normalised reference and the extracted answer.
        if gold.strip().lower() == extracted_answer:
            score += w_factual

        rewards.append(score)
    return rewards


## Training

In [None]:
# Settings common to the full training run and the short test run.
_shared_grpo_kwargs = dict(
    output_dir="outputs",
    seed=3407,

    # Optimisation
    learning_rate=1e-5,
    gradient_accumulation_steps=2,
    per_device_train_batch_size=2,
    fp16=False,
    bf16=True,

    # Generation / GRPO
    num_generations=2,
    max_prompt_length=4096,
    max_completion_length=4096,
    temperature=0.7,
    beta=0.05,

    logging_steps=1,
)

# Full run: 200 steps, checkpoint every 20 steps, keep the two most recent.
grpo_config = GRPOConfig(
    run_name="main_run",
    max_steps=200,
    save_steps=20,
    save_total_limit=2,
    report_to=["none"],
    **_shared_grpo_kwargs,
)

# Short run: 10 steps, a single checkpoint at the end.
grpo_config_test = GRPOConfig(
    run_name="test_run",
    max_steps=10,
    save_steps=10,
    save_total_limit=1,
    report_to=["none"],
    **_shared_grpo_kwargs,
)

In [None]:
# Assemble the GRPO trainer from the LoRA-wrapped model, the full-run settings
# (grpo_config; grpo_config_test is the 10-step variant) and the prompt dataset.
# reward_func is the only reward function passed in.
trainer = GRPOTrainer(
    model=model,
    tokenizer=tokenizer,
    reward_funcs=reward_func,
    args=grpo_config,
    train_dataset=train_ds,
)

In [None]:
# Run GRPO training with the trainer built above; metrics are logged every step
# (logging_steps=1) and appear in the table below.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 530 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 2,523,136 of 8,294,689,792 (0.03% trained)
`generation_config` default values have been modified to match model-specific defaults: {'max_length': 128000, 'repetition_penalty': 1.05, 'bos_token_id': 151643}. If this is not desired, please set these values explicitly.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,entropy,rewards / reward_func / mean,rewards / reward_func / std
1,0.0,6.535787,3.149716,465.75,169.0,807.0,0.0,465.75,169.0,807.0,0.0,0,6.535787,4.35915
2,-0.0,3.9927,0.408934,113.5,17.0,193.0,0.0,113.5,17.0,193.0,0.0,No Log,3.9927,4.634491
3,0.0002,4.018705,0.451926,152.0,69.0,331.0,0.0,152.0,69.0,331.0,0.003898,No Log,4.018705,4.669651
4,0.0001,4.745561,4.533039,476.5,15.0,1132.0,0.0,476.5,15.0,1132.0,0.001805,No Log,4.745561,3.937034
5,0.0003,4.013399,0.458405,50.25,11.0,152.0,0.0,50.25,11.0,152.0,0.006848,No Log,4.013399,4.664406
6,0.0001,6.302042,3.561912,321.25,109.0,786.0,0.0,321.25,109.0,786.0,0.001221,No Log,6.302042,4.245253
7,0.0,4.280981,0.00404,287.75,175.0,374.0,0.0,287.75,175.0,374.0,0.001183,No Log,4.280981,4.943254
8,0.0001,6.59985,3.293047,353.5,136.0,848.0,0.0,353.5,136.0,848.0,0.001003,No Log,6.59985,4.405197
9,0.0001,5.02091,0.970597,403.0,351.0,496.0,0.0,403.0,351.0,496.0,0.001184,No Log,5.02091,4.439838
10,0.0001,2.127747,3.009088,178.25,40.0,346.0,0.0,178.25,40.0,346.0,0.001139,No Log,2.127747,4.255493


## Save Model

In [None]:
# Directory that receives both the merged 16-bit weights and the tokenizer files.
final_dir = "./final"

model.save_pretrained_merged(final_dir, tokenizer, save_method="merged_16bit")
tokenizer.save_pretrained(final_dir)

In [None]:
repo = "venomsnaker/Qwen2.5-VL-7B-Instruct-GRPO"

# Read the write token from the environment so that it is never stored in the
# notebook (or in its saved outputs).
hf_write_token = os.environ.get("HF_TOKEN")
if not hf_write_token:
    # Fail early with an actionable message instead of attempting an
    # unauthenticated upload of the merged weights.
    raise RuntimeError(
        "HF_TOKEN is not set; export a Hugging Face write token before pushing to the Hub."
    )

# Upload the merged 16-bit model together with the tokenizer to the Hub repo.
model.push_to_hub_merged(
    repo, tokenizer, save_method="merged_16bit", token=hf_write_token
)