In [1]:
from unsloth import FastVisionModel
import torch
# Options for loading the pre-quantized Qwen2-VL checkpoint. Kept in one
# mapping so the precision / memory trade-offs can be read at a glance.
loading_options = dict(
    load_in_4bit = True,      # Use 4bit to reduce memory use. False for 16bit LoRA.
    max_seq_length = 2048,    # Context length - can be longer, but uses more memory
    load_in_8bit = False,     # A bit more accurate, uses 2x memory
    full_finetuning = False,  # Full finetuning is available; LoRA is used here
)

# Returns the model together with the tokenizer object used by later cells.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit",
    **loading_options,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.4: Fast Qwen2_Vl patching. Transformers: 4.57.6.
   \\   /|    NVIDIA GeForce RTX 5090. Num GPUs = 1. Max memory: 31.367 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


In [2]:
# Which parts of the network receive LoRA adapters.
adapter_scope = dict(
    finetune_vision_layers     = True,  # False if not finetuning vision layers
    finetune_language_layers   = True,  # False if not finetuning language layers
    finetune_attention_modules = True,  # False if not finetuning attention layers
    finetune_mlp_modules       = True,  # False if not finetuning MLP layers
)

# LoRA hyperparameters.
lora_settings = dict(
    r = 8,                # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,      # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    use_rslora = False,   # Rank stabilized LoRA is supported
    loftq_config = None,  # And LoftQ
    # target_modules = "all-linear",  # Optional now! Can specify a list if needed
)

# NOTE: this rebinds `model` to the adapter-wrapped model, so the cell is
# meant to run once per freshly loaded base model.
model = FastVisionModel.get_peft_model(model, **adapter_scope, **lora_settings)

Unsloth: Making `model.base_model.model.model.visual` require gradients


In [3]:
def convert_dataset_format(input_data):
    """Flatten multimodal chat records into text-only messages plus an image list.

    Args:
        input_data: Iterable of records, each with a ``"messages"`` list. Every
            message has a ``"role"`` and a ``"content"`` that is either a list
            of typed parts (``{"type": "image", "image_path": ...}`` or
            ``{"type": "text", "text": ...}``) or any other value, which is
            passed through unchanged.

    Returns:
        A new list with one ``{"messages": [...], "images": [...]}`` dict per
        record. Text parts of a message are joined with single spaces; parts of
        any other type are ignored. ``"images"`` holds the last image path seen
        anywhere in the record, or is empty when that path is missing or falsy.
    """
    converted = []

    for record in input_data:
        flattened = []
        latest_image = None  # overwritten by every image part in the record

        for message in record["messages"]:
            role = message["role"]
            body = message["content"]

            if not isinstance(body, list):
                # Non-list content is forwarded as-is.
                text = body
            else:
                snippets = []
                for piece in body:
                    kind = piece["type"]
                    if kind == "image":
                        latest_image = piece["image_path"]
                    elif kind == "text":
                        snippets.append(piece["text"])
                text = " ".join(snippets)

            flattened.append({"role": role, "content": text})

        converted.append({
            "messages": flattened,
            "images": [latest_image] if latest_image else [],
        })

    return converted


In [4]:
import json

# Read the raw multimodal training records. The encoding is pinned to UTF-8
# because the records contain non-ASCII text (e.g. a degree sign) and open()
# would otherwise fall back to the platform's locale encoding.
with open("workspace/Dataset/train_multimodal.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Flatten each record to {"messages": [...], "images": [...]}. Done after the
# file is closed because the conversion does not need the handle.
raw_data = convert_dataset_format(raw_data)
def normalize_sample(sample):
    """Wrap plain-string message contents in a single text block, in place.

    Each message in ``sample["messages"]`` whose ``"content"`` is a string is
    rewritten to ``[{"type": "text", "text": <that string>}]``; messages with
    any other content are left untouched.

    Returns:
        The same ``sample`` object that was passed in (it is mutated, not copied).
    """
    for message in sample["messages"]:
        body = message["content"]
        if not isinstance(body, str):
            continue
        message["content"] = [{"type": "text", "text": body}]
    return sample


# Normalize every converted record; normalize_sample edits each record in
# place and returns it, so `normalized` holds the same objects as `raw_data`.
normalized = list(map(normalize_sample, raw_data))

# Keep an indented JSON copy of the normalized records next to the source data.
with open("workspace/Dataset/course_dataset_normalized.json", "w") as out_file:
    json.dump(normalized, out_file, indent=2)


In [5]:
import json
import os
from datasets import Dataset

# Root directory of the dataset; image paths stored in the records are joined
# onto it in to_unsloth_vl.
base_path = "workspace/Dataset"

def extract_text(content):
    """Return the text carried by a message's ``content`` field.

    Args:
        content: Either a plain string, or a list of block dicts such as
            ``{"type": "text", "text": "..."}``.

    Returns:
        ``content`` itself when it is a string; otherwise the ``"text"`` value
        of the first block whose ``"type"`` is ``"text"``. An empty string is
        returned when there is no such block, when that block's text is
        missing or ``None``, or when ``content`` is neither a string nor a list.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        for block in content:
            if block.get("type") == "text":
                # Blocks that have been through a `datasets` table carry every
                # key, with None standing in for absent ones, so a `.get`
                # default alone would still let None through.
                text = block.get("text")
                return "" if text is None else text
    return ""

def to_unsloth_vl(examples):
    """Batched mapping callback that rebuilds each row as a user/assistant pair.

    Args:
        examples: Column-oriented batch with a ``"messages"`` column and an
            ``"images"`` column of equal length.

    Returns:
        ``{"messages": [...]}`` with one two-message conversation per row. The
        first message of a row is always treated as the user turn and the
        second as the assistant turn, whatever their ``"role"`` values; any
        further messages are ignored. When the row has images, the first one
        is joined onto ``base_path`` and placed before the user text.
    """
    conversations = []

    for row, turns in enumerate(examples["messages"]):
        attached = examples["images"][row]

        question = extract_text(turns[0]["content"])
        answer = extract_text(turns[1]["content"])

        # The image block, when there is one, goes ahead of the question text.
        if attached:
            user_blocks = [
                {"type": "image", "image": os.path.join(base_path, attached[0])}
            ]
        else:
            user_blocks = []
        user_blocks.append({"type": "text", "text": question})

        conversations.append([
            {"role": "user", "content": user_blocks},
            {"role": "assistant", "content": [{"type": "text", "text": answer}]},
        ])

    return {"messages": conversations}



# Build a Dataset from the normalized records, then replace every original
# column with the "messages" column produced by to_unsloth_vl.
dataset = Dataset.from_list(normalized)
columns_to_drop = dataset.column_names
dataset = dataset.map(to_unsloth_vl, batched=True, remove_columns=columns_to_drop)


Map: 100%|██████████| 395/395 [00:00<00:00, 28452.32 examples/s]


In [6]:
# Sanity check on the first mapped row: the container type, the conversation
# itself, and the type of its first message.
sample = dataset[0]
sample_messages = sample["messages"]

for shown in (type(sample_messages), sample_messages, type(sample_messages[0])):
    print(shown)


<class 'list'>
[{'content': [{'image': 'workspace/Dataset/images/1.jpeg', 'text': None, 'type': 'image'}, {'image': None, 'text': 'A 20-year-old woman from Sudan presents to a hospital in northern Uganda with a 2-day history of fever (39.6¬∞C), severe asthenia, chest and abdominal pain, nausea, vomiting, diarrhoea, and slight non-productive cough. Physical examination reveals semiconsciousness, hypotension (BP 90/60mmHg), abdominal tenderness, hepatosplenomegaly, and bleeding from the gums. Later, her condition rapidly worsens, and the clinical photograph shows copious oral bleeding (_page_1_Picture_4.jpeg). What is the most likely diagnosis, and what are the primary differential diagnoses that need to be considered in this epidemiological context?', 'type': 'text'}], 'role': 'user'}, {'content': [{'image': None, 'text': "Given the patient's presentation with high fever (39.6¬∞C), shock (BP 90/60mmHg), multiple systemic symptoms, hepatosplenomegaly, and significant haemorrhage (gum ble

In [7]:
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model)  # Enable for training!

# The vision collator is required for this kind of fine-tuning.
vision_collator = UnslothVisionDataCollator(model, tokenizer)  # Must use!

training_args = SFTConfig(
    # Batching: 2 per device x 4 accumulation steps.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,

    # Schedule. For a quick smoke run, set max_steps (e.g. 30) instead of
    # num_train_epochs.
    warmup_steps = 5,
    num_train_epochs = 3,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",

    # Optimizer.
    optim = "adamw_8bit",
    weight_decay = 0.01,

    # Bookkeeping.
    logging_steps = 1,
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",  # For Weights and Biases

    # You MUST put the below items for vision finetuning:
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    max_length = 2048,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = vision_collator,
    train_dataset = dataset,
    args = training_args,
)

Unsloth: Model does not have a default image size - using 512




In [8]:
# Run the fine-tuning job configured on `trainer`; the value returned by
# train() is the last expression of the cell, so it is shown as the cell output.
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 395 | Num Epochs = 3 | Total steps = 150
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 25,427,968 of 8,316,803,584 (0.31% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.0879
2,2.0322
3,1.8028
4,1.9353
5,1.8295
6,1.8634
7,1.7437
8,1.758
9,1.6025
10,1.6241


TrainOutput(global_step=150, training_loss=1.1702577741940816, metrics={'train_runtime': 338.7421, 'train_samples_per_second': 3.498, 'train_steps_per_second': 0.443, 'total_flos': 2.780876610121728e+16, 'train_loss': 1.1702577741940816, 'epoch': 3.0})

In [None]:
# Upload the tokenizer object returned by FastVisionModel.from_pretrained to
# this Hugging Face Hub repository.
# NOTE(review): no Hub credentials are configured anywhere in this notebook;
# presumably a login/token is already set up in the environment - confirm.
tokenizer.push_to_hub("usernameAvailable142/qwen2-vl-finetuned-lora") 

Processing Files (0 / 0): |          |  0.00B /  0.00B            
Processing Files (1 / 1): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11.4MB / 11.4MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  


In [None]:
# Upload the fine-tuned model to the same Hub repository as the tokenizer.
# NOTE(review): `model` is the object returned by get_peft_model, and the
# logged upload is about 102MB, so this presumably pushes only the LoRA
# adapter rather than merged full weights - confirm before relying on it.
model.push_to_hub("usernameAvailable142/qwen2-vl-finetuned-lora")

Processing Files (0 / 0): |          |  0.00B /  0.00B            
Processing Files (0 / 1):   1%|          |  559kB /  102MB,  699kB/s  
Processing Files (0 / 1):   2%|‚ñè         | 1.68MB /  102MB, 1.68MB/s  
Processing Files (0 / 1):   3%|‚ñé         | 2.79MB /  102MB, 2.33MB/s  
Processing Files (0 / 1):   6%|‚ñå         | 6.15MB /  102MB, 4.39MB/s  
Processing Files (0 / 1):  10%|‚ñà         | 10.6MB /  102MB, 6.63MB/s  
Processing Files (0 / 1):  17%|‚ñà‚ñã        | 17.3MB /  102MB, 9.62MB/s  
Processing Files (0 / 1):  18%|‚ñà‚ñä        | 18.4MB /  102MB, 9.21MB/s  
Processing Files (0 / 1):  22%|‚ñà‚ñà‚ñè       | 22.9MB /  102MB, 10.4MB/s  
Processing Files (0 / 1):  30%|‚ñà‚ñà‚ñâ       | 30.2MB /  102MB, 12.6MB/s  
Processing Files (0 / 1):  32%|‚ñà‚ñà‚ñà‚ñè      | 33.0MB /  102MB, 12.7MB/s  
Processing Files (0 / 1):  37%|‚ñà‚ñà‚ñà‚ñã      | 37.4MB /  102MB, 13.4MB/s  
Processing Files (0 / 1):  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 42.4MB /  102MB, 14.1MB/s  
Processing Files (0 / 1):  

Saved model to https://huggingface.co/usernameAvailable142/qwen2-vl-finetuned-lora
