In [1]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# Checkpoint to fine-tune, plus the loader switches, kept as named values so
# they are easy to spot and tweak.
base_checkpoint = "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit"
load_options = dict(
    load_in_4bit = True,                     # 4bit reduces memory use; set False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for long context
)

# `tokenizer` is the second object returned by the loader; later cells pass it
# to the data collator and the trainer.
model, tokenizer = FastVisionModel.from_pretrained(base_checkpoint, **load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.4.7: Fast Clip patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 12.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
# Which parts of the network are fine-tuned (flip any entry to False to leave
# that part out).
adapter_scope = dict(
    finetune_vision_layers     = True,  # vision layers
    finetune_language_layers   = True,  # language layers
    finetune_attention_modules = True,  # attention layers
    finetune_mlp_modules       = True,  # MLP layers
)

# LoRA hyper-parameters.
lora_settings = dict(
    r = 16,               # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,      # Recommended alpha == r at least
    lora_dropout = 0.05,
    bias = "none",
    random_state = 3407,
    use_rslora = True,    # We support rank stabilized LoRAs
    loftq_config = None,  # And LoftQ
    # target_modules = "all-linear",  # Optional now! Can specify a list if needed
)

# NOTE: rebinds `model`, so this cell is meant to run once per kernel session.
model = FastVisionModel.get_peft_model(model, **adapter_scope, **lora_settings)

Unsloth: Making `base_model.model.vision_tower.vision_model` require gradients


In [3]:
# Sanity check: report how many parameters are trainable versus the total,
# to confirm that only a small fraction of the model will be updated.
model.print_trainable_parameters()

trainable params: 49,020,928 || all params: 7,615,768,576 || trainable%: 0.6437


In [4]:
from datasets import load_dataset

# Read the train/test splits from local JSON files (paths are relative to the
# notebook's working directory). The result is indexed by split name below.
dataset = load_dataset(
    'json',
    data_files={
        'train': 'Robot_Arm_Data/train_balanced.json',
        'test': 'Robot_Arm_Data/test_balanced.json',
    },
)


from PIL import Image

# System prompt used as the first turn of every conversation.
# Written as a parenthesized literal so the statement has an explicit end
# (no trailing backslash continuation), and with an explicit space between
# the two sentences, which adjacent-literal concatenation does not insert.
system_message = (
    'You are a Visual Language Model Trained to output robot arm end-effectors parameters. '
    'Base on the user requests, locate the appropriate object in the image and you must return the position and orientation to reach it in xml format.'
)

def convert_to_conversation(sample, system_message = system_message, image_size = (630, 360)):
    """Turn one dataset record into a system/user/assistant chat example.

    Args:
        sample: Mapping that provides 'prompt' (text of the user turn),
            'images' (handed to ``Image.open``; presumably a file path --
            confirm against the JSON files) and 'output' (text of the
            assistant turn).
        system_message: Text of the system turn. Defaults to the
            module-level ``system_message``.
        image_size: ``(width, height)`` the image is resized to before being
            attached to the user turn. Defaults to ``(630, 360)``; pass e.g.
            ``(854, 480)`` for a larger input.

    Returns:
        ``{"messages": [system_turn, user_turn, assistant_turn]}``.
    """
    # Close the source file as soon as the pixels have been read. resize()
    # returns a new image, so `image` remains usable after the handle is closed.
    with Image.open(sample["images"]) as source_image:
        image = source_image.resize(image_size).convert('RGB')

    conversation = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": sample['prompt']},
                {"type": "image", "image": image},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": sample["output"]}],
        },
    ]
    return { "messages" : conversation }

In [5]:
# Materialise both splits as in-memory lists of chat-formatted examples
# (each record is converted with the default system prompt).
converted_dataset_train = list(map(convert_to_conversation, dataset['train']))
converted_dataset_test = list(map(convert_to_conversation, dataset['test']))

In [6]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model)  # Enable for training!

# The vision collator is required for this setup; it is built first, in the
# same order the original single-expression call evaluated it.
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bf16_supported()

training_args = SFTConfig(
    # --- batching ---
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps = 8,

    # --- schedule ---
    warmup_steps = 5,
    num_train_epochs = 3,        # full run; for a quick smoke test use max_steps = 30 instead
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",

    # --- precision ---
    fp16 = not use_bf16,
    bf16 = use_bf16,

    # --- logging / evaluation / checkpoints ---
    logging_steps = 10,
    eval_strategy = "steps",     # Enables evaluation
    eval_steps = 50,             # How often to evaluate (adjust as needed)
    save_steps = 100,
    output_dir = "Llava-1.6-Mistral/V1-p2",
    report_to = "none",          # For Weights and Biases

    # --- optimizer ---
    optim = "adamw_8bit",
    weight_decay = 0.01,
    seed = 3407,

    # --- You MUST put the below items for vision finetuning ---
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    dataset_num_proc = 8,
    max_seq_length = 2048,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = vision_collator,
    train_dataset = converted_dataset_train,
    eval_dataset = converted_dataset_test[:150],  # evaluate on the first 150 test examples
    args = training_args,
)

In [None]:
from unsloth import unsloth_train
# Start the fine-tuning run through Unsloth's training entry point and keep
# whatever it returns in `trainer_stats` (presumably the run's training
# statistics -- confirm against the Unsloth docs).
trainer_stats = unsloth_train(trainer)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,715 | Num Epochs = 3 | Total steps = 1,017
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 49,020,928/7,000,000,000 (0.70% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
