In [1]:
from unsloth import FastVisionModel 
import torch

# Load the Llama 3.2 11B Vision-Instruct checkpoint through Unsloth. The call
# returns a pair that is unpacked into `model` and `tokenizer`.
_pretrained_options = dict(
    # 4-bit loading reduces memory use; set to False for 16-bit LoRA.
    load_in_4bit = True,
    # Accepts True or "unsloth"; intended for long-context training.
    use_gradient_checkpointing = "unsloth",
)

model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    **_pretrained_options,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  name = re.sub("\.([\d]{1,})\.", r"[\1].", name)


🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFER = torch.empty(2*256*2048, dtype = dtype, device = "cuda:0")


==((====))==  Unsloth 2025.3.1: Fast Mllama vision patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 12.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

In [2]:
# Wrap the loaded model with LoRA adapters. The result is bound back to
# `model`, so later cells operate on the adapter-wrapped version.
# NOTE(review): re-running this cell passes the already-wrapped model back in;
# reload the base model first if these settings need to change.

# Which parts of the network receive adapters.
_adapter_targets = dict(
    finetune_vision_layers     = False, # False if not finetuning vision layers
    finetune_language_layers   = True,  # False if not finetuning language layers
    finetune_attention_modules = True,  # False if not finetuning attention layers
    finetune_mlp_modules       = True,  # False if not finetuning MLP layers
)

# LoRA hyperparameters.
_lora_hyperparameters = dict(
    r = 16,              # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,     # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # Rank stabilized LoRA is supported
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

model = FastVisionModel.get_peft_model(
    model,
    **_adapter_targets,
    **_lora_hyperparameters,
)

Unsloth: Making `model.base_model.model.language_model.model` require gradients


In [3]:
# Print the trainable vs. total parameter counts of the adapter-wrapped model
# as a quick sanity check before training.
model.print_trainable_parameters()

trainable params: 52,428,800 || all params: 10,722,649,635 || trainable%: 0.4890


In [4]:
from datasets import load_dataset
# Read the train and test splits from local JSON files. The paths are relative
# to the notebook's working directory.
dataset = load_dataset(
    "json",
    data_files = {
        "train": "Robot_Arm_Data/train.json",
        "test":  "Robot_Arm_Data/test.json",
    },
)

from PIL import Image

system_message = None # Llama Vision does not support system message

def convert_to_conversation(sample, system_message = system_message, image_size = (854, 480)):
    """Turn one dataset record into a two-turn chat sample for vision fine-tuning.

    Args:
        sample: Mapping with a ``"prompt"`` string, an ``"images"`` entry that
            ``PIL.Image.open`` can read, and an ``"output"`` string holding the
            target answer.
        system_message: Accepted for interface compatibility but not used; no
            system turn is added to the conversation.
        image_size: ``(width, height)`` handed to ``Image.resize``. Defaults to
            ``(854, 480)``, the size that used to be hard-coded here.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}``. The user turn carries the
        prompt text followed by the resized RGB image; the assistant turn
        carries the expected output text.
    """
    # Open the file inside a context manager so its handle is released as soon
    # as the pixels have been read. resize() and convert() each return a new
    # image, so the result stays valid after the source image is closed.
    with Image.open(sample["images"]) as source_image:
        image = source_image.resize(image_size).convert('RGB')

    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : sample['prompt']},
            {"type" : "image", "image" : image} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text",  "text"  : sample["output"]}]
        },
    ]
    return { "messages" : conversation }

In [5]:
# Materialise both splits as in-memory lists of chat-formatted samples, calling
# convert_to_conversation once per record with its default arguments.
converted_dataset_train = list(map(convert_to_conversation, dataset['train']))
converted_dataset_test = list(map(convert_to_conversation, dataset['test']))

In [6]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Switch the model into training mode before the trainer is built.
FastVisionModel.for_training(model)

# Collator that batches the chat samples for the vision model.
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Only the first 150 converted test samples are used for evaluation.
eval_subset = converted_dataset_test[:150]

# Use bf16 when the hardware supports it, otherwise fall back to fp16.
use_bf16 = is_bf16_supported()

training_args = SFTConfig(
    # Batching and schedule
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # max_steps = 30,
    num_train_epochs = 3,
    learning_rate = 2e-4,

    # Mixed precision: exactly one of fp16 / bf16 is enabled
    fp16 = not use_bf16,
    bf16 = use_bf16,

    # Logging, evaluation and checkpoint cadence (in optimizer steps)
    logging_steps = 10,
    eval_strategy = "steps",
    eval_steps = 50,
    save_steps = 100,

    # Optimizer and scheduler
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,

    # Output location and experiment tracking
    output_dir = "LlamaVision",
    report_to = "none",  # no external tracker such as Weights and Biases

    # You MUST put the below items for vision finetuning:
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    dataset_num_proc = 8,
    max_seq_length = 2048,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = vision_collator,
    train_dataset = converted_dataset_train,
    eval_dataset = eval_subset,
    args = training_args,
)

In [None]:
from unsloth import unsloth_train
# Run training through Unsloth's `unsloth_train` entry point and keep whatever
# it returns in `trainer_stats` for later inspection.
trainer_stats = unsloth_train(trainer)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,523 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,320
 "-____-"     Number of trainable parameters = 52,428,800
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss,Validation Loss


In [None]:
import pandas as pd

# Collect the metrics the trainer logged during the run into a DataFrame and
# write them to `log.csv` in the working directory.
# The training state lives on `trainer.state` (singular); `trainer.states`
# does not exist and raises AttributeError.
log = pd.DataFrame(trainer.state.log_history)
log.to_csv('log.csv')