### Installation

In [None]:
%%capture
# Install the released Unsloth package first; this pip call is the one that
# pulls in Unsloth's dependencies.
!pip install unsloth
# Also get the latest nightly Unsloth!
# The reinstall below passes --no-deps, so it only replaces the unsloth and
# unsloth-zoo packages themselves and relies on the dependencies installed above.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git

In [None]:
# Upgrade transformers after the Unsloth install so the newest release is the one in use.
!pip install --upgrade transformers # For Qwen3VLForConditionalGeneration

In [None]:
# Print the installed version and metadata of the core packages, for reference when debugging.
!pip show torch torchaudio torchvision transformers bitsandbytes trl

### Unsloth

- Load The Base Model
- Note: Unsloth's `FastVisionModel` Only Supports Training On A Single GPU. Loading The Model And Applying LoRA May Work Fine On Multiple GPUs, But Starting Training Will Raise An Error.
- So `device_map="auto"` Or `device_map="balanced"` Will Not Work Well

In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# Load the base checkpoint without 4-bit quantization.
# Alternative checkpoint: "Qwen/Qwen3-VL-2B-Thinking" (Try Others Like InternVL).
model, tokenizer = FastVisionModel.from_pretrained(
    "Qwen/Qwen3-VL-2B-Instruct",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for long context
    load_in_4bit=False,
)

### Apply LoRA

- Recommended To Set R and Alpha Equal

**[NEW]** We also support finetuning ONLY the vision part of the model, or ONLY the language part. Or you can select both! You can also select to finetune the attention or the MLP layers!

In [None]:
# Which parts of the network receive LoRA adapters.
lora_targets = dict(
    finetune_vision_layers=False,     # False if not finetuning vision layers
    finetune_language_layers=True,    # False if not finetuning language layers
    finetune_attention_modules=True,  # False if not finetuning attention layers
    finetune_mlp_modules=True,        # False if not finetuning MLP layers
)

# Adapter hyper-parameters.
lora_hparams = dict(
    r=256,              # The larger, the higher the accuracy, but might overfit
    lora_alpha=256,     # Recommended alpha == r at least
    lora_dropout=0,
    bias="none",
    random_state=3407,
    use_rslora=False,   # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
    # target_modules="all-linear",  # Optional now! Can specify a list if needed
)

# Wrap the loaded base model with the adapters configured above.
model = FastVisionModel.get_peft_model(model, **lora_targets, **lora_hparams)

### Data Prep
**We'll Be Using A Text Dataset In Instruction-Tuning SFT Format, Converted Into A Format Suitable For `FastVisionModel`.**

**Below We Verify The Conversion By Inspecting A Few Examples From The Converted Dataset, Then Select The Train Split.**


#### For The Conversion Guide, All You Need Is A JSON Dataset In Alpaca Format
**See [https://github.com/Vinayyyy7/FineTune-Vision-LMs.git](https://github.com/Vinayyyy7/FineTune-Vision-LMs.git)**

**Your Output Should Be :**

```json
{
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "How to finetune a VL Model Only On Text-Based Dataset"
        }
      ]
    },
    {
      "role": "assistant",
      "content": [
        {
          "type": "text",
          "text": "For This Check Out This Notebook + Conversion Of Your Normal Alpaca Format Dataset To Vision SFT Dataset"
        }
      ]
    }
  ]
}
```

- `type: "image"` Entries Are Excluded So That Only The Language Part (Its Brain), Not The Vision Part (Its Eyes), Is Trained.

In [None]:
from datasets import load_dataset
# Read the converted JSONL file. Requesting split="train" here means the result
# can be indexed by row directly in the inspection cells below.
dataset = load_dataset(
    "json",
    data_files="path/to/data.jsonl",
    split="train",
)

In [None]:
# Show the loaded dataset's repr as the cell output.
dataset

In [None]:
# Inspect the "messages" field of the record at index 2 to check the converted format.
# This needs at least three records in the file.
dataset[2]["messages"]

### Train the model

**We'll Be Using Transformers' `DataCollatorForSeq2Seq` For Finetuning A Vision Model On A Text-Only Dataset**

### Steps 

- First We Load Our Vision Model's Tokenizer And Our Converted Dataset
- Format The Dataset Entries
- Tokenize The Dataset To Feed Into The Model
- Set Training Args
- Start Fresh OR Resume From A Checkpoint

In [None]:
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForSeq2Seq, AutoTokenizer

# Text tokenizer for the same checkpoint that was loaded as the base model.
base_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-VL-2B-Instruct", trust_remote_code=True)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

# Read the JSONL file; no split is requested here, so the "train" split is picked by name.
dataset = load_dataset("json", data_files="/path/to/data.jsonl")
train_dataset = dataset["train"]

# Quick sanity check: number of records and the first raw record.
num_samples = len(train_dataset)
print(f"Loaded {num_samples} samples")
print("Sample:", train_dataset[0], sep="\n")

# Preprocessing function
def format_messages(examples):
    """Flatten chat messages into plain text and tokenize them for training.

    Every conversation in ``examples['messages']`` is turned into one string
    containing a ``ROLE: text`` line per message. Only content items with
    ``type == 'text'`` contribute; other items (for example images) are
    skipped. The resulting strings are tokenized with the module-level
    ``base_tokenizer``, truncated and padded to 2048 tokens.

    Args:
        examples: Batch dict with a ``'messages'`` key holding a list of
            conversations. Each conversation is a list of dicts with
            ``'role'`` and ``'content'``. ``'content'`` is normally a list of
            typed items; a plain string is also accepted and used as-is.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels mirror
        ``input_ids`` except at padding positions (``attention_mask == 0``),
        which are set to -100 so the loss ignores them.
    """
    ignore_index = -100  # label value skipped by the cross-entropy loss
    formatted_texts = []

    for messages in examples['messages']:
        lines = []
        for msg in messages:
            role = msg.get('role', '').upper()
            content = msg.get('content') or []

            if isinstance(content, str):
                # Tolerate the plain-string form of a message body.
                msg_text = content
            else:
                msg_text = "".join(
                    item.get('text') or ''
                    for item in content
                    if isinstance(item, dict) and item.get('type') == 'text'
                )

            lines.append(f"{role}: {msg_text}\n")

        formatted_texts.append("".join(lines).strip())

    tokenized = base_tokenizer(
        formatted_texts,
        truncation=True,
        max_length=2048,
        padding="max_length",
        return_tensors=None
    )

    if "attention_mask" in tokenized:
        # Sequences are padded to max_length, so a plain copy of input_ids
        # would train the model to predict pad tokens. Mask them out instead.
        tokenized["labels"] = [
            [token if keep else ignore_index for token, keep in zip(ids, mask)]
            for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]
    else:
        # No mask available: fall back to the unmasked copy.
        tokenized["labels"] = [ids[:] for ids in tokenized["input_ids"]]

    return tokenized

# Run format_messages over the split in batches of 16 and drop the raw
# "messages" column once it has been tokenized.
map_options = {
    "batched": True,
    "batch_size": 16,
    "remove_columns": ['messages'],
    "desc": "Formatting and tokenizing",
}
train_dataset = train_dataset.map(format_messages, **map_options)

print("Preprocessed dataset ready!")

# Switch the model over with Unsloth's training helper before building the trainer.
FastVisionModel.for_training(model)

# Collator that batches the pre-tokenized records, padding to a multiple of 8.
seq2seq_collator = DataCollatorForSeq2Seq(base_tokenizer, pad_to_multiple_of=8)

# Training hyper-parameters.
training_args = SFTConfig(
    # Batching
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_length=2048,
    # Schedule
    num_train_epochs=1,
    # max_steps=30,
    warmup_steps=5,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    seed=3407,
    # Logging and checkpoints
    logging_steps=1,  # For Each Step
    save_total_limit=2,  # To Save Only 2 Latest Checkpoints
    output_dir="/kaggle/working/checkpoints",
    report_to="none",
    # Keep every column produced by the preprocessing step.
    remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=base_tokenizer,
    data_collator=seq2seq_collator,
    train_dataset=train_dataset,
    args=training_args,
)



- START TRAINING FROM FRESH

In [None]:
# Start a fresh training run and keep the object returned by train() for later inspection.
trainer_stats = trainer.train()

- RESUME FROM AN EXISTING CHECKPOINT IN `/kaggle/working/checkpoints` (THE `output_dir` SET ABOVE), e.g. `/kaggle/working/checkpoints/checkpoint-3000`

In [None]:
# Run this cell instead of the fresh-start cell when continuing an interrupted run.
# With resume_from_checkpoint=True the trainer is expected to pick up the latest
# checkpoint in its output_dir ("/kaggle/working/checkpoints" as configured above).
trainer_stats = trainer.train(resume_from_checkpoint = True)

### Save FineTuned LoRA Adapters For Your VL Model

In [None]:
# Local saving: the adapter weights and the tokenizer files go into the same folder,
# so the folder can later be loaded by name.
adapter_dir = "lora_model"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

### Now Load The FineTuned LoRA With Your Loaded Base Model

In [None]:
# Reload the saved adapters for inference. Set the condition to False to keep
# using the model that is already in memory.
if True:
    from unsloth import FastVisionModel
    model, tokenizer = FastVisionModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = True, # Set to False for 16bit LoRA
    )
    FastVisionModel.for_inference(model) # Enable for inference!

from transformers import TextStreamer
from transformers.image_utils import load_image

# The processor call below is given image data rather than a bare path string,
# so the file is read into an image object first.
image_path = "path/to/image.jpg"
image = load_image(image_path)
instruction = "Tell Me What Can You See In This Picture."

# One user turn with an image placeholder followed by the text instruction.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    # Special tokens are not added again here; the prompt comes from the chat template.
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

# Stream the generated tokens to the cell output as they are produced,
# without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

### Merge The Full Finetuned Model In 16bit

In [None]:
# Save locally to 16bit
# NOTE(review): `model` here is whichever model is currently loaded. If the inference
# cell above was run, that is the copy reloaded with load_in_4bit=True - confirm this
# is the intended source for the merged export.
if True:
    model.save_pretrained_merged("unsloth_finetune", tokenizer)

### Upload To Huggingface

- Upload Checkpoints
- Upload Fully Merged Model
- Upload Just LoRA Adapters

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; the upload cell below relies on the
# credentials stored by this call.
login()

- Upload Any Of The Folders From Above After Login

In [None]:
from huggingface_hub import upload_folder
import os

# Ask for the local folder to upload and validate it before contacting the Hub.
folder_path = input("Enter the full path of the folder to upload (e.g., /kaggle/checkpoints/checkpoint-100): ").strip()

if not os.path.exists(folder_path):
    raise FileNotFoundError(f"❌ Folder not found: {folder_path}")
if not os.path.isdir(folder_path):
    raise NotADirectoryError(f"❌ This is not a folder: {folder_path}")

# Input repo ID
repo_id = input("Enter the Hugging Face repo ID (e.g., username/my-model): ").strip()
if not repo_id:
    raise ValueError("Repo ID cannot be empty.")

# normpath drops a trailing slash first; basename("dir/") would otherwise be ""
# and leave both the default subfolder and the commit message empty.
local_folder_name = os.path.basename(os.path.normpath(folder_path))
repo_subfolder = input(f"Enter subfolder name in repo (default: '{local_folder_name}'): ").strip()
if not repo_subfolder:
    repo_subfolder = local_folder_name

# Perform upload using stored credentials
print(f"\n🚀 Uploading '{folder_path}' to repo '{repo_id}', subfolder: '{repo_subfolder}'...")

try:
    upload_folder(
        folder_path=folder_path,
        repo_id=repo_id,
        repo_type="model",           # Use "dataset" if uploading to a dataset repo
        path_in_repo=repo_subfolder,
        commit_message=f"Upload folder {local_folder_name}"
    )
    print(f"\n✅ Successfully uploaded to: https://huggingface.co/{repo_id}/tree/main/{repo_subfolder}")
except Exception as e:
    # Report the failure in the cell output instead of raising.
    print(f"❌ Upload failed: {e}")

- Download And Save Any Checkpoint To Continue Training

In [None]:
from huggingface_hub import hf_hub_download, snapshot_download
from pathlib import Path
import os

# Ask which repo to download from.
repo_id = input("Enter the Hugging Face repo ID (e.g., username/model-name): ").strip()
if not repo_id:
    raise ValueError("Repo ID cannot be empty.")

print("\nChoose download mode:")
print("1 → Download entire repo")
print("2 → Download a specific folder only")
choice = input("\nEnter 1 or 2: ").strip()

local_dir = input("\nEnter the local path to save files (e.g., /kaggle/working/downloaded): ").strip()
if not local_dir:
    local_dir = "/kaggle/working"  # fallback

os.makedirs(local_dir, exist_ok=True)
print(f"📁 Files will be saved to: {os.path.abspath(local_dir)}")

try:
    if choice == "1":
        # Download entire repo
        print(f"\n🚀 Downloading entire repo '{repo_id}'...")
        # `local_dir_use_symlinks` is deprecated in huggingface_hub and is no longer passed.
        snapshot_download(
            repo_id=repo_id,
            repo_type="model",  # Change to "dataset" if needed
            local_dir=local_dir,
        )
        print(f"\n✅ Entire repo downloaded to: {os.path.abspath(local_dir)}")

    elif choice == "2":
        # Download specific folder
        # Surrounding slashes are removed so the pattern below does not end up
        # as "name//**" and match nothing.
        folder_name = input("Enter the exact folder name to download (e.g., checkpoint-100): ").strip().strip("/")
        if not folder_name:
            raise ValueError("Folder name cannot be empty.")

        print(f"\n🚀 Downloading folder '{folder_name}' from repo '{repo_id}'...")

        # We use snapshot_download with allow_patterns to filter just that folder
        snapshot_download(
            repo_id=repo_id,
            repo_type="model",
            local_dir=local_dir,
            allow_patterns=f"{folder_name}/**"  # Include all subfiles
        )
        print(f"\n✅ Folder '{folder_name}' downloaded to: {os.path.join(os.path.abspath(local_dir), folder_name)}")

    else:
        print("❌ Invalid choice. Please run again and enter 1 or 2.")

except Exception as e:
    # Report the failure in the cell output instead of raising.
    print(f"❌ Download failed: {e}")

#### Thank You! You Now Have A Fully Finetuned, Merged Vision-Language Model Tuned To Your Own Preferences, Ready To Chat!