In [None]:
%%capture
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["UNSLOTH_DISABLE_DYNAMO"] = "1"


In [None]:
%%capture
# Step 1: install the fine-tuning stack with --no-deps, i.e. pip installs only
# the listed packages and does not resolve or install their dependencies
# (presumably to leave the preinstalled torch/CUDA packages untouched).
# None of these packages is version-pinned here.
!pip install --no-deps \
    bitsandbytes accelerate peft trl triton cut_cross_entropy unsloth_zoo

# Step 2: supporting packages installed with normal dependency resolution.
# datasets is pinned exactly; huggingface_hub only has a lower bound.
!pip install \
    sentencepiece protobuf \
    "datasets==4.3.0" \
    "huggingface_hub>=0.34.0" \
    hf_transfer

# Step 3: unsloth itself, again without its dependencies.
!pip install --no-deps unsloth


In [None]:
%%capture
# Pin transformers and trl to exact versions. trl was already installed
# unpinned by the earlier install cell, so this line replaces it with 0.22.2
# (still without dependency resolution).
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2


In [None]:
import torch
import unsloth

# Sanity check of the environment: installed torch version and CUDA visibility.
print(f"torch: {torch.__version__}")
print(f"cuda: {torch.cuda.is_available()}")


# LOAD MODEL

In [None]:
from unsloth import FastVisionModel
import torch

# Load the pre-quantized Qwen2-VL 7B Instruct checkpoint; the call returns the
# model together with the object used as `tokenizer` throughout the notebook.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit",
    max_seq_length=2048,    # context length; can be raised at the cost of memory
    load_in_4bit=True,      # 4-bit weights to cut memory use (False -> 16-bit LoRA)
    load_in_8bit=False,     # 8-bit option: a bit more accurate, about 2x the memory
    full_finetuning=False,  # full fine-tuning is available but not used here
)


In [None]:
# Attach LoRA adapters to the loaded model; `model` is rebound to the wrapped
# version, so this cell is meant to run once per freshly loaded base model.
model = FastVisionModel.get_peft_model(
    model,

    # Which parts of the network are fine-tuned (set to False to exclude a part).
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,

    # LoRA hyper-parameters.
    r=16,              # rank: larger can be more accurate but may overfit
    lora_alpha=16,     # recommended to be at least equal to r
    lora_dropout=0,
    bias="none",
    use_rslora=False,  # rank-stabilized LoRA is supported but off
    loftq_config=None, # LoftQ is supported but off
    random_state=3407,
    # target_modules="all-linear",  # optional; a list of module names also works
)

# LOAD DATASET 

In [None]:
import json

RAW_DATASET_PATH = "/kaggle/input/trieuds/train_multimodal.json"
CLEAN_DATASET_PATH = "clean_dataset.json"

# Read the raw conversations. Each sample must provide sample["messages"] with
# at least two entries.
with open(RAW_DATASET_PATH, "r", encoding="utf-8") as f:
    raw = json.load(f)

cleaned = []

for sample in raw:
    # Only the first two messages are used: index 0 is treated as the user
    # turn and index 1 as the assistant turn.
    user = sample["messages"][0]
    assistant = sample["messages"][1]

    # The user content is a list of typed parts ({"type": "text" | "image", ...}).
    # A bare string is accepted too and handled as a single text part, instead
    # of failing on item["type"].
    user_content = user["content"]
    if isinstance(user_content, str):
        user_content = [{"type": "text", "text": user_content}]

    # Split the parts by type; parts of any other type are dropped.
    user_text_parts = [
        item["text"] for item in user_content if item["type"] == "text"
    ]
    user_image_paths = [
        item["image_path"] for item in user_content if item["type"] == "image"
    ]

    # Both messages carry the same four keys so every record has one schema.
    cleaned.append({
        "messages": [
            {
                "role": "user",
                "content": None,
                "texts": user_text_parts,
                "image_paths": user_image_paths if user_image_paths else None
            },
            {
                "role": "assistant",
                "content": assistant["content"],
                "texts": None,
                "image_paths": None
            }
        ]
    })

with open(CLEAN_DATASET_PATH, "w", encoding="utf-8") as f:
    json.dump(cleaned, f, indent=2, ensure_ascii=False)

# Report the file that was actually written (the old message was mis-encoded
# and named a different file).
print(f"DONE — {CLEAN_DATASET_PATH} generated.")


In [None]:
import json

# Reload the cleaned samples as a plain Python list.
# The file is written as UTF-8 with ensure_ascii=False, so it can contain raw
# non-ASCII characters; decode it explicitly as UTF-8 rather than relying on
# the platform's default text encoding.
with open("clean_dataset.json", "r", encoding="utf-8") as f:
    train_dataset3 = json.load(f)


# DEFINE FORMATTING FUNCTION

In [None]:
# Preview the first cleaned sample (train_dataset3 is still the list loaded from JSON here).
train_dataset3[0]

In [None]:
def to_chatml(sample):
    """
    Render one or many conversations as plain-text prompts for TRL.

    Accepted inputs:
      - str: returned as-is, wrapped in a one-element list
      - mapping whose "messages" is a list of message dicts (one sample)
      - mapping whose "messages" is a list of such lists (a batch,
        e.g. a LazyBatch)

    Every message produces a "<|role|>" line, followed by one
    "<image>path</image>" line per entry of "image_paths", then each entry
    of "texts", then "content" when it is truthy. Lines are joined with a
    newline.

    Returns: List[str], one string per conversation.
    """
    # Pre-formatted text passes straight through.
    if isinstance(sample, str):
        return [sample]

    # A batch is recognised by "messages" being a non-empty list of lists;
    # each inner conversation is rendered through the single-sample path.
    if "messages" in sample:
        batch = sample["messages"]
        if isinstance(batch, list) and bool(batch) and isinstance(batch[0], list):
            return [
                rendered
                for conversation in batch
                for rendered in to_chatml({"messages": conversation})
            ]

    # Single sample: accumulate the output line by line.
    lines = []
    for message in sample["messages"]:
        lines.append(f"<|{message['role']}|>")
        lines.extend(
            f"<image>{path}</image>" for path in message.get("image_paths") or ()
        )
        lines.extend(message.get("texts") or ())

        content = message.get("content")
        if content:
            lines.append(content)

    return ["\n".join(lines)]


In [None]:
# Smoke-test the formatter: show the first sample's Python type and its rendered text.
first_sample = train_dataset3[0]
print(type(first_sample))
print(to_chatml(first_sample))


In [None]:
from datasets import Dataset

# Convert the list of samples into a Dataset and shuffle it with a fixed seed
# so the sample order is reproducible. The name train_dataset3 is rebound from
# the list to the Dataset.
train_dataset3 = Dataset.from_list(train_dataset3).shuffle(seed=3407)
train_dataset3[0]


# TRAINER & TRAINING

In [None]:
from trl import SFTConfig, SFTTrainer

# Training hyper-parameters, kept apart from the trainer wiring below.
# NOTE(review): num_train_epochs and max_steps are both set; per the
# Transformers TrainingArguments docs a positive max_steps takes precedence,
# so confirm which limit is intended.
sft_config = SFTConfig(
    dataset_text_field=None,  # text comes from formatting_func, not a column
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    max_steps=200,
    warmup_steps=5,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.001,
    logging_steps=1,
    seed=3407,
    report_to="none",
)

# Train-only setup (no eval set); every sample is rendered by to_chatml.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset3,
    eval_dataset=None,
    formatting_func=to_chatml,
    args=sft_config,
)


In [None]:
# @title Show current memory stats
# Baseline before training: device 0 properties plus the peak reserved CUDA
# memory so far, both converted from bytes to GB (divide by 1024**3) and
# rounded to 3 decimals. start_gpu_memory and max_memory are reused by the
# final stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

In [None]:
# Run fine-tuning. The result is kept because the final stats cell reads
# trainer_stats.metrics['train_runtime'].
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# TEST

In [None]:
from transformers import TextStreamer

# Example prompt. It is defined here so the cell runs on a fresh kernel;
# previously `messages` was commented out and the cell raised NameError.
messages = [
    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
]

# Render the conversation to a prompt string (no tokenization yet).
# NOTE(review): enable_thinking is passed through to the chat template;
# confirm this model's template actually reads it.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

# Pass the prompt by keyword: `tokenizer` comes from FastVisionModel, and for a
# multimodal processor the first positional parameter is expected to be
# `images`, not `text` (see the Qwen2-VL processor API). The keyword form is
# also valid for a plain tokenizer.
model_inputs = tokenizer(text = text, return_tensors = "pt").to("cuda")

# Stream generated tokens to stdout as they are produced, hiding the prompt.
_ = model.generate(
    **model_inputs,
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

# SAVE MODEL TO DOWNLOAD

In [None]:
# Save the model
model.save_pretrained("qwen2_7b")
tokenizer.save_pretrained("qwen2_7b")

# Create a zip file to download
import zipfile
import os

# Create zip file of the model
with zipfile.ZipFile('qwen2_7b.zip', 'w') as zipf:
    for root, dirs, files in os.walk('qwen2_7b'):
        for file in files:
            zipf.write(os.path.join(root, file))
