# Train และ Evaluate โมเดล

### 1. การติดตั้งและ Import Library

- ส่วนนี้คือการติดตั้ง library ที่จำเป็น เช่น unsloth, transformers, trl
- transformers คือ library หลักของ HuggingFace ที่มีโมเดล NLP และ Vision ที่ pretrained มาแล้ว
- trl เอาไว้ train/fine-tune โมเดลโดยใช้เทคนิค reinforcement learning หรือ supervised fine-tuning

In [None]:
# # 1) Create and activate a virtual environment
# python -m venv .venv
# # Windows: .\.venv\Scripts\Activate.ps1
# # Linux/macOS: source .venv/bin/activate
# python -m pip install -U pip

# # 2) Remove previously installed builds
# pip uninstall -y torch torchvision torchaudio xformers

# # 3) Install PyTorch + CUDA runtime (pick one option)
# pip install --index-url https://download.pytorch.org/whl/cu121 torch torchvision torchaudio
# # Or use the command given on the PyTorch "Get Started" page for your latest CUDA version

# # 4) xformers (pick a compatible wheel)
# pip install xformers -f https://download.pytorch.org/whl/xformers/

# # 5) Unsloth
# pip install unsloth

ตรวจสอบว่า PyTorch เห็น GPU:

In [None]:
import torch, platform

# Report the interpreter and PyTorch build, then whether a CUDA device is usable.
cuda_ready = torch.cuda.is_available()
for label, value in (
    ("Python:", platform.python_version()),
    ("Torch:", torch.__version__),
    ("Built with CUDA:", torch.version.cuda),
    ("CUDA available:", cuda_ready),
):
    print(label, value)

# The device name is only queried when a CUDA device is actually available.
if cuda_ready:
    print("GPU:", torch.cuda.get_device_name(0))

### 2. โหลดโมเดล Vision-Language

- โหลดโมเดล Qwen2.5-VL (Vision-Language model) ที่สามารถเข้าใจ รูปภาพ + ข้อความ
- tokenizer ทำหน้าที่แปลงข้อความเป็นตัวเลข (tokens)
- load_in_4bit=True → ใช้ quantization 4-bit เพื่อลดขนาดโมเดลและประหยัด GPU memory

In [1]:
from unsloth import FastVisionModel
import torch

# Load the 4-bit (bnb) Qwen2.5-VL 3B Instruct checkpoint; the call returns the
# model together with the object used below as `tokenizer`.
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Qwen2.5-VL-3B-Instruct-bnb-4bit",
    load_in_4bit = True,  # 4-bit quantization to shrink the model and save GPU memory
    use_gradient_checkpointing = "unsloth",  # NOTE(review): presumably selects Unsloth's own checkpointing mode — confirm in the Unsloth docs
)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


W1004 23:35:55.926000 18320 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"{DEVICE_TYPE}:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.9.11: Fast Qwen2_5_Vl patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 4050 Laptop GPU. Num GPUs = 1. Max memory: 5.997 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.8.0+cu129. CUDA: 8.9. CUDA Toolkit: 12.9. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


### 3. เพิ่ม LoRA (Low-Rank Adaptation)

- ใช้ LoRA (Low-Rank Adaptation) เพื่อ fine-tune โมเดลเฉพาะบางพารามิเตอร์ แทนที่จะ train ใหม่ทั้งหมด
- เราสามารถเลือกว่าจะฝึกเฉพาะ vision part หรือ language part ของโมเดลได้

In [2]:
# Attach LoRA adapters to the loaded model; the flags below choose which parts
# of the network receive adapters.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # Set to False to skip the vision layers
    finetune_language_layers   = True, # Set to False to skip the language layers
    finetune_attention_modules = True, # Set to False to skip the attention layers
    finetune_mlp_modules       = True, # Set to False to skip the MLP layers

    r = 16,           # LoRA rank: larger values can raise accuracy but may overfit
    lora_alpha = 16,  # Recommended: alpha at least equal to r
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # Rank-stabilized LoRA is supported but not enabled here
    loftq_config = None, # LoftQ is supported but not configured here
    # target_modules = "all-linear", # Optional; a list of modules can be given if needed
)

### 4. โหลด Dataset

In [3]:
from datasets import load_from_disk

# Load the dataset previously saved to the local "lung8_image_text" directory.
hf = load_from_disk("lung8_image_text")

# Show the dataset summary (columns, row count) and the per-column feature types.
print(hf)
print(hf.features)

Dataset({
    features: ['image', 'text', '__class__'],
    num_rows: 6085
})
{'image': Image(mode=None, decode=True), 'text': Value('string'), '__class__': Value('string')}


- ส่ง train_hf (หลังแปลงเป็น converted_dataset) เป็น train_dataset และ val_hf (หลังแปลงเป็น converted_dataset_val) เป็น eval_dataset ระหว่างฝึก
- ตอนจบเรียก trainer.evaluate(eval_dataset=converted_dataset_test) เพื่อรายงานผลบน test set

In [4]:
# Hold out 30% of the rows first ...
splits = hf.train_test_split(test_size=0.3, seed=42)
train_hf, tmp_hf = splits["train"], splits["test"]

# ... then halve the hold-out into validation and test (15% / 15% overall).
vt = tmp_hf.train_test_split(test_size=0.5, seed=42)
val_hf, test_hf = vt["train"], vt["test"]

# Print each subset's summary in train / val / test order.
for subset in (train_hf, val_hf, test_hf):
    print(subset)

Dataset({
    features: ['image', 'text', '__class__'],
    num_rows: 4259
})
Dataset({
    features: ['image', 'text', '__class__'],
    num_rows: 913
})
Dataset({
    features: ['image', 'text', '__class__'],
    num_rows: 913
})


### 5. แปลง Dataset เป็น Conversation Format

- แปลงข้อมูลให้เป็นรูปแบบ Chat (user → assistant)
- user → ส่งภาพ + คำสั่ง
- assistant → ตอบกลับด้วยชื่อ class และคำอธิบาย (รูปแบบ `Class: ...` ตามด้วย `Explanation: ...`)

In [None]:
instruction = "Describe the chest X-ray using precise clinical terms. Identify one main diagnostic category from: Chest_Changes, Degenerative_Infectious, Higher_Density, Inflammatory_Pneumonia, Lower_Density, Mediastinal_Changes, Normal, or Obstructive."


def convert_to_conversation(sample):
    """Wrap one dataset row in a two-turn chat record.

    The user turn carries the shared ``instruction`` followed by the row's
    image; the assistant turn carries the target text built from the row's
    ``__class__`` label and ``text`` description.

    Args:
        sample: Mapping with ``"image"``, ``"text"`` and ``"__class__"`` keys.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}``.
    """
    label = sample["__class__"]
    findings = sample["text"]
    target = f"Class: {label}\nExplanation: {findings}"

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": target}],
    }
    return {"messages": [user_turn, assistant_turn]}

# Materialize every split as an in-memory list of chat records.
converted_dataset = list(map(convert_to_conversation, train_hf))
converted_dataset_val = list(map(convert_to_conversation, val_hf))
converted_dataset_test = list(map(convert_to_conversation, test_hf))

# Notebook display: show the first training record as a sanity check.
converted_dataset[0]

### 6. ทดสอบโมเดลก่อนฝึก

In [None]:
# Baseline generation with the model before any fine-tuning has been run.
FastVisionModel.for_inference(model) # Enable for inference!

# NOTE(review): hf[0] comes from the full dataset before it is split, so this
# sample may also end up in the training split — confirm that is acceptable.
image = hf[0]["image"]
instruction = "Describe the chest X-ray using precise clinical terms. Identify one main diagnostic category from: Chest_Changes, Degenerative_Infectious, Higher_Density, Inflammatory_Pneumonia, Lower_Density, Mediastinal_Changes, Normal, or Obstructive."

# Single user turn: an image placeholder followed by the text instruction.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
# Render the chat template to text, ending with the generation prompt.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
# Encode the image together with the rendered prompt and move the tensors to the GPU.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Stream the generated text as it is produced, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# The return value is discarded; the streamer is what shows the output.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                    use_cache = True, temperature = 1.5, min_p = 0.1)

### 7. ฝึกโมเดล (Training)

- ใช้ SFTTrainer ทำการ supervised fine-tuning
- ใช้ข้อมูล converted_dataset ที่มีคู่ (ภาพ → Class + Explanation)
- กำหนด hyperparameters เช่น batch size, learning rate

เพื่อแสดง Performance Graph ให้สร้างฟังก์ชันเก็บค่า Training Metrics

In [None]:
import numpy as np
from evaluate import load

metric = load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE-1 and ROUGE-L F-measures for evaluation outputs.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be
            token ids, raw logits with a trailing vocabulary axis, or a tuple
            whose first element is one of those. ``labels`` are token ids in
            which ignored positions are marked with -100.

    Returns:
        Dict with float values under ``"rouge1"`` and ``"rougeL"``.
    """
    predictions, labels = eval_pred

    # Some models hand back a tuple of outputs; the first entry is scored.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # Raw logits are 3-D (trailing vocabulary axis); reduce them to token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    labels = np.asarray(labels)

    # The object named `tokenizer` is also called with images in this file, so
    # it may be a processor wrapping the text tokenizer; look the pad id up on
    # the wrapped tokenizer when there is one.
    text_tokenizer = getattr(tokenizer, "tokenizer", tokenizer)
    pad_id = text_tokenizer.pad_token_id
    if pad_id is None:
        pad_id = text_tokenizer.eos_token_id

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be replaced before decoding.
    predictions = np.where(predictions == -100, pad_id, predictions)
    labels = np.where(labels == -100, pad_id, labels)

    decoded_preds = [
        pred.strip()
        for pred in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    decoded_labels = [
        lbl.strip()
        for lbl in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    def _fmeasure(score):
        # The `evaluate` ROUGE metric returns plain floats; older metric
        # implementations returned objects exposing `.mid.fmeasure`.
        # Both shapes are accepted.
        score = getattr(score, "mid", score)
        return float(getattr(score, "fmeasure", score))

    return {
        "rouge1": _fmeasure(result["rouge1"]),
        "rougeL": _fmeasure(result["rougeL"]),
    }

แล้วเพิ่มใน SFTTrainer

In [None]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

# Switch the model to training mode and build the supervised fine-tuning trainer.
FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    eval_dataset=converted_dataset_val,   # validation set added for periodic evaluation
    args = SFTConfig(
        # ===== Training schedule =====
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        seed = 3407,

        # ===== Eval =====
        eval_strategy = "steps",         # or "epoch"
        eval_steps = 5,                  # applies when using "steps"
        prediction_loss_only = True,     # important: do not collect logits/predictions
        metric_for_best_model = "eval_loss",
        greater_is_better = False,       # lower eval_loss is better

        # ===== Precision / dtype =====
        # Exactly one of fp16 / bf16 is enabled, depending on bf16 support.
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),

        # ===== Optimization =====
        learning_rate = 2e-4,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",

        # ===== Logging / reporting =====
        report_to = "none",
        logging_strategy = "steps",
        logging_steps = 1,

        # ===== Output =====
        output_dir = "outputs",

        # ===== Vision finetuning (required) =====
        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
    # NOTE(review): with prediction_loss_only = True no predictions are gathered,
    # so this callback presumably never runs — confirm against the Trainer docs.
    compute_metrics=compute_metrics  # added: metrics callback
)

In [None]:
#@title Show current memory stats
# Snapshot of the device's total memory and the peak reserved memory so far, in GB.
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Alternative kept for reference: call the trainer directly.
# trainer_stats = trainer.train()

from unsloth import unsloth_train

# Run training through Unsloth's helper; the returned stats feed the reports below.
# NOTE(review): presumably a drop-in replacement for trainer.train() — confirm in the Unsloth docs.
trainer_stats = unsloth_train(trainer)

In [None]:
#@title Show final memory and time stats
# Peak reserved memory after training, and the part added since the pre-training snapshot.
used_memory = round(torch.cuda.max_memory_reserved() / (1024 ** 3), 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
report = (
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for line in report:
    print(line)

#### แสดง Performance Graph เช่น Loss

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): training_loss looks like one aggregate value for the whole run,
# not a per-step series — confirm. It is kept for reference only; the curve
# below is built from the trainer's log history.
loss_values = trainer_stats.training_loss

# Log entries that carry a training loss.
train_logs = [x for x in trainer.state.log_history if "loss" in x]
loss_list = [x["loss"] for x in train_logs]
# Plot against the logged step rather than the 0-based list index, so the
# x-axis matches its "Steps" label; fall back to the index if "step" is absent.
loss_steps = [x.get("step", i) for i, x in enumerate(train_logs)]

plt.plot(loss_steps, loss_list, label="Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.legend()
plt.show()

print(len(loss_list))
print(loss_list)

In [None]:
import matplotlib.pyplot as plt

history = trainer.state.log_history

# Validation series: only the log entries that carry an eval_loss.
eval_points = [entry for entry in history if "eval_loss" in entry]
steps = [entry.get("step", idx) for idx, entry in enumerate(eval_points)]
eval_losses = [entry["eval_loss"] for entry in eval_points]

# Training series: entries with a loss that are not evaluation entries.
train_points = [
    entry for entry in history
    if "loss" in entry and "eval_loss" not in entry
]
train_steps = [entry.get("step", idx) for idx, entry in enumerate(train_points)]
train_losses = [entry["loss"] for entry in train_points]

# Figure 1: validation loss on its own.
plt.figure()
plt.plot(steps, eval_losses, marker="o")
plt.xlabel("Step")
plt.ylabel("eval_loss")
plt.title("Validation Loss over time")
plt.grid(True)
plt.show()

# Figure 2: training and validation loss on shared axes.
plt.figure()
plt.plot(train_steps, train_losses, label="train loss")
plt.plot(steps, eval_losses, label="eval loss")
plt.xlabel("Step")
plt.ylabel("Loss")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
import pandas as pd

# Tabulate the trainer's log history for inspection.
df = pd.DataFrame(history)
print(df.shape)
# NOTE(review): `display` is not imported here — presumably the notebook's built-in display helper.
display(df)

### 8. ทดสอบโมเดลหลังฝึก

รัน inference อีกครั้งเพื่อเปรียบเทียบ performance หลัง fine-tuning

In [None]:
# Repeat the earlier generation after fine-tuning, on the same sample and
# prompt, so the two outputs can be compared.
FastVisionModel.for_inference(model) # Enable for inference!

# NOTE(review): hf[0] is drawn from the unsplit dataset and may be a training
# sample; a row from test_hf would be a held-out check — confirm intent.
image = hf[0]["image"]
instruction = "Describe the chest X-ray using precise clinical terms. Identify one main diagnostic category from: Chest_Changes, Degenerative_Infectious, Higher_Density, Inflammatory_Pneumonia, Lower_Density, Mediastinal_Changes, Normal, or Obstructive."

# One user turn containing an image slot and the instruction text.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
# Turn the messages into prompt text that ends with the generation prompt.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
# Build the model inputs from the image and prompt, placed on the GPU.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Print tokens as they are generated; the prompt itself is skipped.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# Output is consumed through the streamer, so the returned ids are ignored.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                    use_cache = True, temperature = 1.5, min_p = 0.1)

### 9. บันทึกโมเดล

- บันทึกเฉพาะ LoRA adapters ไม่ใช่ full model
- สามารถโหลดกลับมาใช้ใหม่ได้

In [None]:
from datetime import datetime
import os

# Build a timestamped directory name so each run saves to its own folder.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # e.g. 20251004_165830
save_dir = f"lora_model_{timestamp}"

# Write the model and the tokenizer side by side into that directory.
os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"✅ Model saved to: {save_dir}")

ตอนนี้หากคุณต้องการโหลดอะแดปเตอร์ LoRA ที่เราเพิ่งบันทึกไว้สำหรับการอนุมาน ให้เปลี่ยน False เป็น True

In [None]:
# Flip to True to reload the saved LoRA adapters instead of reusing the model
# that is already in memory.
if False:
    from unsloth import FastVisionModel
    model, tokenizer = FastVisionModel.from_pretrained(
        # Directory written by the save step. In a fresh session `save_dir`
        # is not defined; put the literal "lora_model_<timestamp>" path here.
        model_name = save_dir,
        load_in_4bit = True, # Set to False for 16bit LoRA
    )
    FastVisionModel.for_inference(model) # Enable for inference!

# Use the same sample and the same prompt the model was fine-tuned with, so the
# output is comparable with the earlier inference cells.
image = hf[0]["image"]
instruction = "Describe the chest X-ray using precise clinical terms. Identify one main diagnostic category from: Chest_Changes, Degenerative_Infectious, Higher_Density, Inflammatory_Pneumonia, Lower_Density, Mediastinal_Changes, Normal, or Obstructive."

# Single user turn: an image placeholder followed by the text instruction.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
# Render the chat template to text, ending with the generation prompt.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
# Encode the image together with the rendered prompt and move the tensors to the GPU.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Stream the generated text as it is produced, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                    use_cache = True, temperature = 1.5, min_p = 0.1)