In [1]:
!pip install transformers datasets torch torchvision accelerate peft bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 k

In [2]:
from huggingface_hub import login
login(new_session=False)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
import torch
from transformers import (
    PaliGemmaProcessor, 
    PaliGemmaForConditionalGeneration,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    TrainerCallback
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training, get_peft_config
from datasets import Dataset, load_dataset
from PIL import Image
import json
import os
from torch.utils.data import DataLoader

In [28]:

# 1. Cấu hình 4-bit quantization - Force GPU usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# 2. Tải model và processor - Explicit GPU placement
model_id = "google/paligemma-3b-pt-224"
processor = PaliGemmaProcessor.from_pretrained(
    model_id,
    trust_remote_code=True
)

print("📥 Loading model to GPU...")
model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",  # This should put model on GPU
    trust_remote_code=True,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16,
    low_cpu_mem_usage=True
)

# Verify model is on GPU
if torch.cuda.is_available():
    print(f"✅ Model loaded on: {next(model.parameters()).device}")
    print(f"🧠 GPU memory after model load: {torch.cuda.memory_allocated()/1e9:.2f}GB")
else:
    print("⚠️ Model loaded on CPU")

📥 Loading model to GPU...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Model loaded on: cuda:0
🧠 GPU memory after model load: 6.97GB


In [29]:
# 3. Cấu hình LoRA - Tối ưu cho tốc độ

model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,  # Reduced from 16 to 8 for faster training
    lora_alpha=16,  # Reduced proportionally
    lora_dropout=0.05,  # Reduced dropout
    target_modules=[
        "q_proj", "v_proj", "k_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none"
)

# Áp dụng LoRA lên model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 22,597,632 || all params: 2,946,064,112 || trainable%: 0.7670


In [40]:
import pandas as pd
def load_data_optimized():
    csv_file = "/kaggle/input/vieduvqa/Verify_Convert_80.csv"
    image_folder = "/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu"
    
    df = pd.read_csv(csv_file)
    # df = df.head(100).copy()  # Increased sample size for better training
    print(f"Số mẫu: {len(df)}")
    
    # Pre-validate images to avoid runtime errors
    valid_data = []
    for _, row in df.iterrows():
        image_id = row['ImageID']
        category = image_id.split('_')[0]
        image_path = os.path.join(image_folder, category, f"{image_id}.png")
        
        if os.path.exists(image_path):
            try:
                # Pre-load and validate image
                img = Image.open(image_path).convert('RGB')
                # Resize to consistent size for faster processing
                img = img.resize((224, 224), Image.Resampling.LANCZOS)
                
                valid_data.append({
                    'image': img,
                    'question': row['Question'],
                    'answer': row['Answer']
                })
            except Exception as e:
                print(f"Lỗi ảnh {image_id}: {e}")
        else:
            print(f"Không tìm thấy: {image_path}")
    
    return Dataset.from_list(valid_data)

# Load data
dataset = load_data_optimized()
print(f"Dataset có {len(dataset)} mẫu")

Số mẫu: 18838
Dataset có 18838 mẫu


In [41]:
# Chia train/test
train_test = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test['train']
test_dataset = train_test['test']
print(f"Train: {len(train_dataset)}, Test: {len(test_dataset)}")

Train: 16954, Test: 1884


In [42]:
print(train_dataset[0])

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=224x224 at 0x782493CA4490>, 'question': 'Ngoài cậu bé, những đồ vật nào khác có màu đỏ trong ảnh?', 'answer': 'Ngoài cậu bé, những đồ vật khác có màu đỏ trong ảnh là quả táo, một số cuốn sách, một phần của chiếc ghế và một hộp Rubik.\n'}


In [76]:
# def preprocess_batch(examples):
#     """Optimized preprocessing for batch processing"""
#     batch_size = len(examples['image'])
    
#     # Process all images and questions at once
#     images = []
#     questions = []
#     answers = []
    
#     for i in range(batch_size):
#         image = examples['image'][i]
#         question = f"<image>{examples['question'][i]}"
#         answer = examples['answer'][i]
        
#         images.append(image)
#         questions.append(question)
#         answers.append(answer)
    
#     # Batch tokenization for efficiency
#     inputs = processor(
#         text=questions,
#         images=images,
#         return_tensors="pt",
#         padding="max_length",
#         truncation=True,
#         max_length=256,  # Reduced from 512
#         do_resize=True,
#         size={"height": 224, "width": 224}
#     )
    
#     # Tokenize labels
#     with processor.tokenizer.as_target_tokenizer():
#         labels = processor.tokenizer(
#             text=answers,
#             return_tensors="pt",
#             truncation=True,
#             padding=True,
#             max_length=64  # Reduced from 128
#         )
    
#     inputs["labels"] = labels.input_ids
#     return inputs

In [43]:
def preprocess_batch(examples):
    """Chuẩn hóa batch dữ liệu cho VQA"""
    images = []
    questions = []
    answers = []

    for image, question, answer in zip(examples["image"], examples["question"], examples["answer"]):
        questions.append(f"<image>{question}")
        images.append(image)
        answers.append(answer)

    # Tokenize input (image + question)
    inputs = processor(
        text=questions,
        images=images,
        return_tensors=None,       # ❗ KHÔNG ép thành tensor ở đây
        padding=False,             # ❗ KHÔNG padding
        truncation=True,
        max_length=256,
        do_resize=True,
        size={"height": 224, "width": 224}
    )

    # Tokenize labels (answer)
    with processor.tokenizer.as_target_tokenizer():
        labels = processor.tokenizer(
            text=answers,
            return_tensors=None,    # ❗ Giữ raw list of input_ids
            padding=False,
            truncation=True,
            max_length=64
        )

    # Để labels dạng list[int] (không tensor), để collator xử lý
    inputs["labels"] = labels["input_ids"]
    return inputs


In [44]:
print("🔄 Preprocessing data...")
train_processed = train_dataset.map(
    preprocess_batch, 
    batched=True, 
    batch_size=4,
    remove_columns=dataset.column_names,
    num_proc=1,
    desc="Processing train data"
)

test_processed = test_dataset.map(
    preprocess_batch, 
    batched=True, 
    batch_size=4,
    remove_columns=dataset.column_names,
    num_proc=1,
    desc="Processing test data"
)
# Move processed data to GPU if needed (for small datasets)

# train_processed.save_to_disk("dataset_cached/train")
# test_processed.save_to_disk("dataset_cached/test")
print("💾 Data preprocessing completed")

🔄 Preprocessing data...


Processing train data:   0%|          | 0/16954 [00:00<?, ? examples/s]



Processing test data:   0%|          | 0/1884 [00:00<?, ? examples/s]

Saving the dataset (0/21 shards):   0%|          | 0/16954 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/1884 [00:00<?, ? examples/s]

💾 Data preprocessing completed


In [13]:
# # Test 1 sample trước
# sample = dataset[0]  # 1 sample đầu tiên
# prompt = f"answer vi\n{sample['question']}"
# inputs = processor(images=sample["image"], text=prompt, return_tensors="pt")
# print("Preprocessing OK!")

You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


Preprocessing OK!


In [45]:
train_processed

Dataset({
    features: ['input_ids', 'attention_mask', 'pixel_values', 'labels'],
    num_rows: 16954
})

In [64]:

# 6. GPU-Optimized Training Arguments
training_args = TrainingArguments(
    output_dir="pali_gemma_vqa_8bit",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=5e-4,
    fp16=True,  # Khi dùng 4-bit, tính toán bên trong đã là bfloat16
    dataloader_num_workers=4,
    remove_unused_columns=False,
    logging_steps=10,
    save_steps=200,
    label_names=["labels"],
    report_to=[],
    save_total_limit=1,
    optim="adamw_torch_fused",
    eval_strategy="no",
    save_strategy="no"
)

In [65]:
import os
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"


In [66]:
from transformers import TrainerCallback

class TerminateOnEndCallback(TrainerCallback):
    def on_train_end(self, args, state, control, **kwargs):
        print("🚩 Training ended.")
        exit(0)


In [81]:
# from torch.nn.utils.rnn import pad_sequence

# class OptimizedVQADataCollator:
#     def __init__(self, processor):
#         self.processor = processor

#     def __call__(self, features):
#         batch = {}
#         for key in features[0].keys():
#             vals = [f[key] for f in features]
#             if key == "labels":
#                 # đảm bảo từng phần tử là tensor
#                 tensors = [v if isinstance(v, torch.Tensor) else torch.tensor(v) for v in vals]
#                 batch[key] = pad_sequence(tensors, batch_first=True, padding_value=-100)
#             else:
#                 tensors = [v if isinstance(v, torch.Tensor) else torch.tensor(v) for v in vals]
#                 batch[key] = torch.stack(tensors)
#         return batch


In [67]:
# 8. GPU Performance Monitoring Callback
class GPUOptimizedTrainingCallback(TrainerCallback):
    def __init__(self):
        self.start_time = None
    
    def on_train_begin(self, args, state, control, **kwargs):
        import time
        self.start_time = time.time()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            print(f"🚀 Training started on GPU: {torch.cuda.get_device_name(0)}")
            print(f"💾 Initial GPU memory: {torch.cuda.memory_allocated()/1e9:.2f}GB")
        else:
            print("⚠️ Training started on CPU - this will be very slow!")
    
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 10 == 0:
            if torch.cuda.is_available():
                gpu_memory = torch.cuda.memory_allocated() / 1e9
                gpu_reserved = torch.cuda.memory_reserved() / 1e9
                gpu_util = f"{gpu_memory:.2f}GB/{gpu_reserved:.2f}GB"
                print(f"Step {state.global_step} | GPU Memory: {gpu_util}")
                
                # Check for GPU utilization issues
                if gpu_memory < 1.0:  # Less than 1GB usage might indicate CPU training
                    print("⚠️ Low GPU memory usage - check if model is on GPU!")
            else:
                print(f"Step {state.global_step} | Running on CPU")
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "train_loss" in logs:
            import time
            elapsed = time.time() - self.start_time if self.start_time else 0
            steps_per_sec = state.global_step / elapsed if elapsed > 0 else 0
            device_info = "GPU" if torch.cuda.is_available() else "CPU"
            print(f"Loss: {logs['train_loss']:.4f} | Steps/sec: {steps_per_sec:.2f} | Device: {device_info}")


In [68]:
from torch.nn.utils.rnn import pad_sequence
import torch

class OptimizedVQADataCollator:
    def __init__(self, processor, label_pad_token_id=-100):
        self.processor = processor
        self.label_pad_token_id = label_pad_token_id

    def __call__(self, features):
        batch = {}

        # Pad input_ids, attention_mask theo max_length trong batch
        input_ids = [f["input_ids"] for f in features]
        attention_mask = [f["attention_mask"] for f in features]
        
        # Nếu pixel_values là list thì convert
        pixel_values = torch.stack([
            torch.tensor(f["pixel_values"]) if not isinstance(f["pixel_values"], torch.Tensor) else f["pixel_values"]
            for f in features
        ])

        input_ids_padded = pad_sequence(
            [torch.tensor(ids) for ids in input_ids],
            batch_first=True,
            padding_value=self.processor.tokenizer.pad_token_id
        )
        attention_mask_padded = pad_sequence(
            [torch.tensor(mask) for mask in attention_mask],
            batch_first=True,
            padding_value=0
        )

        # Pad labels theo input_ids length
        label_pad_len = input_ids_padded.shape[1]
        labels = [torch.tensor(f["labels"]) for f in features]
        labels_padded = pad_sequence(labels, batch_first=True, padding_value=self.label_pad_token_id)

        # Align labels to input_ids shape
        if labels_padded.shape[1] < label_pad_len:
            pad_len = label_pad_len - labels_padded.shape[1]
            labels_padded = torch.nn.functional.pad(labels_padded, (0, pad_len), value=self.label_pad_token_id)
        elif labels_padded.shape[1] > label_pad_len:
            labels_padded = labels_padded[:, :label_pad_len]

        batch["input_ids"] = input_ids_padded
        batch["attention_mask"] = attention_mask_padded
        batch["pixel_values"] = pixel_values
        batch["labels"] = labels_padded

        return batch


In [69]:
data_collator = OptimizedVQADataCollator(processor)

batch = data_collator([train_processed[i] for i in range(2)])
for k, v in batch.items():
    if isinstance(v, torch.Tensor):
        print(f"{k}: {v.shape}")
    else:
        print(f"{k}: {type(v)}")


input_ids: torch.Size([2, 273])
attention_mask: torch.Size([2, 273])
pixel_values: torch.Size([2, 3, 224, 224])
labels: torch.Size([2, 273])


In [70]:
# # 9. Initialize Trainer with GPU verification
# data_collator = OptimizedVQADataCollator(processor)


# Shape phải giống nhau ở dim1

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_processed,
    eval_dataset=None,
    data_collator=data_collator,
    tokenizer=processor.tokenizer,
    callbacks=[GPUOptimizedTrainingCallback()]
)

print("🚀 Bắt đầu fine-tuning PaliGemma...")
print(f"📊 Training với {len(train_dataset)} samples")
print(f"🖥️  Device: {training_args.device}")

# Bắt đầu training
try:
    trainer.train()
    
    # Lưu model
    print("💾 Đang lưu model...")
    trainer.save_model("./paligemma-vietnamese-vqa-final")
    processor.save_pretrained("./paligemma-vietnamese-vqa-final")
    print("✅ Hoàn thành!")
except KeyboardInterrupt:
    print("⚠️ Training bị dừng bởi người dùng")
    trainer.save_model("./paligemma-vietnamese-vqa-interrupted")
    processor.save_pretrained("./paligemma-vietnamese-vqa-interrupted")
    
except Exception as e:
    print(f"❌ Lỗi: {str(e)}")
    raise

  trainer = Trainer(


🚀 Bắt đầu fine-tuning PaliGemma...
📊 Training với 16954 samples
🖥️  Device: cuda:0
🚀 Training started on GPU: Tesla P100-PCIE-16GB
💾 Initial GPU memory: 8.13GB


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
10,38.6329
20,33.377
30,32.778
40,32.9802
50,32.1702
60,32.3774
70,30.0426
80,29.9188
90,30.518
100,31.299


Step 10 | GPU Memory: 8.31GB/16.62GB
Step 20 | GPU Memory: 8.31GB/16.62GB
Step 30 | GPU Memory: 8.31GB/16.62GB
Step 40 | GPU Memory: 8.31GB/16.62GB
Step 50 | GPU Memory: 8.31GB/16.65GB
Step 60 | GPU Memory: 8.31GB/16.68GB
Step 70 | GPU Memory: 8.31GB/16.68GB
Step 80 | GPU Memory: 8.31GB/16.68GB
Step 90 | GPU Memory: 8.31GB/15.56GB
Step 100 | GPU Memory: 8.31GB/14.45GB
Step 110 | GPU Memory: 8.31GB/14.46GB
Step 120 | GPU Memory: 8.31GB/14.46GB
Step 130 | GPU Memory: 8.31GB/14.46GB
Step 140 | GPU Memory: 8.31GB/14.46GB
Step 150 | GPU Memory: 8.31GB/14.46GB
Step 160 | GPU Memory: 8.31GB/14.46GB
Step 170 | GPU Memory: 8.31GB/14.46GB
Step 180 | GPU Memory: 8.31GB/16.69GB
Step 190 | GPU Memory: 8.31GB/16.69GB
Step 200 | GPU Memory: 8.31GB/16.69GB
Step 210 | GPU Memory: 8.31GB/16.69GB
Step 220 | GPU Memory: 8.31GB/16.69GB
Step 230 | GPU Memory: 8.31GB/14.50GB
Step 240 | GPU Memory: 8.31GB/15.62GB
Step 250 | GPU Memory: 8.31GB/15.62GB
Step 260 | GPU Memory: 8.31GB/14.51GB
Step 270 | GPU Memory

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step 530 | GPU Memory: 8.31GB/15.64GB
Step 540 | GPU Memory: 8.31GB/15.64GB
Step 550 | GPU Memory: 8.31GB/15.64GB
Step 560 | GPU Memory: 8.31GB/15.64GB
Step 570 | GPU Memory: 8.31GB/15.64GB
Step 580 | GPU Memory: 8.31GB/15.64GB
Step 590 | GPU Memory: 8.31GB/15.64GB
Step 600 | GPU Memory: 8.31GB/15.64GB
Step 610 | GPU Memory: 8.31GB/15.64GB
Step 620 | GPU Memory: 8.31GB/15.64GB
Step 630 | GPU Memory: 8.31GB/15.64GB
Step 640 | GPU Memory: 8.31GB/15.64GB
Step 650 | GPU Memory: 8.31GB/15.64GB
Step 660 | GPU Memory: 8.31GB/15.64GB
Step 670 | GPU Memory: 8.31GB/15.64GB
Step 680 | GPU Memory: 8.31GB/15.64GB
Step 690 | GPU Memory: 8.31GB/15.64GB
Step 700 | GPU Memory: 8.31GB/15.64GB
Step 710 | GPU Memory: 8.31GB/15.64GB
Step 720 | GPU Memory: 8.31GB/15.64GB
Step 730 | GPU Memory: 8.31GB/15.64GB
Step 740 | GPU Memory: 8.31GB/15.64GB
Step 750 | GPU Memory: 8.31GB/15.64GB
Step 760 | GPU Memory: 8.31GB/15.64GB
Step 770 | GPU Memory: 8.31GB/15.64GB
Step 780 | GPU Memory: 8.31GB/15.64GB
Step 790 | G

In [73]:
def generate_answer(image_path, question, model, processor):
    """Sinh câu trả lời cho câu hỏi về ảnh"""
    
    # Tải và xử lý ảnh
    image = Image.open(image_path).convert('RGB')
    
    # Chuẩn bị input
    prompt = f"<image>{question}"
    inputs = processor(
        text=prompt,
        images=image,
        return_tensors="pt"
    ).to(model.device)
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            pad_token_id=processor.tokenizer.eos_token_id
        )
    
    # Decode response
    response = processor.tokenizer.decode(
        outputs[0][inputs.input_ids.shape[1]:], 
        skip_special_tokens=True
    )
    
    return response.strip()

# Test model
test_image = "/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Education/Education_000000000001.png"
test_question = "Cô giáo mặc trang phục màu gì?"

answer = generate_answer(test_image, test_question, model, processor)
print(f"Câu hỏi: {test_question}")
print(f"Câu trả lời: {answer}")

Câu hỏi: Cô giáo mặc trang phục màu gì?
Câu trả lời: màu xanh lá.
 xanh.
 xanh.
 xanh
 xanh lá.
 xanh lá
 xanh,
 xanh
 xanh.
 xanh lá.
 xanh.
 xanh
 xanh.
 xanh lá
 xanh.
 xanh
 xanh
 xanh lá xanh
 xanh,
 xanh
 xanh,
 xanh,
 xanh
 xanh lá
 xanh xanh
 xanh xanh
 xanh xanh
 xanh lá
 xanh.
 xanh.
 xanh.
 xanh
 xanh
 xanh. xanh
 xanh


In [43]:
# # device = "cuda"

# import torch
# from transformers import (
#     PaliGemmaProcessor, 
#     PaliGemmaForConditionalGeneration,
#     BitsAndBytesConfig,
#     TrainingArguments,
#     Trainer
# )
# from peft import LoraConfig, get_peft_model, TaskType

# # Cấu hình 8-bit quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     bnb_8bit_use_double_quant=True,
#     bnb_8bit_quant_type="nf4",
#     bnb_8bit_compute_dtype=torch.bfloat16
# )

# # Tải model và processor
# model_id = "google/paligemma-3b-pt-224"

# processor = PaliGemmaProcessor.from_pretrained(
#     model_id,
#     trust_remote_code=True
# )

# model = PaliGemmaForConditionalGeneration.from_pretrained(
#     model_id,
#     quantization_config=bnb_config,
#     device_map="auto",
#     trust_remote_code=True,
#     torch_dtype=torch.bfloat16
# )

# # Cấu hình LoRA
# lora_config = LoraConfig(
#     task_type=TaskType.CAUSAL_LM,
#     inference_mode=False,
#     r=8,  # Rank của LoRA
#     lora_alpha=16,  # Alpha parameter
#     lora_dropout=0.1,
#     target_modules=[
#         "q_proj", "v_proj", "k_proj", "o_proj",
#         "gate_proj", "up_proj", "down_proj"
#     ],
#     bias="none"
# )

# # Áp dụng LoRA lên model
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [42]:
# if torch.cuda.is_available():
#     print(f"✅ Model loaded on: {next(model.parameters()).device}")
#     print(f"🧠 GPU memory after model load: {torch.cuda.memory_allocated()/1e9:.2f}GB")
# else:
#     print("⚠️ Model loaded on CPU")

✅ Model loaded on: cuda:0
🧠 GPU memory after model load: 7.26GB


In [4]:
# import pandas as pd
# import os
# from datasets import Dataset

# df = pd.read_csv("/kaggle/input/vieduvqa/Verify_Convert_80.csv")
# # Function tìm đường dẫn
# def get_image_path(row):
#     sub = row["ImageID"].split("_", 1)[0]  # lấy subfolder: Education, Life, ...
#     fname = row["ImageID"] + ".png"       # nếu ảnh là .jpg
#     path = os.path.join("/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu", sub, fname)
#     if os.path.exists(path):
#         return path
#     else:
#         raise FileNotFoundError(f"Không tìm thấy: {path}")

# df["image_path"] = df.apply(get_image_path, axis=1)
# dataset = Dataset.from_pandas(df[["image_path", "Question", "Answer"]])
# dataset

In [34]:
# import pandas as pd
# import os
# from PIL import Image
# from datasets import Dataset

# # Đường dẫn
# csv_file = "/kaggle/input/vieduvqa/Verify_Convert_80.csv"  # Thay tên file CSV của bạn
# image_folder = "/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu"      # Folder chứa Education, Nature, Life, Object, Others

# # Đọc CSV
# df = pd.read_csv(csv_file)
# df = df.head(10).copy()

# print(f"Số mẫu: {len(df)}")

# # Chuyển đổi
# data = []
# for _, row in df.iterrows():
#     image_id = row['ImageID']
#     question = row['Question']
#     answer = row['Answer']
    
#     # Lấy category từ tên ảnh (phần trước dấu _)
#     category = image_id.split('_')[0]
    
#     # Tạo đường dẫn ảnh
#     image_path = os.path.join(image_folder, category, f"{image_id}.png")
    
#     # Kiểm tra ảnh tồn tại
#     if os.path.exists(image_path):
#         try:
#             img = Image.open(image_path).convert('RGB')
#             data.append({
#                 'image': img,
#                 'question': question,
#                 'answer': answer
#             })
#         except:
#             print(f"Lỗi ảnh: {image_id}")
#     else:
#         print(f"Không tìm thấy: {image_path}")

# # Tạo dataset
# dataset = Dataset.from_list(data)
# print(f"Dataset có {len(dataset)} mẫu")

Số mẫu: 10
Dataset có 10 mẫu


In [35]:
# # Chia train/test
# train_test = dataset.train_test_split(test_size=0.1)
# train_dataset = train_test['train']
# test_dataset = train_test['test']

# print(f"Train: {len(train_dataset)}, Test: {len(test_dataset)}")

Train: 9, Test: 1


In [36]:
# def preprocess(examples):
#     """Xử lý dữ liệu đầu vào cho training"""
    
#     batch_images = []
#     batch_questions = []
#     batch_answers = []
    
#     for i in range(len(examples['image'])):
#         # Xử lý ảnh
#         image = examples['image'][i]
#         if isinstance(image, str):
#             image = Image.open(image).convert('RGB')
        
#         # Thêm image token vào đầu câu hỏi
#         question = f"<image>{examples['question'][i]}"
#         answer = examples['answer'][i]
        
#         batch_images.append(image)
#         batch_questions.append(question)
#         batch_answers.append(answer)
    
#     # Tokenize inputs
#     inputs = processor(
#         text=batch_questions,
#         images=batch_images,
#         return_tensors="pt",
#         padding=True,
#         truncation=True,
#         max_length=256
#     )
    
#     # Tokenize labels (answers)
#     labels = processor.tokenizer(
#         text=batch_answers,
#         return_tensors="pt",
#         padding=True,
#         truncation=True,
#         max_length=64
#     )
    
#     inputs["labels"] = labels.input_ids
    
#     return inputs

# # Áp dụng preprocessing
# train_processed = train_dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)
# test_processed = test_dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [50]:
# # Preprocessing cho PaliGemma
# def preprocess(examples):
#     # Thêm <image> token vào đầu question
#     questions = [f"<image>{q}" for q in examples['question']]
    
#     # Process với processor
#     inputs = processor(
#         text=questions,
#         images=examples['image'],
#         return_tensors="pt",
#         padding="max_length",
#         truncation=True,
#         max_length=512
#     )
    
#     # Process answers
#     labels = processor.tokenizer(
#         text=examples['answer'],
#         return_tensors="pt", 
#         padding="max_length",
#         truncation=True,
#         max_length=128
#     ).input_ids
    
#     inputs["labels"] = labels
#     return {k: v.squeeze() for k, v in inputs.items()}

# # Áp dụng preprocessing (cần có processor đã load)
# train_processed = train_dataset.map(preprocess, batched=True)
# test_processed = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

ArrowInvalid: Column 3 named input_ids expected length 1 but got length 768

In [37]:
# # Cấu hình training
# training_args = TrainingArguments(
#     output_dir="./paligemma-vietnamese-vqa",
#     per_device_train_batch_size=8,  # Giảm batch size do memory hạn chế
#     per_device_eval_batch_size=4,
#     gradient_accumulation_steps=16,   # Tăng để compensate batch size nhỏ
#     num_train_epochs=1,
#     warmup_steps=100,
#     logging_steps=10,
#     save_steps=100,
#     eval_steps=100,
#     learning_rate=10e-4,
#     weight_decay=0.01,
#     fp16=True,  # Sử dụng mixed precision
#     dataloader_pin_memory=False,
#     remove_unused_columns=False,
#     push_to_hub=False,
#     report_to=None,
#     gradient_checkpointing=True,  # Tiết kiệm memory
# )



# # Custom Data Collator
# class VQADataCollator:
#     def __init__(self, processor):
#         self.processor = processor
    
#     def __call__(self, features):
#         batch = {}
        
#         # Lấy keys từ feature đầu tiên
#         keys = features[0].keys()
        
#         for key in keys:
#             if key == "labels":
#                 # Xử lý labels đặc biệt
#                 labels = [f[key] for f in features]
#                 batch[key] = torch.stack(labels)
#             else:
#                 # Stack các tensor khác
#                 values = [f[key] for f in features]
#                 if isinstance(values[0], torch.Tensor):
#                     batch[key] = torch.stack(values)
#                 else:
#                     batch[key] = values
        
#         return batch

# data_collator = VQADataCollator(processor)

In [38]:
# # Fix cho các warning trong Trainer setup

# import torch
# import os
# from transformers import Trainer, TrainingArguments, TrainerCallback
# import logging

# # 3. Custom callback để monitor
# class DetailedTrainingCallback(TrainerCallback):
#     def on_step_end(self, args, state, control, **kwargs):
#         if state.global_step % 10 == 0:
#             gpu_memory = torch.cuda.memory_allocated() / 1e9
#             print(f"Step {state.global_step} | GPU Memory: {gpu_memory:.2f}GB")
            
#     def on_log(self, args, state, control, logs=None, **kwargs):
#         if logs:
#             log_msg = "Metrics: "
#             for key, value in logs.items():
#                 if isinstance(value, float):
#                     log_msg += f"{key}: {value:.4f} | "
#             print(log_msg.rstrip(" | "))

# # 4. Khởi tạo Trainer với cách mới (fix deprecation warning)
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     data_collator=data_collator,
#     processing_class=processor,  # Thay vì tokenizer=processor.tokenizer
#     callbacks=[DetailedTrainingCallback()],
# )



# # 5. Thêm label names để tránh warning (nếu cần)
# # Với VQA task, thường không cần label_names cụ thể
# # nhưng có thể thêm nếu muốn tránh warning:
# if hasattr(trainer.model.config, 'label_names') and trainer.model.config.label_names is None:
#     trainer.label_names = []  # Hoặc danh sách labels cụ thể nếu có

# print("🚀 Bắt đầu fine-tuning PaliGemma...")
# print(f"📊 Training với {len(train_dataset)} samples")
# print(f"🖥️  Device: {training_args.device}")

# # Bắt đầu training
# try:
#     trainer.train()
    
#     # Lưu model
#     print("💾 Đang lưu model...")
#     trainer.save_model("./paligemma-vietnamese-vqa-final")
#     processor.save_pretrained("./paligemma-vietnamese-vqa-final")
#     print("✅ Hoàn thành!")
    
# except KeyboardInterrupt:
#     print("⚠️ Training bị dừng bởi người dùng")
#     trainer.save_model("./paligemma-vietnamese-vqa-interrupted")
#     processor.save_pretrained("./paligemma-vietnamese-vqa-interrupted")
    
# except Exception as e:
#     print(f"❌ Lỗi: {str(e)}")
#     raise

🚀 Bắt đầu fine-tuning PaliGemma...
📊 Training với 9 samples
🖥️  Device: cuda:0


<IPython.core.display.Javascript object>

⚠️ Training bị dừng bởi người dùng


In [24]:
# def generate_answer(image_path, question, model, processor):
#     """Sinh câu trả lời cho câu hỏi về ảnh"""
    
#     # Tải và xử lý ảnh
#     image = Image.open(image_path).convert('RGB')
    
#     # Chuẩn bị input
#     prompt = f"<image>{question}"
#     inputs = processor(
#         text=prompt,
#         images=image,
#         return_tensors="pt"
#     ).to(model.device)
    
#     # Generate
#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=100,
#             do_sample=True,
#             temperature=0.7,
#             pad_token_id=processor.tokenizer.eos_token_id
#         )
    
#     # Decode response
#     response = processor.tokenizer.decode(
#         outputs[0][inputs.input_ids.shape[1]:], 
#         skip_special_tokens=True
#     )
    
#     return response.strip()

# # Test model
# test_image = "/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Education/Education_000000000001.png"
# test_question = "Có bao nhiêu người trong ảnh này?"

# answer = generate_answer(test_image, test_question, model, processor)
# print(f"Câu hỏi: {test_question}")
# print(f"Câu trả lời: {answer}")

Câu hỏi: Màu của ảnh?
Câu trả lời: yes


In [26]:
# # Optimize
# # Giảm memory usage
# torch.cuda.empty_cache()

# # Sử dụng gradient checkpointing
# model.gradient_checkpointing_enable()

# # Freeze một số layers nếu cần
# for param in model.vision_tower.parameters():
#     param.requires_grad = False

In [None]:
# def evaluate_vqa(model, processor, test_dataset):
#     """Đánh giá model trên test set"""
    
#     correct = 0
#     total = len(test_dataset)
    
#     for item in test_dataset:
#         predicted = generate_answer(
#             item['image_path'], 
#             item['question'], 
#             model, 
#             processor
#         )
        
#         # So sánh với ground truth
#         if predicted.lower().strip() in item['answer'].lower().strip():
#             correct += 1
    
#     accuracy = correct / total
#     print(f"Accuracy: {accuracy:.2%}")
#     return accuracy

In [4]:
# from datasets import Dataset
# import pandas as pd

# data = pd.read_csv("/kaggle/input/vieduvqa/Verify_Convert_80.csv")  # hoặc .json
# data

Unnamed: 0,ImageID,Question,Answer
0,Education_000000000176,Các bạn học sinh đang làm gì trong lớp học?,Các bạn học sinh đang thực hiện các hoạt động ...
1,Life_000000000565,"Trong hình 1, bé gái đang chơi trò gì?",Bé gái đang chơi trò nhảy dây. Cô bé cầm dây m...
2,Life_000000000568,Khăn trải bàn có màu gì?,Khăn trải bàn có màu vàng kẻ ô vuông.\n
3,Life_000000000568,Trên bàn ăn có mấy người?,"Trên bàn ăn có 4 người, gồm bố, mẹ, con trai v..."
4,Life_000000000784,Người đàn ông đang làm gì với chiếc điều khiển...,Người đàn ông đang sử dụng điều khiển từ xa để...
...,...,...,...
18833,Life_000000000550,Chuồng của những con vật được làm bằng gì?,Chuồng của những con vật được làm bằng lưới sắ...
18834,Life_000000000551,Bé trai ở phía dưới bên trái đang làm gì?,Bé trai ở phía dưới bên trái đang hát.
18835,Life_000000000551,Có bao nhiêu con gà con trong ảnh?,Có 6 con gà con trong ảnh.
18836,Life_000000000551,Người phụ nữ trong ảnh đang làm gì?,Người phụ nữ trong ảnh đang cho gà ăn.\n


In [34]:
# data

Unnamed: 0,ImageID,Question,Answer,image_path
0,Education_000000000176,Các bạn học sinh đang làm gì trong lớp học?,Các bạn học sinh đang thực hiện các hoạt động ...,/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Edu...
1,Life_000000000565,"Trong hình 1, bé gái đang chơi trò gì?",Bé gái đang chơi trò nhảy dây. Cô bé cầm dây m...,/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Lif...
2,Life_000000000568,Khăn trải bàn có màu gì?,Khăn trải bàn có màu vàng kẻ ô vuông.\n,/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Lif...
3,Life_000000000568,Trên bàn ăn có mấy người?,"Trên bàn ăn có 4 người, gồm bố, mẹ, con trai v...",/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Lif...
4,Life_000000000784,Người đàn ông đang làm gì với chiếc điều khiển...,Người đàn ông đang sử dụng điều khiển từ xa để...,/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Lif...
...,...,...,...,...
18833,Life_000000000550,Chuồng của những con vật được làm bằng gì?,Chuồng của những con vật được làm bằng lưới sắ...,/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Lif...
18834,Life_000000000551,Bé trai ở phía dưới bên trái đang làm gì?,Bé trai ở phía dưới bên trái đang hát.,/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Lif...
18835,Life_000000000551,Có bao nhiêu con gà con trong ảnh?,Có 6 con gà con trong ảnh.,/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Lif...
18836,Life_000000000551,Người phụ nữ trong ảnh đang làm gì?,Người phụ nữ trong ảnh đang cho gà ăn.\n,/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Lif...


In [35]:
# from datasets import Dataset

# dataset = Dataset.from_pandas(data[["image_path", "Question", "Answer"]])
# dataset['image_path']

['/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Education/Education_000000000176.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Life/Life_000000000565.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Life/Life_000000000568.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Life/Life_000000000568.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Life/Life_000000000784.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Nature/Nature_000000000008.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Nature/Nature_000000000250.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Nature/Nature_000000000371.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Nature/Nature_000000000398.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Nature/Nature_000000000401.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Nature/Nature_000000000401.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Object/Object_000000000048.png',
 '/kaggle/input/vieduvqa/ViVQA4Edu/ViVQA4Edu/Object/Object_000000000097.pn

In [None]:
# from PIL import Image

# def preprocess(ex):
#     # Process inputs (image + text)
#     # Optionally add image token manually to suppress warning
#     question_with_image_token = f"<image>{ex['Question']}"
    
#     inputs = processor(
#         text=question_with_image_token, 
#         images=Image.open(ex["image_path"]).convert("RGB"), 
#         return_tensors="pt", 
#         padding="max_length", 
#         truncation=True
#     )
    
#     # Process labels (text only) - use the tokenizer directly
#     labels = processor.tokenizer(
#         text=ex["Answer"], 
#         return_tensors="pt",
#         padding="max_length", 
#         truncation=True
#     ).input_ids
    
#     # Add labels to inputs
#     inputs["labels"] = labels
    
#     # Squeeze batch dimension if needed
#     return {k: v.squeeze() for k, v in inputs.items()}

# dataset = dataset.map(preprocess)
# train_ds, val_ds = ds.train_test_split(0.1).values()

Map:   0%|          | 0/18838 [00:00<?, ? examples/s]

In [28]:
# from PIL import Image

# def preprocess(ex):
#     img = Image.open(ex["image_path"]).convert("RGB")
#     inp = processor(images=img, text=ex["Question"], return_tensors="pt", padding="max_length", truncation=True)
#     inp["labels"] = processor(text=ex["Answer"], return_tensors="pt", padding="max_length", truncation=True).input_ids
#     return {k: v.squeeze(0) for k, v in inp.items()}

# train_ds = train_ds.map(preprocess)
# val_ds = val_ds.map(preprocess)


Map:   0%|          | 0/16954 [00:00<?, ? examples/s]

You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ValueError: `images` are expected as arguments to a `PaliGemmaProcessor` instance.

In [21]:
# from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# model = prepare_model_for_kbit_training(model)

# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=16,
#     target_modules=["q_proj","v_proj","k_proj","o_proj"],
#     lora_dropout=0.1,
#     bias="none",
#     task_type="SEQ_2_SEQ_LM"
# )

# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()


trainable params: 3,336,192 || all params: 2,926,802,672 || trainable%: 0.1140


In [30]:
# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(
#     output_dir="./results",
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=5,
#     learning_rate=2e-5,
#     logging_steps=10,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     fp16=True,
#     save_total_limit=1
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_ds,
#     eval_dataset=val_ds,
#     tokenizer=processor.tokenizer
# )

# trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [image_path, Question, pixel_values, Answer]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [None]:
# model.save_pretrained("pali_vqa_lora8bit")
# processor.save_pretrained("pali_vqa_lora8bit")

In [None]:
# from PIL import Image

# image = Image.open("images/test.jpg").convert("RGB")
# question = "Bức tranh vẽ cảnh gì?"

# inputs = processor(images=image, text=question, return_tensors="pt").to("cuda")
# output = model.generate(**inputs, max_new_tokens=64)

# generated_answer = processor.tokenizer.decode(output[0], skip_special_tokens=True)
# print("Answer:", generated_answer)


In [None]:
# import pandas as pd


# # Read the CSV file
# df = pd.read_csv(r'D:\IT\GITHUB\FinalProject_DataLabeling\benchmark_dataset.csv')

# # Create a new dataframe with only the required columns
# simplified_df = df[['ImageID', 'SuggestedQuestion', 'SuggestedAnswer']].copy()

# # Rename the columns
# simplified_df.rename(columns={
#     'SuggestedQuestion': 'Question',
#     'SuggestedAnswer': 'Answer'
# }, inplace=True)

# # Strip trailing newline characters from Answer field
# simplified_df['Answer'] = simplified_df['Answer'].str.rstrip()

# # Save the new dataframe to a CSV file
# simplified_df.to_csv(r'D:\IT\GITHUB\FinalProject_DataLabeling\simplified_dataset.csv', index=False)

# print(f"Successfully created simplified_dataset.csv with {len(simplified_df)} rows.")
    


Successfully created simplified_dataset.csv with 250 rows.
