In [None]:
import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

# Run configuration: epoch count, the two input data folders, and the folder
# name used as the output location.
epochs = 5
data_dir1, data_dir2 = "./data/just_car/", "./data/car_trees/"
output_dir = "model"

# Pick CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class CarDataset(Dataset):
    """Image/description pairs read from a single folder of frames.

    Every file in ``root`` whose name ends in ``.png`` is treated as a frame.
    The frame id is the file name with ``rgb_`` and ``.png`` removed; it
    selects the ``bounding_box_2d_tight_<id>.npy`` and
    ``bounding_box_2d_tight_labels_<id>.json`` files used to annotate the
    frame.
    """

    def __init__(self, root):
        """Index the PNG names in ``root`` and derive annotation file names.

        Only names are computed here; no annotation file is opened.
        """
        self.root = root
        self.imgs = sorted(name for name in os.listdir(root) if name.endswith('.png'))
        frame_ids = [name.replace('rgb_', '').replace('.png', '') for name in self.imgs]
        self.npy_files = [f"bounding_box_2d_tight_{fid}.npy" for fid in frame_ids]
        self.json_files = [f"bounding_box_2d_tight_labels_{fid}.json" for fid in frame_ids]

    def __len__(self):
        """Return the number of indexed PNG frames."""
        return len(self.imgs)

    def __getitem__(self, idx):
        """Return ``(img, description)`` for the frame at position ``idx``.

        ``img`` is the frame converted to RGB. ``description`` joins one
        ``"<label> at [x_min, y_min, x_max, y_max]"`` entry per kept box with
        ``"; "``, or is ``"No objects detected"`` when no box is kept. A box
        is dropped when ``x_max <= x_min`` or ``y_max <= y_min``, or when its
        class is neither ``"ground"`` (label ``ground``) nor ``"cars"``
        (label ``car``).
        """
        def in_root(name):
            return os.path.join(self.root, name)

        img = Image.open(in_root(self.imgs[idx])).convert("RGB")

        # Each row is read positionally as
        # [object_id, x_min, y_min, x_max, y_max].
        rows = np.load(in_root(self.npy_files[idx]))

        # Maps the stringified object id to a dict holding a "class" entry.
        with open(in_root(self.json_files[idx]), 'r') as handle:
            classes_by_id = json.load(handle)

        entries = []
        for row in rows:
            object_id = int(row[0])
            x_min, y_min, x_max, y_max = (float(row[col]) for col in range(1, 5))
            # Degenerate (zero or negative extent) boxes are ignored.
            if x_max <= x_min or y_max <= y_min:
                continue

            class_name = classes_by_id.get(str(object_id), {}).get("class", "unknown")
            if class_name == "ground":
                label = "ground"
            elif class_name == "cars":
                label = "car"
            else:
                # Any other class (including the "unknown" fallback) is skipped.
                continue

            box = [x_min, y_min, x_max, y_max]
            entries.append(f"{label} at {box}")

        description = "; ".join(entries) if entries else "No objects detected"
        return img, description

def collate_fn(batch):
    """Split a batch of ``(image, text)`` pairs into two parallel lists.

    Returns a ``(images, texts)`` tuple of lists in batch order. The items
    are left as they are (no stacking), so they can be handed to a processor
    later.
    """
    # Transpose the pairs, then materialise each column as a list.
    images, texts = map(list, zip(*batch))
    return images, texts

# Fetch meta-llama's pretrained vision-language checkpoint and its matching
# processor from Hugging Face, then move the model to the selected device.
model_name = "meta-llama/Llama-3.2-11B-Vision"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(model_name).to(device)

# One dataset per data folder, each wrapped in a shuffled loader that yields
# batches of 4 (image, text) pairs through collate_fn.
dataset1, dataset2 = CarDataset(data_dir1), CarDataset(data_dir2)
data_loader1, data_loader2 = (
    DataLoader(ds, batch_size=4, shuffle=True, collate_fn=collate_fn)
    for ds in (dataset1, dataset2)
)

# Put the model in training mode and optimise every model parameter with
# SGD plus momentum.
model.train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Llama 3.2 Vision prompts must contain one image placeholder token per image.
# That token only exists in the input embedding table (the output head has no
# logit for it), so it must never appear as a training target.
image_token = getattr(processor, "image_token", "<|image|>")
image_token_id = processor.tokenizer.convert_tokens_to_ids(image_token)

for epoch in range(epochs):
    # Use dataset1 for epochs 1 and 2; use dataset2 for later epochs
    if epoch < 2:
        current_loader = data_loader1
        print(f"Epoch [{epoch+1}/{epochs}]: Using dataset1")
    else:
        current_loader = data_loader2
        print(f"Epoch [{epoch+1}/{epochs}]: Using dataset2")

    len_dataloader = len(current_loader)
    for i, (imgs, texts) in enumerate(current_loader):
        # One placeholder per sample, in front of its description, so the
        # processor can tie each image to its text.
        prompts = [f"{image_token}{text}" for text in texts]
        # A flat list of images is read as ONE sample holding several images;
        # a batch needs one inner list per sample.
        nested_imgs = [[img] for img in imgs]

        # The processor handles any necessary image pre-processing and tokenizes text.
        inputs = processor(images=nested_imgs, text=prompts, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Targets are the input ids, except positions that must not contribute
        # to the loss: padding and image placeholders are set to -100, the
        # index ignored by the cross-entropy loss.
        labels = inputs["input_ids"].clone()
        labels[inputs["attention_mask"] == 0] = -100
        labels[labels == image_token_id] = -100

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Report the current batch loss every 5 batches.
        if (i+1) % 5 == 0:
            print(f"  Batch [{i+1}/{len_dataloader}], Loss: {loss.item():.4f}")

# Save the fine-tuned model and processor
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)
