In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/DI725/DI725-assignment2/

Mounted at /content/drive
/content/drive/MyDrive/DI725/DI725-assignment2


We will be working with the [AU-AIR dataset](https://bozcani.github.io/auairdataset), which consists of around 30k annotated low altitute traffic surveillance images with 8 object categories.

In [2]:
import os
import json
import torch
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import YolosForObjectDetection, YolosConfig, YolosImageProcessor
from transformers import Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback, WandbCallback
from transformers.trainer_callback import EarlyStoppingCallback
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
import logging
import random
import wandb
import datetime

# Set up logging
## no need
##logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
##logger = logging.getLogger(__name__)

# Configuration
DATA_DIR = "auair2019data"
IMAGES_DIR = os.path.join(DATA_DIR, "images")
ANNOTATIONS_FILE = os.path.join(DATA_DIR, "annotations.json")
OUTPUT_DIR = "yolos_finetuned"
MODEL_CHECKPOINT = "hustvl/yolos-small"
BATCH_SIZE = 4  # Smaller batch size to accommodate more images
LEARNING_RATE = 3e-5
NUM_EPOCHS = 50  # More epochs for large dataset
WARMUP_RATIO = 0.05  # Warmup ratio instead of fixed steps for large dataset
GRAD_ACCUMULATION_STEPS = 16  # Increased for effective batch size of 64
NUM_WORKERS = 8  # For data loading
EVAL_STEPS = 2000  # Evaluate less frequently
SAVE_STEPS = 2000  # Save less frequently
EARLY_STOPPING_PATIENCE = 5  # Stop if no improvement for 5 evaluations

# Wandb configuration
WANDB_PROJECT = "object_detection_transformer"
WANDB_NAME = f"yolos-small-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"


# Seed everything for reproducibility
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()

# Initialize wandb
print("Initializing wandb...")
wandb.init(
    project=WANDB_PROJECT,
    name=WANDB_NAME,
    config={
        "model_checkpoint": MODEL_CHECKPOINT,
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "epochs": NUM_EPOCHS,
        "warmup_ratio": WARMUP_RATIO,
        "grad_accumulation_steps": GRAD_ACCUMULATION_STEPS,
        "effective_batch_size": BATCH_SIZE * GRAD_ACCUMULATION_STEPS,
        "early_stopping_patience": EARLY_STOPPING_PATIENCE,
    }
)

# Load annotations
print("Loading annotations...")
with open(ANNOTATIONS_FILE, 'r') as f:
    data = json.load(f)

# Extract category information
categories = {idx: cat for idx, cat in enumerate(data['categories'], start=0)}
id2label = {k: v for k, v in categories.items()}
label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)

print(f"Found {num_labels} categories: {id2label}")
wandb.config.update({"num_classes": num_labels, "classes": list(id2label.values())})

annotations = data["annotations"]

# class distribution
category_counts = {class_name: 0 for class_name in id2label.values()}
for image_data in annotations:
    if "bbox" in image_data:
        for bbox in image_data["bbox"]:
            class_id = bbox["class"]
            if class_id in id2label:
                category_counts[id2label[class_id]] += 1

wandb.log({"class_distribution": wandb.Table(
    columns=["Category", "Count"],
    data=[[cat, count] for cat, count in category_counts.items()]
)})



Initializing wandb...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myildizz-nisan[0m ([33myildizz-nisan-middle-east-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading annotations...
Found 8 categories: {0: 'Human', 1: 'Car', 2: 'Truck', 3: 'Van', 4: 'Motorbike', 5: 'Bicycle', 6: 'Bus', 7: 'Trailer'}


In [22]:
import os
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset

class CustomObjectDetectionDataset(Dataset):
    def __init__(self, annotations, img_dir, processor, transform=None):
        self.annotations = annotations
        self.img_dir = img_dir
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # Load annotation
        ann_data = self.annotations[idx]
        img_path = os.path.join(self.img_dir, ann_data["image_name"])

        # Load image safely
        try:
            image = Image.open(img_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image {img_path}: {e}")
            image = Image.new('RGB', (640, 640), color='gray')
            ann_data["bbox"] = []

        image_width, image_height = image.size

        # Prepare boxes & labels
        boxes = []
        labels = []

        for bbox in ann_data["bbox"]:
            x_min = bbox["left"]
            y_min = bbox["top"]
            width = bbox["width"]
            height = bbox["height"]

            x_max = x_min + width
            y_max = y_min + height

            # Normalize to [0, 1]
            x_min = max(0, x_min / image_width)
            y_min = max(0, y_min / image_height)
            x_max = min(1, x_max / image_width)
            y_max = min(1, y_max / image_height)

            boxes.append([x_min, y_min, x_max, y_max])
            labels.append(bbox["class"])

        boxes = np.array(boxes, dtype=np.float32)
        labels = np.array(labels, dtype=np.int64)

        # Albumentations transform
        if self.transform:
            transformed = self.transform(image=np.array(image), bboxes=boxes.tolist(), labels=labels.tolist())
            image = transformed['image']
            boxes = np.array(transformed['bboxes'], dtype=np.float32)
            labels = np.array(transformed['labels'], dtype=np.int64)

            # Clip out-of-bounds
            boxes = np.clip(boxes, 0.0, 1.0)

            # Filter out invalid boxes
            valid_boxes = []
            valid_labels = []
            for box, label in zip(boxes, labels):
                x_min, y_min, x_max, y_max = box
                if x_max > x_min and y_max > y_min:
                    valid_boxes.append([x_min, y_min, x_max, y_max])
                    valid_labels.append(label)

            boxes = np.array(valid_boxes, dtype=np.float32)
            labels = np.array(valid_labels, dtype=np.int64)

        # Construct annotations for processor
        annotations = []
        for box, label in zip(boxes, labels):
            x_min, y_min, x_max, y_max = box
            area = (x_max - x_min) * (y_max - y_min)
            annotations.append({
                'bbox': [x_min, y_min, x_max, y_max],
                'category_id': int(label),
                'area': float(area),
                'iscrowd': 0
            })

        # Feed image + annotations into YOLOS processor
        encoding = self.processor(
            images=image,
            annotations={
                'image_id': idx,
                'annotations': annotations
            },
            return_tensors="pt"
        )

        # Remove batch dimension
        for k, v in encoding.items():
            if isinstance(v, torch.Tensor):
                encoding[k] = v.squeeze(0)  # remove batch dim only
            else:
                encoding[k] = v  # leave non-tensor items as-is

        return encoding


In [24]:

# Initialize image processor
print(f"Loading YOLOS image processor from {MODEL_CHECKPOINT}")
processor = YolosImageProcessor.from_pretrained(MODEL_CHECKPOINT)

# Define data augmentations

train_transform = A.Compose(
    [
        A.Resize(640, 640),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        A.ShiftScaleRotate(
            shift_limit=0.05, scale_limit=0.1, rotate_limit=10, border_mode=0, p=0.5
        ),
    ],
    bbox_params=A.BboxParams(
        format='yolo',              #
        label_fields=['labels'],
        min_visibility=0.1,         # Discard boxes too small or barely visible
        clip=True,            # clips bboxes to [0, 1]
    )
)

val_transform = A.Compose(
    [
        A.Resize(640, 640)  #
    ],
    bbox_params=A.BboxParams(
        format='yolo',
        label_fields=['labels'],
        clip=True,
    )
)


# Log augmentation pipeline to wandb
wandb.config.update({
    "augmentations": str(train_transform),
})

# Create dataset
print("Creating datasets...")
full_dataset = CustomObjectDetectionDataset(
    annotations,
    IMAGES_DIR,
    processor
)

# Split dataset
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])

# Apply transforms
train_dataset.dataset.transform = train_transform
val_dataset.dataset.transform = val_transform

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

# Log data split info
wandb.config.update({
    "train_size": len(train_dataset),
    "val_size": len(val_dataset),
    "train_val_ratio": train_size / val_size if val_size > 0 else "N/A",
})


Loading YOLOS image processor from hustvl/yolos-small
Creating datasets...
Train dataset size: 26258
Validation dataset size: 6565


In [25]:

# Custom data collator to handle batching and padding
def custom_collate_fn(batch):
    # Separate inputs and labels
    pixel_values = torch.stack([item['pixel_values'] for item in batch])
    labels = [item['labels'] for item in batch]
    pixel_mask = torch.stack([item['pixel_mask'] for item in batch]) if 'pixel_mask' in batch[0] else None

    # Create batch dictionary
    batch_dict = {
        'pixel_values': pixel_values,
        'labels': labels,
    }

    if pixel_mask is not None:
        batch_dict['pixel_mask'] = pixel_mask

    return batch_dict

# Get model with updated config for your number of classes
print(f"Loading YOLOS model from {MODEL_CHECKPOINT}")
config = YolosConfig.from_pretrained(MODEL_CHECKPOINT, id2label=id2label, label2id=label2id)
model = YolosForObjectDetection.from_pretrained(MODEL_CHECKPOINT, config=config,ignore_mismatched_sizes= True)

# Log model architecture summary to wandb
wandb.config.update({
    "model_params": sum(p.numel() for p in model.parameters()),
    "trainable_params": sum(p.numel() for p in model.parameters() if p.requires_grad),
})

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=False,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=100,
    fp16=True,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    save_total_limit=3,
    dataloader_num_workers=NUM_WORKERS,
    dataloader_drop_last=True,
    dataloader_pin_memory=True,
    report_to="wandb",
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,
)

# Initialize Trainer with early stopping and wandb
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=custom_collate_fn,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE),
    ],
)

# Train the model
print("Starting training...")
train_result = trainer.train()

# Save the final model
final_model_path = os.path.join(OUTPUT_DIR, "final_model")
trainer.save_model(final_model_path)
print(f"Model saved to {final_model_path}")

# Create a validation dataset subset for final evaluation
print("Performing final evaluation...")
eval_subset_size = min(1000, len(val_dataset))
eval_subset_indices = torch.randperm(len(val_dataset))[:eval_subset_size]
eval_subset = torch.utils.data.Subset(val_dataset, eval_subset_indices)

# Evaluate the model on the validation subset
eval_results = trainer.evaluate(eval_dataset=eval_subset)
print(f"Final evaluation results: {eval_results}")


# Finish wandb run
wandb.finish()


Loading YOLOS model from hustvl/yolos-small


Some weights of YolosForObjectDetection were not initialized from the model checkpoint at hustvl/yolos-small and are newly initialized because the shapes did not match:
- class_labels_classifier.layers.2.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([9]) in the model instantiated
- class_labels_classifier.layers.2.weight: found shape torch.Size([92, 384]) in the checkpoint and torch.Size([9, 384]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


TypeError: list indices must be integers or slices, not str