In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/DI725/DI725-assignment2/

Mounted at /content/drive
/content/drive/MyDrive/DI725/DI725-assignment2


We will be working with the [AU-AIR dataset](https://bozcani.github.io/auairdataset), which consists of around 30k annotated low altitute traffic surveillance images with 8 object categories.

In [2]:
import os
import json
import torch
import numpy as np
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import YolosForObjectDetection, YolosConfig, YolosImageProcessor
from transformers import Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback, WandbCallback
from transformers.trainer_callback import EarlyStoppingCallback
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
import random
import wandb
import datetime
import pickle


# Configuration
DATA_DIR = "auair2019data"
IMAGES_DIR = os.path.join(DATA_DIR, "images")
ANNOTATIONS_FILE = os.path.join(DATA_DIR, "annotations.json")
OUTPUT_DIR = "yolos_finetuned"
MODEL_CHECKPOINT = "hustvl/yolos-small"
BATCH_SIZE = 4  # Smaller batch size to accommodate more images
LEARNING_RATE = 3e-5
NUM_EPOCHS = 20  # More epochs for large dataset
WARMUP_RATIO = 0.05  # Warmup ratio instead of fixed steps for large dataset
GRAD_ACCUMULATION_STEPS = 16  # for effective batch size of 64
NUM_WORKERS = 8  # For data loading
EVAL_STEPS = 200
SAVE_STEPS = 2000
EARLY_STOPPING_PATIENCE = 4  # Stop if no improvement for 4 evaluations

# Wandb configuration
WANDB_PROJECT = "object_detection_transformer"
WANDB_NAME = f"yolos-small-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}"


# Seed everything for reproducibility
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()

# Initialize wandb
print("Initializing wandb...")
wandb.init(
    project=WANDB_PROJECT,
    name=WANDB_NAME,
    config={
        "model_checkpoint": MODEL_CHECKPOINT,
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "epochs": NUM_EPOCHS,
        "warmup_ratio": WARMUP_RATIO,
        "grad_accumulation_steps": GRAD_ACCUMULATION_STEPS,
        "effective_batch_size": BATCH_SIZE * GRAD_ACCUMULATION_STEPS,
        "early_stopping_patience": EARLY_STOPPING_PATIENCE,
    }
)

# Load annotations
print("Loading annotations...")
with open(ANNOTATIONS_FILE, 'r') as f:
    data = json.load(f)

# Extract category information
categories = {idx: cat for idx, cat in enumerate(data['categories'], start=0)}
id2label = {k: v for k, v in categories.items()}
label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)

print(f"Found {num_labels} categories: {id2label}")
wandb.config.update({"num_classes": num_labels, "classes": list(id2label.values())})

annotations = data["annotations"]

# class distribution
category_counts = {class_name: 0 for class_name in id2label.values()}
for image_data in annotations:
    if "bbox" in image_data:
        for bbox in image_data["bbox"]:
            class_id = bbox["class"]
            if class_id in id2label:
                category_counts[id2label[class_id]] += 1

#wandb.log({"class_distribution": wandb.Table(
#    columns=["Category", "Count"],
#    data=[[cat, count] for cat, count in category_counts.items()]
#)})



Initializing wandb...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myildizz-nisan[0m ([33myildizz-nisan-middle-east-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading annotations...
Found 8 categories: {0: 'Human', 1: 'Car', 2: 'Truck', 3: 'Van', 4: 'Motorbike', 5: 'Bicycle', 6: 'Bus', 7: 'Trailer'}


In [3]:
import os
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset

class CustomObjectDetectionDataset(Dataset):
    def __init__(self, annotations, img_dir, processor, transform=None):
        self.annotations = annotations
        self.img_dir = img_dir
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # Load annotation
        ann_data = self.annotations[idx]
        img_path = os.path.join(self.img_dir, ann_data["image_name"])

        image = Image.open(img_path)

        image_width, image_height = image.size

        # Prepare boxes & labels
        boxes = []
        labels = []

        for bbox in ann_data["bbox"]:
            x_min = bbox["left"]
            y_min = bbox["top"]
            width = bbox["width"]
            height = bbox["height"]

            x_max = x_min + width
            y_max = y_min + height


            # Only add valid boxes (where max > min) before any transformations
            if x_max > x_min and y_max > y_min:
                boxes.append([x_min, y_min, width, height])
                labels.append(bbox["class"])

        boxes = np.array(boxes, dtype=np.float32) if boxes else np.zeros((0, 4), dtype=np.float32)
        labels = np.array(labels, dtype=np.int64) if labels else np.zeros((0,), dtype=np.int64)

        # Albumentations transform
        if self.transform and len(boxes) > 0:
            try:
                transformed = self.transform(image=np.array(image), bboxes=boxes.tolist(), labels=labels.tolist())
                image = transformed['image']
                boxes = transformed['bboxes']
                labels = transformed['labels']

                # Handle empty transform result
                if not boxes:
                    boxes = np.zeros((0, 4), dtype=np.float32)
                    labels = np.zeros((0,), dtype=np.int64)
                else:
                    boxes = np.array(boxes, dtype=np.float32)
                    labels = np.array(labels, dtype=np.int64)

                    # Clip out-of-bounds
                    boxes = np.clip(boxes, 0.0, 1.0)

                    # Filter out invalid boxes
                    valid_indices = []
                    for i, box in enumerate(boxes):
                        x_min, y_min, width, height = box
                        x_max = x_min + width
                        y_max = y_min + height
                        if x_max > x_min and y_max > y_min:
                            valid_indices.append(i)

                    boxes = boxes[valid_indices] if valid_indices else np.zeros((0, 4), dtype=np.float32)
                    labels = labels[valid_indices] if valid_indices else np.zeros((0,), dtype=np.int64)

            except Exception as e:
                print(f"Error during transformation: {e}")
                # Fall back to original image and empty boxes/labels
                image = np.array(image)
                boxes = np.zeros((0, 4), dtype=np.float32)
                labels = np.zeros((0,), dtype=np.int64)
        elif self.transform:
            # If there are no boxes but we have a transform
            image = self.transform(image=np.array(image))['image']

        # Construct annotations for processor
        annotations = []
        for i, (box, label) in enumerate(zip(boxes, labels)):
            x_min, y_min, width, height = box

            area = (width) * (height)
            annotations.append({
                'bbox': [x_min, y_min, width, height],
                'category_id': int(label),
                'area': float(area),
                'iscrowd': 0
            })

        # Feed image + annotations into YOLOS processor
        encoding = self.processor(
            images=image,
            annotations={
                'image_id': idx,
                'annotations': annotations
            },
            return_tensors="pt"
        )

        # Remove batch dimension
        for k, v in encoding.items():
            if isinstance(v, torch.Tensor):
                encoding[k] = v.squeeze(0)  # remove batch dim only
            else:
                encoding[k] = v  # leave non-tensor items as-is

        return encoding


def collate_fn(batch):
    batch_dict = {
        "pixel_values": torch.stack([item["pixel_values"] for item in batch])
    }

    # For YOLOS, we need to keep labels as a list of label dictionaries
    if "labels" in batch[0]:
        batch_dict["labels"] = [item["labels"][0] for item in batch]

    return batch_dict

In [4]:

# Initialize image processor
print(f"Loading YOLOS image processor from {MODEL_CHECKPOINT}")
processor = YolosImageProcessor.from_pretrained(MODEL_CHECKPOINT)

# Define data augmentations

train_transform = A.Compose(
    [
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        A.ShiftScaleRotate(
            shift_limit=0.05, scale_limit=0.1, rotate_limit=10, border_mode=0, p=0.5
        ),
    ],
    bbox_params=A.BboxParams(
        format='coco',
        label_fields=['labels'],
        min_visibility=0.1,         # Discard boxes too small or barely visible
        clip=True,            # clips bboxes to [0, 1]
    )
)

val_transform = A.Compose(
    [],
    bbox_params=A.BboxParams(
        format='coco',
        label_fields=['labels'],
        clip=True,
    )
)


# Log augmentation pipeline to wandb
wandb.config.update({
    "augmentations": str(train_transform),
})

# Create dataset
print("Creating datasets...")
full_dataset = CustomObjectDetectionDataset(
    annotations,
    IMAGES_DIR,
    processor
)

# If we haven't already split the dataset:
if not os.path.exists(f"{OUTPUT_DIR}/dataset_splits.pkl"):

    # subsetting the data because it is too big
    subset_size = int(len(full_dataset) * 0.3)

    # Create random subset indices
    indices = torch.randperm(len(full_dataset))[:subset_size]

    # Create subset
    subset_dataset = torch.utils.data.Subset(full_dataset, indices)

    # Split dataset
    train_size = int(0.7 * len(subset_dataset))
    remaining = len(subset_dataset) - train_size
    val_size = remaining // 2      # 15% of subset
    test_size = remaining - val_size  # 15% of subset
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(subset_dataset, [train_size, val_size, test_size])

    # Save splits using indices for reproducibility
    split_indices = {
        "subset_indices": indices,
        "train_indices": train_dataset.indices,
        "val_indices": val_dataset.indices,
        "test_indices": test_dataset.indices
    }

    with open(f"{OUTPUT_DIR}/dataset_splits.pkl", "wb") as f:
        pickle.dump(split_indices, f)

with open(f"{OUTPUT_DIR}/dataset_splits.pkl", "rb") as f:
    loaded_splits = pickle.load(f)

#subset the data
subset_dataset = torch.utils.data.Subset(full_dataset, loaded_splits["subset_indices"])

# Recreate datasets using original dataset
train_dataset = torch.utils.data.Subset(subset_dataset, loaded_splits["train_indices"])
val_dataset = torch.utils.data.Subset(subset_dataset, loaded_splits["val_indices"])
test_dataset = torch.utils.data.Subset(subset_dataset, loaded_splits["test_indices"])


# Apply transforms
train_dataset.dataset.transform = train_transform
val_dataset.dataset.transform = val_transform
test_dataset.dataset.transform = val_transform

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


# Log data split info
wandb.config.update({
    "train_size": len(train_dataset),
    "val_size": len(val_dataset),
    "train_val_ratio": len(train_dataset) / len(val_dataset) if len(val_dataset) > 0 else "N/A",
})



Loading YOLOS image processor from hustvl/yolos-small


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

  original_init(self, **validated_kwargs)
  self._set_keys()


Creating datasets...
Train dataset size: 6892
Validation dataset size: 1477
Test dataset size: 1477


## Fine-tuning the YOLOS-small model

In [5]:
from transformers import TrainerCallback


# Get model with updated config for your number of classes
print(f"Loading YOLOS model from {MODEL_CHECKPOINT}")
config = YolosConfig.from_pretrained(MODEL_CHECKPOINT, id2label=id2label, label2id=label2id)
model = YolosForObjectDetection.from_pretrained(MODEL_CHECKPOINT, config=config, ignore_mismatched_sizes=True)

# Log model architecture summary to wandb
wandb.config.update({
    "model_params": sum(p.numel() for p in model.parameters()),
    "trainable_params": sum(p.numel() for p in model.parameters() if p.requires_grad),
})

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Create a custom callback to log losses to wandb and print progress
class WandbLoggingCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is not None:
            # Calculate epoch from global step
            epoch = state.epoch

            # Print evaluation information
            print(f"\n=== Evaluation at Step {state.global_step} (Epoch {epoch:.2f}) ===")
            for key, value in metrics.items():
                if key != "epoch":
                    print(f"Eval {key}: {value:.5f}")

            # Log eval metrics to wandb
            wandb.log(
                {f"eval/{k}": v for k, v in metrics.items() if k != "epoch"},
                step=state.global_step
            )

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Log training metrics
        if logs is not None:
            # Print training information
            if "loss" in logs:
                print(f"Step {state.global_step} (Epoch {logs.get('epoch', state.epoch):.2f}) - Training loss: {logs['loss']:.5f}")

            # Filter out eval metrics which are handled in on_evaluate
            train_logs = {k: v for k, v in logs.items() if not k.startswith("eval_")}
            wandb.log(
                {f"train/{k}": v for k, v in train_logs.items() if k != "epoch"},
                step=state.global_step
            )

# Define training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=False,
    fp16=True,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    save_total_limit=3,
    dataloader_num_workers=NUM_WORKERS,
    dataloader_drop_last=True,
    dataloader_pin_memory=True,
    report_to="wandb",
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,
    logging_strategy="steps",  # Log at the same frequency as evaluation
    logging_steps=EVAL_STEPS // 5,  # Log more frequently than evaluation for better tracking
    logging_first_step=True,  # Log the first step to get initial loss
)

# Initialize Trainer with early stopping and wandb logging
print("Initializing the Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE),
        WandbLoggingCallback(),  # Add custom wandb logging callback
    ],
)

# Train the model
print("\n" + "="*50)
print("Starting training...")
print(f"Number of epochs: {NUM_EPOCHS}")
print(f"Training batch size: {BATCH_SIZE}")
print(f"Evaluation steps: Every {EVAL_STEPS} steps")
print(f"Saving steps: Every {SAVE_STEPS} steps")
print(f"Early stopping patience: {EARLY_STOPPING_PATIENCE} evaluations")
print("="*50 + "\n")

train_result = trainer.train()

# Print training summary
print("\n" + "="*50)
print("Training completed!")
print(f"Total steps: {trainer.state.global_step}")
print(f"Final training loss: {train_result.training_loss:.5f}")
print("="*50 + "\n")

# Save the final model
final_model_path = os.path.join(OUTPUT_DIR, "final_model")
trainer.save_model(final_model_path)
print(f"Model saved to {final_model_path}")

# Create a validation dataset subset for final evaluation
print("\nPerforming final evaluation...")
eval_subset_size = min(1000, len(val_dataset))
eval_subset_indices = torch.randperm(len(val_dataset))[:eval_subset_size]
eval_subset = torch.utils.data.Subset(val_dataset, eval_subset_indices)

# Evaluate the model on the validation subset
eval_results = trainer.evaluate(eval_dataset=eval_subset)
print("\n" + "="*50)
print("Final evaluation results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.5f}")
print("="*50 + "\n")

# Finish wandb run
print("Finishing wandb run...")
wandb.finish()
print("Training complete!")

Loading YOLOS model from hustvl/yolos-small


config.json:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/123M [00:00<?, ?B/s]

Some weights of YolosForObjectDetection were not initialized from the model checkpoint at hustvl/yolos-small and are newly initialized because the shapes did not match:
- class_labels_classifier.layers.2.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([9]) in the model instantiated
- class_labels_classifier.layers.2.weight: found shape torch.Size([92, 384]) in the checkpoint and torch.Size([9, 384]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initializing the Trainer...

Starting training...
Number of epochs: 20
Training batch size: 4
Evaluation steps: Every 200 steps
Saving steps: Every 2000 steps
Early stopping patience: 4 evaluations





Step,Training Loss,Validation Loss
200,1.561,1.563906
400,1.4513,1.481082
600,1.4115,1.470901
800,1.3625,1.430606
1000,1.3191,1.401126
1200,1.2956,1.39647
1400,1.2952,1.380664
1600,1.2749,1.380914
1800,1.2592,1.371868
2000,1.2115,1.364194


Step 1 (Epoch 0.01) - Training loss: 3.70200
Step 40 (Epoch 0.37) - Training loss: 3.29850
Step 80 (Epoch 0.74) - Training loss: 2.07160
Step 120 (Epoch 1.11) - Training loss: 1.69690
Step 160 (Epoch 1.48) - Training loss: 1.59920
Step 200 (Epoch 1.85) - Training loss: 1.56100

=== Evaluation at Step 200 (Epoch 1.85) ===
Eval eval_loss: 1.56391
Eval eval_runtime: 102.09700
Eval eval_samples_per_second: 14.46700
Eval eval_steps_per_second: 3.62400




Step 240 (Epoch 2.22) - Training loss: 1.48300
Step 280 (Epoch 2.59) - Training loss: 1.50070
Step 320 (Epoch 2.97) - Training loss: 1.48460
Step 360 (Epoch 3.33) - Training loss: 1.43690
Step 400 (Epoch 3.71) - Training loss: 1.45130

=== Evaluation at Step 400 (Epoch 3.71) ===
Eval eval_loss: 1.48108
Eval eval_runtime: 29.42700
Eval eval_samples_per_second: 50.19200
Eval eval_steps_per_second: 12.57300




Step 440 (Epoch 4.07) - Training loss: 1.40470
Step 480 (Epoch 4.45) - Training loss: 1.40850
Step 520 (Epoch 4.82) - Training loss: 1.43060
Step 560 (Epoch 5.19) - Training loss: 1.39290
Step 600 (Epoch 5.56) - Training loss: 1.41150

=== Evaluation at Step 600 (Epoch 5.56) ===
Eval eval_loss: 1.47090
Eval eval_runtime: 29.48000
Eval eval_samples_per_second: 50.10200
Eval eval_steps_per_second: 12.55100




Step 640 (Epoch 5.93) - Training loss: 1.43230
Step 680 (Epoch 6.30) - Training loss: 1.34800
Step 720 (Epoch 6.67) - Training loss: 1.37660
Step 760 (Epoch 7.04) - Training loss: 1.37110
Step 800 (Epoch 7.41) - Training loss: 1.36250

=== Evaluation at Step 800 (Epoch 7.41) ===
Eval eval_loss: 1.43061
Eval eval_runtime: 29.45300
Eval eval_samples_per_second: 50.14800
Eval eval_steps_per_second: 12.56200




Step 840 (Epoch 7.78) - Training loss: 1.36000
Step 880 (Epoch 8.15) - Training loss: 1.32700
Step 920 (Epoch 8.52) - Training loss: 1.35130
Step 960 (Epoch 8.89) - Training loss: 1.34010
Step 1000 (Epoch 9.26) - Training loss: 1.31910

=== Evaluation at Step 1000 (Epoch 9.26) ===
Eval eval_loss: 1.40113
Eval eval_runtime: 29.54950
Eval eval_samples_per_second: 49.98400
Eval eval_steps_per_second: 12.52100




Step 1040 (Epoch 9.63) - Training loss: 1.32050
Step 1080 (Epoch 10.00) - Training loss: 1.32350
Step 1120 (Epoch 10.37) - Training loss: 1.30920
Step 1160 (Epoch 10.74) - Training loss: 1.29990
Step 1200 (Epoch 11.11) - Training loss: 1.29560

=== Evaluation at Step 1200 (Epoch 11.11) ===
Eval eval_loss: 1.39647
Eval eval_runtime: 29.57420
Eval eval_samples_per_second: 49.94200
Eval eval_steps_per_second: 12.51100




Step 1240 (Epoch 11.48) - Training loss: 1.28470
Step 1280 (Epoch 11.85) - Training loss: 1.28530
Step 1320 (Epoch 12.22) - Training loss: 1.28790
Step 1360 (Epoch 12.59) - Training loss: 1.28070
Step 1400 (Epoch 12.97) - Training loss: 1.29520

=== Evaluation at Step 1400 (Epoch 12.97) ===
Eval eval_loss: 1.38066
Eval eval_runtime: 29.57510
Eval eval_samples_per_second: 49.94100
Eval eval_steps_per_second: 12.51100




Step 1440 (Epoch 13.33) - Training loss: 1.26710
Step 1480 (Epoch 13.71) - Training loss: 1.26660
Step 1520 (Epoch 14.07) - Training loss: 1.27330
Step 1560 (Epoch 14.45) - Training loss: 1.28180
Step 1600 (Epoch 14.82) - Training loss: 1.27490

=== Evaluation at Step 1600 (Epoch 14.82) ===
Eval eval_loss: 1.38091
Eval eval_runtime: 29.65340
Eval eval_samples_per_second: 49.80900
Eval eval_steps_per_second: 12.47700




Step 1640 (Epoch 15.19) - Training loss: 1.22990
Step 1680 (Epoch 15.56) - Training loss: 1.25860
Step 1720 (Epoch 15.93) - Training loss: 1.25020
Step 1760 (Epoch 16.30) - Training loss: 1.23480
Step 1800 (Epoch 16.67) - Training loss: 1.25920

=== Evaluation at Step 1800 (Epoch 16.67) ===
Eval eval_loss: 1.37187
Eval eval_runtime: 29.61300
Eval eval_samples_per_second: 49.87700
Eval eval_steps_per_second: 12.49400




Step 1840 (Epoch 17.04) - Training loss: 1.21550
Step 1880 (Epoch 17.41) - Training loss: 1.24720
Step 1920 (Epoch 17.78) - Training loss: 1.22770
Step 1960 (Epoch 18.15) - Training loss: 1.22340
Step 2000 (Epoch 18.52) - Training loss: 1.21150

=== Evaluation at Step 2000 (Epoch 18.52) ===
Eval eval_loss: 1.36419
Eval eval_runtime: 29.60720
Eval eval_samples_per_second: 49.88700
Eval eval_steps_per_second: 12.49700




Step 2040 (Epoch 18.89) - Training loss: 1.21830
Step 2080 (Epoch 19.26) - Training loss: 1.20570
Step 2120 (Epoch 19.63) - Training loss: 1.20610

Training completed!
Total steps: 2140
Final training loss: 1.38629

Model saved to yolos_finetuned/final_model

Performing final evaluation...



=== Evaluation at Step 2140 (Epoch 19.82) ===
Eval eval_loss: 1.37559
Eval eval_runtime: 26.68600
Eval eval_samples_per_second: 37.47300
Eval eval_steps_per_second: 9.36800

Final evaluation results:
eval_loss: 1.37559
eval_runtime: 26.68600
eval_samples_per_second: 37.47300
eval_steps_per_second: 9.36800
epoch: 19.81718

Finishing wandb run...


0,1
eval/loss,█▅▅▃▂▂▂▂▁▁▁
eval/runtime,█▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁█████████▆
eval/steps_per_second,▁█████████▅
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇█████
train/grad_norm,▇▃▂▃▃▂▂▂▂▂▄█▃▃▂▁▂▃▅▂▂▄▂▂▃▁▁▁▂▃▂▂▃▂▂▂▂▂▁▃
train/learning_rate,▆████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁
train/loss,█▇▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/loss,1.37559
eval/runtime,26.686
eval/samples_per_second,37.473
eval/steps_per_second,9.368
total_flos,7.365561136055989e+19
train/epoch,19.81718
train/global_step,2140.0
train/grad_norm,22.06278
train/learning_rate,0.0
train/loss,1.2061


Training complete!


## Inference and scores

In [6]:
MODEL_PATH = "yolos_finetuned/final_model"
model = YolosForObjectDetection.from_pretrained(MODEL_PATH).to("cuda")


In [6]:
import matplotlib.pyplot as plt

In [7]:
#conf
IOU_THRESHOLD = 0.5

#auair classes
CLASSES = ['Human','Car','Truck','Van','Motorbike','Bicycle','Bus','Trailer']
# colors for visualization
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]


def plot_results(pil_img, prob, boxes):
    plt.figure(figsize=(16,10))
    plt.imshow(pil_img)
    ax = plt.gca()
    colors = COLORS * 100
    for p, (x_min, y_min, width, height), c in zip(prob, boxes.tolist(), colors):
        x_max = x_min + width
        y_max = y_min + height
        ax.add_patch(plt.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min,
                                   fill=False, color=c, linewidth=3))
        cl = p.argmax()
        text = f'{CLASSES[cl]}: {p[cl]:0.2f}'
        ax.text(x_min, y_min, text, fontsize=15,
                bbox=dict(facecolor='yellow', alpha=0.5))
    plt.axis('off')
    plt.show()

In [8]:
# for output bounding box post-processing

def box_xmymwh_to_xyxy(x):
    x_m, y_m, w, h = x.unbind(1)
    b = [x_m, y_m,
         (x_m + w), (y_m + h)]
    return torch.stack(b, dim=1).to('cuda')

def auair_bbox_to_xyxy(bbox):
    x_min = bbox['left']
    y_min = bbox['top']
    x_max = x_min + bbox['width']
    y_max = y_min + bbox['height']
    return [x_min, y_min, x_max, y_max]

def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = box_xmymwh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32).to('cuda')
    return b.to('cuda')

In [9]:
def calculate_iou(bbox1, bbox2):
    #bboxes are of form x_min, y_min, x_max, y_max
    x1 = max(bbox1[0], bbox2[0])
    y1 = max(bbox1[1], bbox2[1])
    x2 = min(bbox1[2], bbox2[2])
    y2 = min(bbox1[3], bbox2[3])

    if x2 < x1 or y2 < y1:
        return 0
    intersection = (y2 - y1) * (x2 - x1)
    area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
    union = area1 + area2 - intersection

    iou = intersection / union
    return iou


def find_closest_bbox(bbox, bboxes):
    max_iou = 0
    closest_bbox_index = None
    for i, bbox2 in enumerate(bboxes):
        iou = calculate_iou(bbox, bbox2)
        if iou > max_iou:
            max_iou = iou
            closest_bbox_index = i
    return closest_bbox_index, max_iou


In [10]:
test_annot = [annotations[i] for i in loaded_splits["test_indices"]]

In [11]:
test_dataset

<torch.utils.data.dataset.Subset at 0x7c9a7cebf210>

In [None]:
from transformers import pipeline

pipeline

In [13]:
image_paths = []
test_outputs = []
test_ground_truth_bbox = []
test_ground_truth_classes = []

model.eval()
with torch.no_grad():
    for annot in tqdm(test_annot, desc="Evaluating samples"):
        image_path = f'{IMAGES_DIR}/{annot["image_name"]}'
        img = Image.open(image_path)
        image_paths.append(image_path)

        # Process on GPU
        inputs = processor(images=img, return_tensors="pt").to("cuda")
        outputs = model(**inputs)

        # Move to CPU before storing
        cpu_outputs = {k: v.detach().cpu() if isinstance(v, torch.Tensor) else v for k, v in outputs.items()}
        test_outputs.append(cpu_outputs)

        ground_truth_bboxes = torch.Tensor([auair_bbox_to_xyxy(bbox) for bbox in annot['bbox']])  # Keep on CPU
        test_ground_truth_bbox.append(ground_truth_bboxes)

        ground_truth_classes = torch.Tensor([bbox['class'] for bbox in annot['bbox']]).to(torch.int64)  # Keep on CPU
        test_ground_truth_classes.append(ground_truth_classes)

        torch.cuda.empty_cache()


Evaluating samples: 100%|██████████| 1477/1477 [09:13<00:00,  2.67it/s]


KeyError: 'pred_logits'

In [15]:
for output in test_outputs:

    # keep only predictions with 0.7+ confidence
    probas = output['logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.7

    # convert boxes from [0; 1] to image scales
    output['bboxes_scaled_filtered'] = rescale_bboxes(output['pred_boxes'][0, keep], img.size)
    output['probas_filtered'] = probas[keep]

In [16]:
torch.cuda.empty_cache()

In [16]:
import pandas as pd
import math

results = pd.DataFrame(columns=['image_name', 'true_class', 'pred_class', 'iou', 'pred_confidence'])

for i in range(len(test_outputs)):
    image_path = image_paths[i]
    image_name = image_path.split('/')[-1]
    output = test_outputs[i]
    pred_probas = output['probas_filtered']
    pred_classes = [pred_probas[i].argmax() for i in range(len(pred_probas))]
    pred_bboxes = output['bboxes_scaled_filtered']
    ground_truth_bboxes = test_ground_truth_bbox[i]
    ground_truth_classes = [int(i) for i in test_ground_truth_classes[i]]

    considered_ground_truth_indexes = []
    for j in range(len(pred_bboxes)):
        bbox = pred_bboxes[j]
        pred = pred_classes[j]
        closest_bbox_index, iou = find_closest_bbox(bbox, ground_truth_bboxes)
        considered_ground_truth_indexes.append(closest_bbox_index)
        if closest_bbox_index is None:
            result = {'image_name': image_name, 'true_class': math.nan,
                      'pred_class': int(pred),'iou': float(iou), 'pred_confidence': math.nan}
        else:
            result = {'image_name': image_name, 'true_class': ground_truth_classes[closest_bbox_index],
                  'pred_class': int(pred),'iou': float(iou), 'pred_confidence': float(pred_probas[j][pred])}

        results = pd.concat([results, pd.DataFrame([result])], ignore_index=True)
    for i in range(len(ground_truth_bboxes)):
        if i not in considered_ground_truth_indexes:
            result = {'image_name': image_name, 'true_class': ground_truth_classes[i],
                      'pred_class': math.nan,'iou': math.nan, 'pred_confidence': math.nan}
            results = pd.concat([results, pd.DataFrame([result])], ignore_index=True)

  results = pd.concat([results, pd.DataFrame([result])], ignore_index=True)


In [17]:
results

Unnamed: 0,image_name,true_class,pred_class,iou,pred_confidence
0,frame_20190905091750_x_0002490.jpg,1,1,0.162151,0.912359
1,frame_20190905091750_x_0002490.jpg,1,1,0.088314,0.959211
2,frame_20190905091750_x_0002490.jpg,1,1,0.149410,0.872258
3,frame_20190905091750_x_0002490.jpg,7,2,0.189316,0.754010
4,frame_20190905091750_x_0002490.jpg,1,,,
...,...,...,...,...,...
5907,frame_20190905091750_xx_0001143.jpg,0,0,0.186102,0.768635
5908,frame_20190905091750_xx_0001143.jpg,1,,,
5909,frame_20190905091750_x_0004882.jpg,1,1,0.128156,0.956187
5910,frame_20190905091750_x_0004882.jpg,2,,,


In [18]:
from sklearn.metrics import auc, average_precision_score

def calculate_precision_recall(results, per_class = True, threshold = IOU_THRESHOLD):
    classes = results['true_class'].dropna().unique()
    TP = (results['true_class'] == results['pred_class']) & (results['iou'] >= threshold)
    FP = (results['iou'] < threshold) | (results['pred_class'] != results['true_class'])
    FN = (results['true_class'] != math.nan) & (results['pred_class'].isna())
    if per_class:
        precisions = []
        recalls = []
        for c in classes:
            tp = TP[results['true_class'] == c].sum()
            fp = FP[results['true_class'] == c].sum()
            fn = FN[results['true_class'] == c].sum()

            precision = tp / (tp + fp)
            recall = tp / (tp + fn)

            precisions.append(precision)
            recalls.append(recall)
        return precisions, recalls, classes

    else:
        tp = TP.sum()
        fp = FP.sum()
        fn = FN.sum()

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        return precision, recall

def calculate_map(results, per_class = True, threshold = IOU_THRESHOLD):
    if per_class:
        precisions_list_per_class = []
        recalls_list_per_class = []
        for threshold in np.arange(threshold,1,0.05):
            precisions, recalls, classes = calculate_precision_recall(results, per_class, threshold)
            precisions_list_per_class.append(precisions)
            recalls_list_per_class.append(recalls)
        map_per_class = []
        for i in range(len(classes)):
            precisions_list = []
            recalls_list = []
            for j in range(len(precisions_list_per_class)):
                precisions_list.append(precisions_list_per_class[j][i])
                recalls_list.append(recalls_list_per_class[j][i])
            map = round(auc(recalls_list, precisions_list),3)
            map_per_class.append(map)
        return map_per_class, classes
    else:
        precisions_list = []
        recalls_list = []
        for threshold in np.arange(threshold,1,0.05):
            precision, recall = calculate_precision_recall(results, per_class, threshold)

            precisions_list.append(precision)
            recalls_list.append(recall)

        map = round(auc(recalls_list, precisions_list),3)

        return map
    classes = results['true_class'].dropna().unique()

In [19]:
calculate_map(results, per_class = True, threshold = IOU_THRESHOLD)

([np.float64(0.0),
  np.float64(0.0),
  np.float64(0.0),
  np.float64(0.0),
  np.float64(0.0),
  np.float64(0.0),
  np.float64(0.0),
  np.float64(0.0)],
 array([1, 7, 2, 0, 6, 3, 5, 4], dtype=object))