In [None]:
# Clone the DINO repository; later cells reference the checkout as /content/DINO.
!git clone https://github.com/IDEA-Research/DINO.git

In [None]:
# Install the Python dependencies listed in the cloned repository's requirements file.
!pip install -r /content/DINO/requirements.txt

In [None]:
# Pin yapf to 0.40.1 on top of the requirements install.
# NOTE(review): presumably needed for compatibility with one of the repo's dependencies — confirm and record why.
!pip install yapf==0.40.1

In [None]:
# Keep NumPy below 1.24 (the version check further down reports 1.23.5).
!pip install 'numpy<1.24'



In [None]:
# Mount Google Drive; the checkpoint files used below are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
# Put the cloned repo on the import path so its top-level modules (main, util, datasets) can be imported later.
sys.path.append('/content/DINO')


In [None]:
# Step 1: Change to the directory that holds the ops package's setup.py
%cd /content/DINO/models/dino/ops

# Step 2: Build and install the package defined by that setup.py
!python setup.py build install

In [None]:
# Run the test script in the current directory (the ops directory entered by the previous cell).
!python test.py

In [None]:
# Confirm which NumPy version is active after the `numpy<1.24` pin above.
import numpy as np
print(np.__version__)

1.23.5


*Make sure an empty folder called COCODIR exists before running the cell below.*

In [None]:
import shutil
import os
import random
import json

# Base directory for COCO format in Google Colab
base_dir = '/content/DINO/COCODIR'
image_target_train_dir = os.path.join(base_dir, 'train2017')
image_target_val_dir = os.path.join(base_dir, 'val2017')
annotation_dir = os.path.join(base_dir, 'annotations')
train_annotation_path = os.path.join(annotation_dir, 'instances_train2017.json')
val_annotation_path = os.path.join(annotation_dir, 'instances_val2017.json')

# Create directories if they don't exist
os.makedirs(image_target_train_dir, exist_ok=True)
os.makedirs(image_target_val_dir, exist_ok=True)
os.makedirs(annotation_dir, exist_ok=True)  # Ensure annotation directory exists

# Load your custom dataset annotations
annotation_file = '/content/DINO/random_sample_mavi_2_gt.json'  # Adjust this path if necessary
with open(annotation_file, 'r') as f:
    coco_data = json.load(f)

# Split configuration. A fixed seed makes the train/val split identical on
# every run, so re-running this cell cannot silently change which images
# end up in which split.
SPLIT_SEED = 42
NUM_TRAIN_IMAGES = 160
NUM_VAL_IMAGES = 40

# Shuffle a copy with a dedicated seeded generator: the loaded annotation
# data keeps its original order and the global `random` state is untouched.
image_data = list(coco_data['images'])
random.Random(SPLIT_SEED).shuffle(image_data)

# Split the images into train (160 images) and val (40 images)
train_images = image_data[:NUM_TRAIN_IMAGES]
val_images = image_data[NUM_TRAIN_IMAGES:NUM_TRAIN_IMAGES + NUM_VAL_IMAGES]

# Slicing never fails on a short list, so make an undersized dataset visible.
if len(train_images) < NUM_TRAIN_IMAGES or len(val_images) < NUM_VAL_IMAGES:
    print(f"Warning: expected {NUM_TRAIN_IMAGES} train / {NUM_VAL_IMAGES} val images, "
          f"got {len(train_images)} / {len(val_images)}.")

# Function to copy images to target directory
def copy_images(images, target_dir,
                source_dir='/content/DINO/Pedestrian_dataset_for_internship_assignment'):
    """Copy the image files named by COCO image records into ``target_dir``.

    Args:
        images: Iterable of COCO image records; only ``'file_name'`` is read.
        target_dir: Directory that receives the copies. It is created on
            demand (including any sub-directory contained in ``file_name``).
        source_dir: Directory the files are read from. Defaults to the
            dataset location used by this notebook; adjust if necessary.

    Files that do not exist under ``source_dir`` are reported with a printed
    message and skipped.
    """
    for img_info in images:
        img_filename = img_info['file_name']
        src_path = os.path.join(source_dir, img_filename)
        dst_path = os.path.join(target_dir, img_filename)

        if os.path.exists(src_path):
            # Create the destination folder lazily so the function also works
            # when the caller has not prepared the directory tree beforehand.
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)
            shutil.copy(src_path, dst_path)
        else:
            print(f"Image not found: {src_path}")

# Copy the training-split images into COCODIR/train2017
copy_images(train_images, image_target_train_dir)

# Copy the validation-split images into COCODIR/val2017
copy_images(val_images, image_target_val_dir)

# Filter annotations based on the split images
def filter_annotations(images, annotations):
    """Return the annotations that belong to the given images.

    Args:
        images: Iterable of COCO image records; only ``'id'`` is read.
        annotations: Iterable of COCO annotation records; only ``'image_id'``
            is read.

    Returns:
        A new list holding, in their original order, the annotations whose
        ``'image_id'`` equals the ``'id'`` of one of ``images``.
    """
    # A set gives constant-time membership tests; with a list every
    # annotation would be compared against every image id.
    image_ids = {img['id'] for img in images}
    return [ann for ann in annotations if ann['image_id'] in image_ids]

# Keep, for each split, only the annotations whose image is part of that split
train_annotations = filter_annotations(train_images, coco_data['annotations'])
val_annotations = filter_annotations(val_images, coco_data['annotations'])

# Create new annotation files in COCO format
def create_annotation_file(images, annotations, save_path, source=None):
    """Write a COCO-style annotation JSON file for one dataset split.

    Args:
        images: Image records to store under ``'images'``.
        annotations: Annotation records to store under ``'annotations'``.
        save_path: Path of the JSON file to write (overwritten if present).
        source: Mapping that supplies ``'categories'`` (required) and,
            optionally, ``'info'`` and ``'licenses'``. Defaults to the
            notebook-level ``coco_data`` loaded from the original annotation
            file, which was previously the only possible source.
    """
    if source is None:
        # Backward-compatible default: fall back to the notebook global.
        source = coco_data

    new_coco_structure = {
        'images': images,
        'annotations': annotations,
        'categories': source['categories'],
        'info': source.get('info', {}),
        'licenses': source.get('licenses', [])
    }

    with open(save_path, 'w') as f:
        json.dump(new_coco_structure, f)
    print(f"Annotations saved to {save_path}")

# Write one annotation file per split under COCODIR/annotations
create_annotation_file(train_images, train_annotations, train_annotation_path)
create_annotation_file(val_images, val_annotations, val_annotation_path)

print("Train and validation images and annotations have been successfully created.")


Annotations saved to /content/DINO/COCODIR/annotations/instances_train2017.json
Annotations saved to /content/DINO/COCODIR/annotations/instances_val2017.json
Train and validation images and annotations have been successfully created.


In [None]:
# Work from the repository root for the evaluation cells that follow.
%cd /content/DINO

/content/DINO


# **Evaluation for 12 epoch setting**

In [None]:

# Evaluate the checkpoint used for the "12 epoch" setting on the dataset built above.
# The eval script receives the dataset root and the checkpoint file, in that order.
coco_path = "/content/DINO/COCODIR"
checkpoint_path = "/content/drive/MyDrive/checkpoint0011_4scale.pth"
eval_script_path = "/content/DINO/scripts/DINO_eval.sh"

!bash {eval_script_path} {coco_path} {checkpoint_path}


# **Evaluation for 24 epoch setting**

In [None]:

# Same evaluation as above, with the checkpoint used for the "24 epoch" setting.
coco_path = "/content/DINO/COCODIR"
checkpoint_path = "/content/drive/MyDrive/checkpoint0011_4scale24.pth"
eval_script_path = "/content/DINO/scripts/DINO_eval.sh"

!bash {eval_script_path} {coco_path} {checkpoint_path}


# **Evaluation for 36 epoch**

In [None]:

# Same evaluation as above, with the checkpoint used for the "36 epoch" setting.
coco_path = "/content/DINO/COCODIR"
checkpoint_path = "/content/drive/MyDrive/checkpoint0011_4scale36.pth"
eval_script_path = "/content/DINO/scripts/DINO_eval.sh"

!bash {eval_script_path} {coco_path} {checkpoint_path}


# **Average Precision (AP) values obtained from the validation set**

In [None]:
import subprocess
import re


def run_evaluation(coco_path, checkpoint_path, eval_script_path):
    """Run the evaluation shell script and return what it wrote to stdout.

    Args:
        coco_path: Dataset root, passed to the script as its first argument.
        checkpoint_path: Checkpoint file, passed as the second argument.
        eval_script_path: Path of the bash script to execute.

    Returns:
        The captured standard output as text. The script's exit status does
        not change the return value; a non-zero status is reported with a
        printed message that includes the tail of stderr.
    """
    # Pass an argument list and skip the shell: paths that contain spaces or
    # shell metacharacters then reach the script as single, literal arguments.
    command = ["bash", str(eval_script_path), str(coco_path), str(checkpoint_path)]
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        # Otherwise a failed run is indistinguishable from "no metrics found".
        print(f"Evaluation script exited with code {result.returncode}:\n{result.stderr[-2000:]}")
    return result.stdout


def extract_ap_metrics(output):
    """Parse the Average Precision rows of a COCO evaluation summary.

    Args:
        output: Text that contains summary lines of the form
            ``Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.461``.

    Returns:
        Dict mapping a metric label to its value as ``float``. Only rows that
        are found are included; the first occurrence of a row wins. A reported
        value of ``-1.000`` is returned as ``-1.0`` rather than dropped.
        NOTE(review): -1 is presumably the evaluator's "not available" marker
        (e.g. no objects of that size) — confirm before averaging over it.
    """
    # (result key, IoU field, area field) of the six AP rows at maxDets=100.
    rows = (
        ('AP@[IoU=0.50:0.95]', '0.50:0.95', 'all'),
        ('AP@[IoU=0.50]', '0.50', 'all'),
        ('AP@[IoU=0.75]', '0.75', 'all'),
        ('AP@Small', '0.50:0.95', 'small'),
        ('AP@Medium', '0.50:0.95', 'medium'),
        ('AP@Large', '0.50:0.95', 'large'),
    )

    ap_metrics = {}
    for key, iou, area in rows:
        # The IoU and area fields must be followed directly by the column
        # separator, so "0.50" cannot match the "0.50:0.95" row and "all"
        # cannot match inside "small". The value group accepts any decimal
        # number, including 1.000 and the negative -1.000.
        pattern = (
            r"Average Precision[^\n]*?IoU=" + re.escape(iou)
            + r"\s*\|\s*area=\s*" + area
            + r"\s*\|\s*maxDets=\s*100\s*\]\s*=\s*(-?\d+\.\d+)"
        )
        match = re.search(pattern, output)
        if match:
            ap_metrics[key] = float(match.group(1))

    return ap_metrics


# Dataset root and evaluation script shared by all runs
coco_path = "/content/DINO/COCODIR"
eval_script_path = "/content/DINO/scripts/DINO_eval.sh"


# Row label -> checkpoint file to evaluate.
# NOTE(review): the table recorded below this cell shows identical numbers for
# all three rows — confirm that these three files really hold different weights.
checkpoints = {
    "12_epoch": "/content/drive/MyDrive/checkpoint0011_4scale.pth",
    "24_epoch": "/content/drive/MyDrive/checkpoint0011_4scale24.pth",
    "36_epoch": "/content/drive/MyDrive/checkpoint0011_4scale36.pth"
}


# Run the evaluation once per checkpoint and keep the parsed AP values per row label
ap_results = {}
for epoch, checkpoint_path in checkpoints.items():
    print(f"Running evaluation for {epoch}...")
    output = run_evaluation(coco_path, checkpoint_path, eval_script_path)
    ap_results[epoch] = extract_ap_metrics(output)


import pandas as pd

# One row per checkpoint, one column per metric that was found in the output
df = pd.DataFrame.from_dict(ap_results, orient='index')
print(df)


Running evaluation for 12_epoch...
Running evaluation for 24_epoch...
Running evaluation for 36_epoch...
          AP@[IoU=0.50:0.95]  AP@[IoU=0.50]  AP@[IoU=0.75]  AP@Small  AP@Large
12_epoch               0.461          0.461          0.486     0.406     0.684
24_epoch               0.461          0.461          0.486     0.406     0.684
36_epoch               0.461          0.461          0.486     0.406     0.684


In [None]:
import os, sys
import torch, json
import numpy as np

from main import build_model_main
from util.slconfig import SLConfig
from datasets import build_dataset
from util.visualizer import COCOVisualizer
from util import box_ops

In [None]:
# Enter the ops directory again for the in-place extension build in the next cell.
%cd /content/DINO/models/dino/ops

/content/DINO/models/dino/ops


In [None]:
# Build the extension modules in place, i.e. next to their sources in the current directory.
!python setup.py build_ext --inplace

In [None]:
# Model configuration file from the repo and the checkpoint (on Drive) to load into it
model_config_path = "/content/DINO/config/DINO/DINO_4scale.py"
model_checkpoint_path = "/content/drive/MyDrive/checkpoint0011_4scale.pth"

In [None]:
# Build the model described by the config file, then load the checkpoint weights into it.
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
model, criterion, postprocessors = build_model_main(args)
# The checkpoint is loaded onto the CPU; the model is moved to the GPU in the inference cell.
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
# Switch to evaluation mode; assigning to `_` keeps the cell from echoing the model repr.
_ = model.eval()

In [None]:
# Load the class-id -> class-name table shipped with the repo.
# JSON object keys are strings, so they are converted to int for lookup by class id.
# Note: later cells rebind `id2name` to the single-class mapping {1: "person"}.
with open('/content/DINO/util/coco_id2name.json') as f:
    id2name = json.load(f)
    id2name = {int(k):v for k,v in id2name.items()}

**Visualize images from a dataloader**

In [None]:

# Point the model config at the COCO-style dataset created earlier in this notebook.
args.dataset_file = 'coco'
args.coco_path = "/content/DINO/COCODIR"
args.fix_size = False

# Build the validation dataset
dataset_val = build_dataset(image_set='val', args=args)


data_aug_params: {
  "scales": [
    480,
    512,
    544,
    576,
    608,
    640,
    672,
    704,
    736,
    768,
    800
  ],
  "max_size": 1333,
  "scales2_resize": [
    400,
    500,
    600
  ],
  "scales2_crop": [
    384,
    600
  ]
}
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


**Get an Example and Visualize it**

In [None]:
# Take the validation sample at index 9: the image and its annotation dict.
image, targets = dataset_val[9]

**The image being generated after running the below cell is the actual result**

In [None]:
# Define id2name for the single class.
# This rebinds the notebook-level `id2name` that was loaded from coco_id2name.json above.
id2name = {1: "person"}  # Assuming class ID for "person" is 1

# Build gt_dict for visualization: one text label per ground-truth box.
# A label id other than 1 would raise KeyError here.
box_label = [id2name[int(item)] for item in targets['labels']]
gt_dict = {
    'boxes': targets['boxes'],
    'image_id': targets['image_id'],
    'size': targets['size'],
    'box_label': box_label,
}

# Visualize (savedir=None: nothing is written to disk)
vslzr = COCOVisualizer()
vslzr.visualize(image, gt_dict, savedir=None)


**Visualize Model Predictions**

In [None]:
# Inference only: run under no_grad so autograd does not record the forward
# pass (no graph is kept alive, and the outputs do not require gradients).
with torch.no_grad():
    # image[None] adds a leading batch dimension of size 1.
    output = model.cuda()(image[None].cuda())
    # NOTE(review): the [[1.0, 1.0]] target size presumably keeps the boxes in
    # normalized coordinates — confirm against the 'bbox' postprocessor.
    output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [None]:
# Minimum score a detection needs to be kept
thershold = 0.3

scores = output['scores']
labels = output['labels']
# Convert the predicted boxes with box_ops.box_xyxy_to_cxcywh before visualization
boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])
# Boolean mask over the detections: True where the score exceeds the threshold
select_mask = scores > thershold

**The image generated by running the cell below is the predicted result**

In [None]:

id2name = {1: "person"}

# Keep a detection only if it is confident enough AND its class id is known.
# The same mask is applied to labels and boxes so that box_label[i] always
# describes filtered_boxes[i]; filtering only the labels (as before) left more
# boxes than labels whenever another class was predicted.
known_mask = torch.tensor(
    [int(item) in id2name for item in labels],
    dtype=torch.bool,
    device=labels.device,
)
keep_mask = select_mask & known_mask

filtered_labels = labels[keep_mask]
filtered_boxes = boxes[keep_mask]
box_label = [id2name[int(item)] for item in filtered_labels]
pred_dict = {
    'boxes': filtered_boxes,
    'size': targets['size'],
    'box_label': box_label
}

vslzr.visualize(image, pred_dict, savedir=None)


**Model Evaluation: Ground Truth vs. Predicted Bounding Box Visualization (Epochs 12, 24, 36)-Analysis on Pre-trained model**

In [None]:
import os, torch, json
from main import build_model_main
from util.slconfig import SLConfig
from datasets import build_dataset
from util.visualizer import COCOVisualizer
from util import box_ops

# Common configuration and dataset loading.
# These assignments rebind the notebook-level `args`, `dataset_val`, `vslzr` and
# `id2name`; the functions defined below read them as globals.
model_config_path = "/content/DINO/config/DINO/DINO_4scale.py"
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
args.dataset_file = 'coco'
args.coco_path = "/content/DINO/COCODIR"
args.fix_size = False

# Build the validation dataset
dataset_val = build_dataset(image_set='val', args=args)

# COCO visualizer instance
vslzr = COCOVisualizer()

# Single class dictionary for "person"
id2name = {1: "person"}

# Function to visualize ground truth (actual) bounding boxes
def visualize_actual(image, targets):
    """Show one dataset sample with its ground-truth boxes.

    Args:
        image: The sample image handed to the visualizer unchanged.
        targets: Annotation dict of the sample; ``'labels'``, ``'boxes'``,
            ``'image_id'`` and ``'size'`` are read.

    Uses the notebook-level ``id2name`` (class id -> name) and ``vslzr``
    (visualizer). A label id missing from ``id2name`` raises ``KeyError``.
    """
    # One display name per ground-truth box, in annotation order.
    label_names = []
    for class_id in targets['labels']:
        label_names.append(id2name[int(class_id)])

    ground_truth = {
        'boxes': targets['boxes'],
        'image_id': targets['image_id'],
        'size': targets['size'],
        'box_label': label_names,
    }

    print("Visualizing Actual Bounding Boxes")
    # savedir=None: display only, nothing is written to disk.
    vslzr.visualize(image, ground_truth, savedir=None)

# Function to visualize model predictions (predicted)
def visualize_predicted(image, output, targets, threshold=0.3):
    """Show the model's detections for one image.

    Args:
        image: The image handed to the visualizer unchanged.
        output: Post-processed prediction dict with tensor entries
            ``'scores'``, ``'labels'`` and ``'boxes'`` (one row per detection).
        targets: Annotation dict of the sample; only ``'size'`` is read.
        threshold: A detection is drawn only if its score is strictly greater.

    Only detections whose class id is a key of the notebook-level ``id2name``
    are drawn. Uses the notebook-level ``box_ops`` and ``vslzr``.
    """
    # Model Prediction Visualization
    scores = output['scores']
    labels = output['labels']
    boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])

    # One combined mask for labels AND boxes keeps them index-aligned.
    # Previously only the labels were filtered by class, so a confident
    # detection of any other class left more boxes than labels.
    known_mask = torch.tensor(
        [int(item) in id2name for item in labels],
        dtype=torch.bool,
        device=labels.device,
    )
    keep_mask = (scores > threshold) & known_mask
    filtered_labels = labels[keep_mask]
    filtered_boxes = boxes[keep_mask]

    # Create a dictionary for the predicted boxes
    pred_box_label = [id2name[int(item)] for item in filtered_labels]
    pred_dict = {
        'boxes': filtered_boxes,
        'size': targets['size'],
        'box_label': pred_box_label
    }

    # Visualize the predicted image with predicted boxes
    print("Visualizing Predicted Bounding Boxes")
    vslzr.visualize(image, pred_dict, savedir=None)

# Prepare function to load a model and predict results for specific images (image 0 and image 7)
def load_model_and_predict(checkpoint_path, image_indices=(0, 7)):
    """Load one checkpoint and show ground truth vs. predictions per image.

    Args:
        checkpoint_path: Checkpoint file whose ``'model'`` entry is loaded
            into a freshly built model.
        image_indices: Indices into the notebook-level ``dataset_val`` to
            visualize. The default is an immutable tuple so it cannot be
            modified between calls.

    Uses the notebook-level ``args`` and ``dataset_val``. Requires a CUDA
    device.
    """
    # Load model and checkpoint
    model, criterion, postprocessors = build_model_main(args)
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    # Move to the GPU and switch to evaluation mode once, before the loop.
    model = model.cuda().eval()

    # Get predictions and visualize for the specified images
    for i in image_indices:
        image, targets = dataset_val[i]

        # Visualize the actual (ground truth) bounding boxes
        visualize_actual(image, targets)

        # Get model predictions. no_grad: inference only, so autograd does
        # not need to record the forward pass.
        with torch.no_grad():
            output = model(image[None].cuda())
            output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

        # Visualize the predicted bounding boxes
        visualize_predicted(image, output, targets)

# Run prediction and visualization for different checkpoints and images 0, 7
# (the default image indices of load_model_and_predict as defined in this cell)
print("Running for 12 epoch checkpoint:")
load_model_and_predict("/content/drive/MyDrive/checkpoint0011_4scale.pth")

print("Running for 24 epoch checkpoint:")
load_model_and_predict("/content/drive/MyDrive/checkpoint0011_4scale24.pth")

print("Running for 36 epoch checkpoint:")
load_model_and_predict("/content/drive/MyDrive/checkpoint0011_4scale36.pth")


**The images above illustrate the performance of the pretrained models at various epoch checkpoints. Given the simplicity and lower crowd density of the scenes, the model successfully detects pedestrians**



---



**Instances where the model failed to accurately detect objects occurred in more complex images, often containing multiple obstacles. In these cases, the model misclassified or missed objects due to the increased scene complexity**

In [None]:
import os
import torch
from main import build_model_main
from util.slconfig import SLConfig
from datasets import build_dataset
from util.visualizer import COCOVisualizer
from util import box_ops

# Common configuration and dataset loading.
# Same setup as in the previous comparison cell; it rebinds the notebook-level
# `args`, `dataset_val`, `vslzr` and `id2name` that the functions below read.
model_config_path = "/content/DINO/config/DINO/DINO_4scale.py"
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
args.dataset_file = 'coco'
args.coco_path = "/content/DINO/COCODIR"
args.fix_size = False

# Build the validation dataset
dataset_val = build_dataset(image_set='val', args=args)

# COCO visualizer instance
vslzr = COCOVisualizer()

# Single class dictionary for "person"
id2name = {1: "person"}

# Function to visualize ground truth (actual) bounding boxes
def visualize_actual(image, targets, epoch):
    """Show one dataset sample with its ground-truth boxes and save the figure.

    This redefines the two-argument ``visualize_actual`` from the earlier cell.

    Args:
        image: The sample image handed to the visualizer unchanged.
        targets: Annotation dict of the sample; ``'labels'``, ``'boxes'``,
            ``'image_id'`` and ``'size'`` are read.
        epoch: Value inserted into the output location passed as ``savedir``.

    Uses the notebook-level ``id2name`` and ``vslzr``. A label id missing
    from ``id2name`` raises ``KeyError``.
    """
    # One display name per ground-truth box, in annotation order.
    label_names = []
    for class_id in targets['labels']:
        label_names.append(id2name[int(class_id)])

    ground_truth = {
        'boxes': targets['boxes'],
        'image_id': targets['image_id'],
        'size': targets['size'],
        'box_label': label_names,
    }

    print("Visualizing Actual Bounding Boxes")
    output_location = f'/content/results/epoch_{epoch}_actual.jpg'
    vslzr.visualize(image, ground_truth, savedir=output_location)

# Function to visualize model predictions (predicted)
def visualize_predicted(image, output, targets, epoch, threshold=0.3):
    """Show the model's detections for one image and save the figure.

    This redefines the ``visualize_predicted`` from the earlier cell.

    Args:
        image: The image handed to the visualizer unchanged.
        output: Post-processed prediction dict with tensor entries
            ``'scores'``, ``'labels'`` and ``'boxes'`` (one row per detection).
        targets: Annotation dict of the sample; ``'boxes'``, ``'size'`` and
            ``'image_id'`` are read.
        epoch: Value inserted into the output location passed as ``savedir``.
        threshold: A detection is kept only if its score is strictly greater.

    Only detections whose class id is a key of the notebook-level ``id2name``
    are drawn. When their number differs from the number of ground-truth
    boxes, every drawn box is labelled ``'Wrong Prediction'``.
    """
    scores = output['scores']
    labels = output['labels']
    boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])

    # One combined mask for labels AND boxes keeps them index-aligned.
    # Previously only the labels were filtered by class, so the label list
    # could be shorter than the box tensor.
    known_mask = torch.tensor(
        [int(item) in id2name for item in labels],
        dtype=torch.bool,
        device=labels.device,
    )
    keep_mask = (scores > threshold) & known_mask
    filtered_labels = labels[keep_mask]
    filtered_boxes = boxes[keep_mask]

    pred_box_label = [id2name[int(item)] for item in filtered_labels]
    pred_dict = {
        'boxes': filtered_boxes,
        # Carried along like in the ground-truth dict.
        # NOTE(review): the visualizer presumably needs 'image_id' to name the
        # saved file when savedir is set — confirm against COCOVisualizer.
        'image_id': targets['image_id'],
        'size': targets['size'],
        'box_label': pred_box_label
    }

    # Plain comparison instead of assert/except: asserts are stripped when
    # Python runs with -O, which would silently disable this check.
    if len(pred_box_label) != targets['boxes'].shape[0]:
        print("Wrong Prediction detected: the number of predicted boxes does not match ground truth.")
        pred_dict['box_label'] = ['Wrong Prediction'] * len(filtered_boxes)

    print("Visualizing Predicted Bounding Boxes")
    vslzr.visualize(image, pred_dict, savedir=f'/content/results/epoch_{epoch}_predicted.jpg')

# Function to load a model and predict results for specific images
def load_model_and_predict(checkpoint_path, image_indices=(6,), epoch=None):
    """Load one checkpoint and save ground-truth / prediction figures per image.

    This redefines the ``load_model_and_predict`` from the earlier cell.

    Args:
        checkpoint_path: Checkpoint file whose ``'model'`` entry is loaded
            into a freshly built model.
        image_indices: Indices into the notebook-level ``dataset_val``. The
            default is an immutable tuple so it cannot be modified between
            calls.
        epoch: Passed through to the visualization helpers, which insert it
            into the output location.

    Any exception is caught so that the remaining notebook cells can still
    run; the message and the full traceback are printed. Requires a CUDA
    device.
    """
    import traceback  # local import: only needed on the error path

    try:
        model, criterion, postprocessors = build_model_main(args)
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        # Move to the GPU and switch to evaluation mode once, before the loop.
        model = model.cuda().eval()

        for i in image_indices:
            image, targets = dataset_val[i]

            # Visualize the actual (ground truth) bounding boxes
            visualize_actual(image, targets, epoch)

            # Get model predictions. no_grad: inference only, so autograd
            # does not need to record the forward pass.
            with torch.no_grad():
                output = model(image[None].cuda())
                output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

            # Visualize the predicted bounding boxes
            visualize_predicted(image, output, targets, epoch)
    except Exception as e:
        print(f"Error during loading or prediction for checkpoint {checkpoint_path}: {e}")
        # The one-line message alone does not show where the failure happened.
        traceback.print_exc()

# Create the directory used in the savedir paths of the visualization helpers above, if it doesn't exist
os.makedirs('/content/results', exist_ok=True)


data_aug_params: {
  "scales": [
    480,
    512,
    544,
    576,
    608,
    640,
    672,
    704,
    736,
    768,
    800
  ],
  "max_size": 1333,
  "scales2_resize": [
    400,
    500,
    600
  ],
  "scales2_crop": [
    384,
    600
  ]
}
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


*Run the above code before running tests for different epoch checkpoints*

# **Load and Predict for 12th Epoch Checkpoint**

In [None]:
# Run prediction for 12th Epoch Checkpoint
print("Running for 12th Epoch Checkpoint:")
load_model_and_predict("/content/drive/MyDrive/checkpoint0011_4scale.pth", epoch=12)


# **Load and Predict for 24th Epoch Checkpoint**

In [None]:
# Run prediction for 24th Epoch Checkpoint
print("Running for 24th Epoch Checkpoint:")
load_model_and_predict("/content/drive/MyDrive/checkpoint0011_4scale24.pth", epoch=24)


# **Load and Predict for 36th Epoch Checkpoint**

In [None]:
# Run prediction for 36th Epoch Checkpoint.
# `epoch=36` ends up in the output locations under /content/results.
print("Running for 36th Epoch Checkpoint:")
load_model_and_predict("/content/drive/MyDrive/checkpoint0011_4scale36.pth", epoch=36)

Failure in accurate Object detection



1.  Despite different epoch checkpoints, the pre-trained model struggles with pedestrian detection.
2.   Over 80% of the dataset exhibits incorrect detections.
3. Complex scenes hinder accurate identification
4. Identifies extra classes





---





1.   **checkpoint.pth** : performs best on simple images
2.   **checkpoint12.pth, checkpoint24.pth, checkpoint36.pth**: perform best on complex images (varying lighting and intricate scenes), followed by the other checkpoints





# **Testing the model with no extra augmentation techniques**

In [None]:
# Return to the repository root; the relative config path used below is resolved from here.
%cd /content/DINO

/content/DINO


In [None]:
from PIL import Image
import datasets.transforms as T

In [None]:
# Load a single test image from disk and force three RGB channels.
image = Image.open("/content/DINO/16065.jpg").convert("RGB")

In [None]:
# Preprocessing pipeline: resize, convert to tensor, normalize per channel.
# RandomResize gets a single candidate size, so there is only one size to pick from.
transform = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# No annotation target is passed (None). This rebinds `image` from the loaded
# PIL image to the transformed result, so re-run the loading cell before
# running this cell a second time.
image, _ = transform(image, None)

In [None]:
# The config path is relative to the repository root (see the `%cd /content/DINO` cell above).
model_config_path = "config/DINO/DINO_4scale.py"
model_checkpoint_path = "/content/drive/MyDrive/checkpoint.pth"
# Alternative checkpoint location: /content/DINO/logs/DINO/R50-MS4/checkpoint.pth

In [None]:
# Build the model described by the config file, then load the checkpoint weights into it.
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
model, criterion, postprocessors = build_model_main(args)
# The checkpoint is loaded onto the CPU; the model is moved to the GPU in the inference cell.
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
# Switch to evaluation mode; assigning to `_` keeps the cell from echoing the model repr.
_ = model.eval()

  checkpoint = torch.load(model_checkpoint_path, map_location='cpu')


In [None]:
# Inference only: run under no_grad so autograd does not record the forward
# pass (no graph is kept alive, and the outputs do not require gradients).
with torch.no_grad():
    # image[None] adds a leading batch dimension of size 1.
    output = model.cuda()(image[None].cuda())
    output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

In [None]:
thershold = 0.3 # set a threshold

vslzr = COCOVisualizer()

scores = output['scores']
labels = output['labels']
boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])
select_mask = scores > thershold

# `id2name` is whatever an earlier cell left behind (the single-class mapping
# {1: "person"} when the notebook is run top to bottom). Fall back to the raw
# class id instead of raising KeyError for any other predicted class; this
# keeps exactly one label per selected box.
box_label = [id2name.get(int(item), str(int(item))) for item in labels[select_mask]]
pred_dict = {
    'boxes': boxes[select_mask],
    # Height and width of the transformed image tensor (dimensions 1 and 2).
    'size': torch.Tensor([image.shape[1], image.shape[2]]),
    'box_label': box_label
}
vslzr.visualize(image, pred_dict, savedir=None, dpi=100)

# **24 epoch checkpoint with no augmentation**

In [None]:
# Run the training and evaluation commands below from the repository root.
%cd /content/DINO

/content/DINO


In [None]:
!bash /content/DINO/scripts/DINO_train.sh /content/DINO/COCODIR \
--pretrain_model_path /content/drive/MyDrive/checkpoint0011_4scale24.pth \
--finetune_ignore label_enc.weight class_embed | tee /content/DINO/results/train_log.txt

In [None]:
# Evaluate (--eval) the checkpoint found under logs/DINO/R50-MS4 on the dataset in COCODIR.
# Paths without a leading slash are relative to the repository root.
!python main.py \
    --config_file config/DINO/DINO_4scale.py \
    --output_dir /content/DINO/results \
    --pretrain_model_path /content/DINO/logs/DINO/R50-MS4/checkpoint.pth \
    --coco_path /content/DINO/COCODIR \
    --eval \
    --options dn_scalar=100 embed_init_tgt=TRUE \
    dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
    dn_box_noise_scale=1.0

# **Models below are trained with extra data augmentations**

# **Fine-Tuning the Pre-Trained Model 12 epoch checkpoint**

In [None]:
# Run the training and evaluation commands below from the repository root.
%cd /content/DINO

/content/DINO


In [None]:
# Fine-tune starting from the pre-trained 12-epoch checkpoint.
# The log is written with `tee` (no append flag) to the same train_log.txt as the
# other training cells, so each training run overwrites the previous log.
!bash /content/DINO/scripts/DINO_train.sh /content/DINO/COCODIR \
--pretrain_model_path /content/drive/MyDrive/checkpoint0011_4scale.pth \
--finetune_ignore label_enc.weight class_embed | tee /content/DINO/results/train_log.txt

# **Re-Evaluating the results on the validation-set**

In [None]:
# Evaluate (--eval) the checkpoint stored on Drive as checkpoint12.pth on the dataset in COCODIR.
# NOTE(review): checkpoint12.pth is not written by any visible cell — presumably the
# fine-tuning result was copied to Drive by hand; confirm.
!python main.py \
    --config_file config/DINO/DINO_4scale.py \
    --output_dir /content/DINO/results \
    --pretrain_model_path /content/drive/MyDrive/checkpoint12.pth \
    --coco_path /content/DINO/COCODIR \
    --eval \
    --options dn_scalar=100 embed_init_tgt=TRUE \
    dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
    dn_box_noise_scale=1.0

# **Testing the fine-tuned model on custom images/ Validation images**

In [None]:
from PIL import Image
import datasets.transforms as T

In [None]:
# Load a single test image from disk and force three RGB channels.
image = Image.open("/content/DINO/16154.jpg").convert("RGB")

In [None]:
# Preprocessing pipeline: resize, convert to tensor, normalize per channel.
transform = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# No annotation target is passed (None). This rebinds `image` from the loaded
# PIL image to the transformed result, so re-run the loading cell before
# running this cell a second time.
image, _ = transform(image, None)

In [None]:
# The config path is relative to the repository root.
model_config_path = "config/DINO/DINO_4scale.py"
model_checkpoint_path = "/content/DINO/logs/DINO/R50-MS4/checkpoint.pth"
# Alternative checkpoint location: /content/drive/MyDrive/checkpoint.pth

In [None]:
# Build the model described by the config file, then load the checkpoint weights into it.
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
model, criterion, postprocessors = build_model_main(args)
# The checkpoint is loaded onto the CPU; the model is moved to the GPU in the inference cell.
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
# Switch to evaluation mode; assigning to `_` keeps the cell from echoing the model repr.
_ = model.eval()

  checkpoint = torch.load(model_checkpoint_path, map_location='cpu')


In [None]:
# Inference only: run under no_grad so autograd does not record the forward
# pass (no graph is kept alive, and the outputs do not require gradients).
with torch.no_grad():
    # image[None] adds a leading batch dimension of size 1.
    output = model.cuda()(image[None].cuda())
    output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

In [None]:
thershold = 0.3 # set a threshold

vslzr = COCOVisualizer()

scores = output['scores']
labels = output['labels']
boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])
select_mask = scores > thershold

# `id2name` is whatever an earlier cell left behind (the single-class mapping
# {1: "person"} when the notebook is run top to bottom). Fall back to the raw
# class id instead of raising KeyError for any other predicted class; this
# keeps exactly one label per selected box.
box_label = [id2name.get(int(item), str(int(item))) for item in labels[select_mask]]
pred_dict = {
    'boxes': boxes[select_mask],
    # Height and width of the transformed image tensor (dimensions 1 and 2).
    'size': torch.Tensor([image.shape[1], image.shape[2]]),
    'box_label': box_label
}
vslzr.visualize(image, pred_dict, savedir=None, dpi=100)

# **Fine-Tuning pre trained 4- Scale 24 epoch checkpoint**

In [None]:
# Run the training and evaluation commands below from the repository root.
%cd /content/DINO

/content/DINO


In [None]:
# Fine-tune starting from the pre-trained 24-epoch checkpoint.
# The log is written with `tee` (no append flag) to the same train_log.txt as the
# other training cells, so each training run overwrites the previous log.
!bash /content/DINO/scripts/DINO_train.sh /content/DINO/COCODIR \
--pretrain_model_path /content/drive/MyDrive/checkpoint0011_4scale24.pth \
--finetune_ignore label_enc.weight class_embed | tee /content/DINO/results/train_log.txt

**Re-Evaluating the results on the validation-set(24)**

In [None]:
# Evaluate (--eval) the checkpoint found under logs/DINO/R50-MS4 on the dataset in COCODIR.
# Paths without a leading slash are relative to the repository root.
!python main.py \
    --config_file config/DINO/DINO_4scale.py \
    --output_dir /content/DINO/results \
    --pretrain_model_path /content/DINO/logs/DINO/R50-MS4/checkpoint.pth \
    --coco_path /content/DINO/COCODIR \
    --eval \
    --options dn_scalar=100 embed_init_tgt=TRUE \
    dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
    dn_box_noise_scale=1.0

# **Testing the fine-tuned model on custom images/ Validation images for 24 epoch ckpt**

In [None]:
from PIL import Image
import datasets.transforms as T

In [None]:
# Load a single test image from disk and force three RGB channels.
image = Image.open("/content/DINO/9260.jpg").convert("RGB")

In [None]:
# Preprocessing pipeline: resize, convert to tensor, normalize per channel.
transform = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# No annotation target is passed (None). This rebinds `image` from the loaded
# PIL image to the transformed result, so re-run the loading cell before
# running this cell a second time.
image, _ = transform(image, None)

In [None]:
# The config path is relative to the repository root.
model_config_path = "config/DINO/DINO_4scale.py"
# NOTE(review): this section is titled as the 24-epoch test, but the path below is
# checkpoint.pth — confirm this is the intended checkpoint.
model_checkpoint_path = "/content/drive/MyDrive/checkpoint.pth"
# Other checkpoint locations used in this notebook:
#   /content/drive/MyDrive/checkpoint.pth
#   /content/drive/MyDrive/checkpoint12.pth
#   /content/DINO/logs/DINO/R50-MS4/checkpoint.pth


In [None]:
# Build the model described by the config file, then load the checkpoint weights into it.
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
model, criterion, postprocessors = build_model_main(args)
# The checkpoint is loaded onto the CPU; the model is moved to the GPU in the inference cell.
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
# Switch to evaluation mode; assigning to `_` keeps the cell from echoing the model repr.
_ = model.eval()

In [None]:
# Inference only: run under no_grad so autograd does not record the forward
# pass (no graph is kept alive, and the outputs do not require gradients).
with torch.no_grad():
    # image[None] adds a leading batch dimension of size 1.
    output = model.cuda()(image[None].cuda())
    output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

In [None]:
thershold = 0.3 # set a threshold

vslzr = COCOVisualizer()

scores = output['scores']
labels = output['labels']
boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])
select_mask = scores > thershold

# `id2name` is whatever an earlier cell left behind (the single-class mapping
# {1: "person"} when the notebook is run top to bottom). Fall back to the raw
# class id instead of raising KeyError for any other predicted class; this
# keeps exactly one label per selected box.
box_label = [id2name.get(int(item), str(int(item))) for item in labels[select_mask]]
pred_dict = {
    'boxes': boxes[select_mask],
    # Height and width of the transformed image tensor (dimensions 1 and 2).
    'size': torch.Tensor([image.shape[1], image.shape[2]]),
    'box_label': box_label
}
vslzr.visualize(image, pred_dict, savedir=None, dpi=100)

# **Fine-Tuning pre trained 4- Scale 36 epoch checkpoint**

In [None]:
# Run the training and evaluation commands below from the repository root.
%cd /content/DINO

/content/DINO


In [None]:
# Fine-tune starting from the pre-trained 36-epoch checkpoint.
# The log is written with `tee` (no append flag) to the same train_log.txt as the
# other training cells, so each training run overwrites the previous log.
!bash /content/DINO/scripts/DINO_train.sh /content/DINO/COCODIR \
--pretrain_model_path /content/drive/MyDrive/checkpoint0011_4scale36.pth \
--finetune_ignore label_enc.weight class_embed | tee /content/DINO/results/train_log.txt


**Re-Evaluating the results on the validation-set**

In [None]:
# Evaluate (--eval) the checkpoint found under logs/DINO/R50-MS4 on the dataset in COCODIR.
# Paths without a leading slash are relative to the repository root.
!python main.py \
    --config_file config/DINO/DINO_4scale.py \
    --output_dir /content/DINO/results \
    --pretrain_model_path /content/DINO/logs/DINO/R50-MS4/checkpoint.pth \
    --coco_path /content/DINO/COCODIR \
    --eval \
    --options dn_scalar=100 embed_init_tgt=TRUE \
    dn_label_coef=1.0 dn_bbox_coef=1.0 use_ema=False \
    dn_box_noise_scale=1.0


# **Testing the fine-tuned model on custom images/ Validation images 36**

In [None]:
from PIL import Image
import datasets.transforms as T

In [None]:
# Load a single test image from disk and force three RGB channels.
image = Image.open("/content/DINO/17137.jpg").convert("RGB")

In [None]:
# Preprocessing pipeline: resize, convert to tensor, normalize per channel.
transform = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# No annotation target is passed (None). This rebinds `image` from the loaded
# PIL image to the transformed result, so re-run the loading cell before
# running this cell a second time.
image, _ = transform(image, None)

In [None]:
# The config path is relative to the repository root.
model_config_path = "config/DINO/DINO_4scale.py"
# NOTE(review): fused_model_checkpoint.pth is not produced by any cell visible up to
# this point — confirm where it comes from and that it matches this section's title.
model_checkpoint_path = "/content/DINO/results/fused_model_checkpoint.pth"
# Other checkpoint locations used in this notebook:
#   /content/drive/MyDrive/checkpoint.pth
#   /content/drive/MyDrive/checkpoint12.pth
#   /content/drive/MyDrive/checkpoint24.pth
#   /content/drive/MyDrive/checkpoint36.pth


In [None]:
# Build the model described by the config file, then load the checkpoint weights into it.
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
model, criterion, postprocessors = build_model_main(args)
# The checkpoint is loaded onto the CPU; the model is moved to the GPU in the inference cell.
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
# Switch to evaluation mode; assigning to `_` keeps the cell from echoing the model repr.
_ = model.eval()

  checkpoint = torch.load(model_checkpoint_path, map_location='cpu')


In [None]:
# Inference only: run under no_grad so autograd does not record the forward
# pass (no graph is kept alive, and the outputs do not require gradients).
with torch.no_grad():
    # image[None] adds a leading batch dimension of size 1.
    output = model.cuda()(image[None].cuda())
    output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

In [None]:
thershold = 0.3 # set a threshold

vslzr = COCOVisualizer()

scores = output['scores']
labels = output['labels']
boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])
select_mask = scores > thershold

# `id2name` is whatever an earlier cell left behind (the single-class mapping
# {1: "person"} when the notebook is run top to bottom). Fall back to the raw
# class id instead of raising KeyError for any other predicted class; this
# keeps exactly one label per selected box.
box_label = [id2name.get(int(item), str(int(item))) for item in labels[select_mask]]
pred_dict = {
    'boxes': boxes[select_mask],
    # Height and width of the transformed image tensor (dimensions 1 and 2).
    'size': torch.Tensor([image.shape[1], image.shape[2]]),
    'box_label': box_label
}
vslzr.visualize(image, pred_dict, savedir=None, dpi=100)

# **Visualizing each fine tuned model based on different epoch setting to compare**

In [None]:
from PIL import Image
import datasets.transforms as T


# Define the image transformation: resize, convert to tensor, normalize per channel
transform = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


# Load the image once; the same transformed image is fed to every checkpoint below
image = Image.open("/content/DINO/6458.jpg").convert("RGB")
image, _ = transform(image, None)


# Title printed above each visualization -> checkpoint file to load.
# Note that the titles and the file names differ (e.g. the "checkpoint24.pth"
# title loads checkpoint24norm.pth, while "checkpoint24withaug.pth" loads checkpoint24.pth).
checkpoints = {

    "checkpoint.pth": "/content/drive/MyDrive/checkpoint.pth",
    "checkpoint24.pth":"/content/drive/MyDrive/checkpoint24norm.pth",
    "checkpoint36.pth":"/content/drive/MyDrive/checkpoint36norm.pth",
    "checkpoint12withaug.pth": "/content/drive/MyDrive/checkpoint12.pth",
    "checkpoint24withaug.pth": "/content/drive/MyDrive/checkpoint24.pth",
    "checkpoint36withaug.pth": "/content/drive/MyDrive/checkpoint36.pth"
}


# One visualizer instance shared by all checkpoints
vslzr = COCOVisualizer()


# Loop-invariant settings, set once instead of on every iteration.
model_config_path = "config/DINO/DINO_4scale.py"
threshold = 0.3  # set a threshold

# For every checkpoint: build a fresh model, load the weights, run the same
# image through it and show the detections above the threshold.
for title, checkpoint_path in checkpoints.items():
    args = SLConfig.fromfile(model_config_path)
    args.device = 'cuda'
    model, criterion, postprocessors = build_model_main(args)
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    _ = model.eval()

    # Inference only: no_grad keeps autograd from recording the forward pass,
    # which matters here because six models are run back to back.
    with torch.no_grad():
        output = model.cuda()(image[None].cuda())
        output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]
    scores = output['scores']
    labels = output['labels']
    boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])
    select_mask = scores > threshold
    # `id2name` comes from an earlier cell; fall back to the raw class id
    # instead of raising KeyError for a class that is not in it, keeping
    # exactly one label per selected box.
    box_label = [id2name.get(int(item), str(int(item))) for item in labels[select_mask]]
    pred_dict = {
        'boxes': boxes[select_mask],
        'size': torch.Tensor([image.shape[1], image.shape[2]]),
        'box_label': box_label
    }

    # Visualize the results
    print(f"{title}:")
    vslzr.visualize(image, pred_dict, savedir=None, dpi=100)

*The average precision values were taken from the evaluation scores produced after the fine-tuning process.*

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data from the table.
# The three value lists are hand-entered and parallel to `models`:
# entry i of each list belongs to models[i].
# NOTE(review): presumably transcribed from the evaluation runs above — verify the
# numbers against the evaluation logs before relying on the chart.
models = [
    "Checkpoint 12 with Aug",
    "Checkpoint 36 with Aug",
    "Checkpoint 36 no Aug",
    "Checkpoint 24 no Aug",
    "Checkpoint 12 no Aug",
    "Checkpoint 24 with Aug"
]
medium_ap = [0.361, 0.342, 0.328, 0.321, 0.316, 0.312]
ap_50 = [0.694, 0.670, 0.645, 0.633, 0.610, 0.625]
ar = [0.610, 0.592, 0.578, 0.585, 0.540, 0.563]

# Set up the bar width and position: one x position per model,
# with the three metric bars placed side by side around it
bar_width = 0.25
x = np.arange(len(models))

# Create the bar chart
fig, ax = plt.subplots(figsize=(12, 6))
bars1 = ax.bar(x - bar_width, medium_ap, bar_width, label="Medium AP @ 0.50:0.95")
bars2 = ax.bar(x, ap_50, bar_width, label="AP @ 0.50")
bars3 = ax.bar(x + bar_width, ar, bar_width, label="AR")

# Add labels and title
ax.set_xlabel("Model Checkpoints")
ax.set_ylabel("Metrics")
ax.set_title("Model Performance on Medium AP, AP @ 0.50, and AR (Person Detection)")
ax.set_xticks(x)
ax.set_xticklabels(models, rotation=45, ha="right")
ax.legend()

# Display values on each bar for clarity
def add_values(bars):
    """Write each bar's height, with three decimals, just above the bar.

    Args:
        bars: Iterable of bar objects providing ``get_height()``, ``get_x()``
            and ``get_width()``.

    Annotates the notebook-level axes object ``ax``.
    """
    for rect in bars:
        value = rect.get_height()
        # Horizontal centre of the bar.
        center_x = rect.get_x() + rect.get_width() / 2
        ax.annotate(
            f"{value:.3f}",
            xy=(center_x, value),
            xytext=(0, 3),  # Offset the text a bit
            textcoords="offset points",
            ha="center",
            va="bottom",
        )

# Put a numeric label on every bar of the three metric groups, then render.
for bar_group in (bars1, bars2, bars3):
    add_values(bar_group)

plt.tight_layout()
plt.show()


# **Ensemble Prediction with visualization of the best predicted model-image**

In [None]:
from PIL import Image
import datasets.transforms as T
import torch
from torchvision.ops import nms

# Preprocessing applied to the input image before inference. The composed
# transform is called as transform(image, target) and returns a pair (see the
# call below).
# NOTE(review): the mean/std triples look like the usual ImageNet statistics and
# RandomResize([800], max_size=1333) presumably fixes the resize scale --
# confirm against datasets/transforms.py.
transform = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Load the test image as RGB and transform it. No target is supplied (None),
# so the second element of the returned pair is discarded.
image_path = "/content/DINO/6458.jpg"
image = Image.open(image_path).convert("RGB")
image, _ = transform(image, None)

# Checkpoints to compare: display label -> checkpoint file on Google Drive.
# The labels do not mirror the file names: labels without "withaug" point at
# checkpoint.pth / "*norm.pth" files, while the "withaug" labels point at the
# un-suffixed checkpointNN.pth files.
checkpoints = {

    "checkpoint.pth": "/content/drive/MyDrive/checkpoint.pth",
    "checkpoint24.pth":"/content/drive/MyDrive/checkpoint24norm.pth",
    "checkpoint36.pth":"/content/drive/MyDrive/checkpoint36norm.pth",
    "checkpoint12withaug.pth": "/content/drive/MyDrive/checkpoint12.pth",
    "checkpoint24withaug.pth": "/content/drive/MyDrive/checkpoint24.pth",
    "checkpoint36withaug.pth": "/content/drive/MyDrive/checkpoint36.pth"
}



# Label id -> display name. Only one class is mapped, so any other label id
# has no entry in this dict.
id2name = {1: "person"}  # Class ID for "person" is 1

# Running "best so far" state, updated by the checkpoint loop below:
# label of the best checkpoint, its unique-person count, and its predictions.
best_checkpoint = None
best_person_count = 0
best_pred_dict = None

def filter_duplicate_boxes(boxes, scores, iou_threshold=0.5):
    """Drop overlapping detections with non-maximum suppression.

    Args:
        boxes: detections to filter, passed to `nms` unchanged.
        scores: one confidence score per box.
        iou_threshold: overlap threshold handed to `nms` (default 0.5).

    Returns:
        (boxes, scores) restricted to the indices that `nms` keeps, in the
        order `nms` returns them.
    """
    keep = nms(boxes, scores, iou_threshold)
    deduped_boxes = boxes[keep]
    deduped_scores = scores[keep]
    return deduped_boxes, deduped_scores

# Score every checkpoint on the same image and remember the one that finds the
# largest number of distinct people (after confidence filtering and NMS).
threshold = 0.3  # minimum confidence for a detection to be counted

for title, checkpoint_path in checkpoints.items():
    # Build a fresh model and load this checkpoint's weights into it.
    model, criterion, postprocessors = build_model_main(args)
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    model.eval()  # Set to evaluation mode

    # Inference only: no_grad stops autograd from recording the forward pass,
    # which would otherwise hold GPU memory for every checkpoint evaluated.
    with torch.no_grad():
        output = model.cuda()(image[None].cuda())
        output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

    scores = output['scores']
    labels = output['labels']
    boxes = output['boxes']  # Use boxes directly for NMS (in xyxy format)

    # Keep only confident "person" (label id 1) detections.
    select_mask = (scores > threshold) & (labels == 1)
    selected_boxes = boxes[select_mask]
    selected_scores = scores[select_mask]

    # Collapse overlapping boxes so each person is counted once.
    unique_boxes, unique_scores = filter_duplicate_boxes(selected_boxes, selected_scores)
    detected_person_count = len(unique_boxes)

    print(f"Checkpoint: {title}, Detected People Count (unique): {detected_person_count}")

    # `best_checkpoint is None` makes the first checkpoint the provisional
    # winner even with zero detections, so the lookup
    # checkpoints[best_checkpoint] in the following cell cannot hit KeyError(None).
    if best_checkpoint is None or detected_person_count > best_person_count:
        best_person_count = detected_person_count
        best_checkpoint = title
        box_label = [id2name[1] for _ in range(detected_person_count)]
        best_pred_dict = {
            # Converted to cxcywh, like every other pred_dict in this notebook
            # that is handed to vslzr.visualize.
            'boxes': box_ops.box_xyxy_to_cxcywh(unique_boxes).cpu(),
            'size': torch.Tensor([image.shape[1], image.shape[2]]),
            'box_label': box_label
        }



In [None]:
# Reload the checkpoint selected by the ensemble cell and draw its detections
# on the image.
model_config_path = "config/DINO/DINO_4scale.py"
if best_checkpoint is None:
    # checkpoints[None] would otherwise fail with an uninformative KeyError.
    raise RuntimeError(
        "best_checkpoint is None: the ensemble cell selected no checkpoint, "
        "so there is nothing to visualize."
    )
model_checkpoint_path = checkpoints[best_checkpoint]  # Path of the best checkpoint

# Load the model with best checkpoint
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
model, criterion, postprocessors = build_model_main(args)
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
model.eval()

# Inference only: no_grad stops autograd from recording the forward pass.
with torch.no_grad():
    output = model.cuda()(image[None].cuda())
    output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

# Set threshold and initialize visualizer
threshold = 0.3
vslzr = COCOVisualizer()

# Keep detections above the confidence threshold; boxes are converted from
# xyxy to cxcywh before being passed to the visualizer.
scores = output['scores']
labels = output['labels']
boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])
select_mask = scores > threshold

# The mask does not filter by class, so a label id missing from id2name can
# reach this point; fall back to the numeric id instead of raising KeyError.
box_label = [id2name.get(int(item), str(int(item))) for item in labels[select_mask]]
pred_dict = {
    'boxes': boxes[select_mask],
    'size': torch.Tensor([image.shape[1], image.shape[2]]),
    'box_label': box_label
}

# Visualize the predictions
vslzr.visualize(image, pred_dict, savedir=None, dpi=100)


# **Ground truth image generator**

In [None]:
import json
import os
from PIL import Image, ImageDraw
from IPython.display import display

# Inputs: the COCO-style annotation JSON and the folder holding the raw images.
annotation_file = '/content/DINO/random_sample_mavi_2_gt.json'  # Adjust this path if necessary
image_dir = '/content/DINO/Pedestrian_dataset_for_internship_assignment'  # Path to the image dataset

with open(annotation_file, 'r') as f:
    coco_data = json.load(f)

# File name of the image whose ground-truth boxes should be drawn.
target_image_filename = '6962.jpg'  # Update this with the actual filename

# First entry of coco_data['images'] with a matching file_name, or None.
target_image = next(
    filter(lambda record: record['file_name'] == target_image_filename, coco_data['images']),
    None,
)

if not target_image:
    print(f"Image with filename '{target_image_filename}' not found in the annotations.")
else:
    image_id = target_image['id']
    image_path = os.path.join(image_dir, target_image_filename)

    # Open the picture and get a drawing handle that paints onto it in place.
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)

    # All annotation records that belong to this image.
    annotations = [ann for ann in coco_data['annotations'] if ann['image_id'] == image_id]

    for ann in annotations:
        # bbox is [x, y, width, height]; rectangle() needs two opposite corners.
        x0, y0, width, height = ann['bbox']
        draw.rectangle([x0, y0, x0 + width, y0 + height], outline="red", width=3)

    # Show the annotated image inline in the notebook.
    display(image)


# **Visualizing Attention maps**

In [None]:
from PIL import Image
import datasets.transforms as T

# Create a COCOVisualizer instance and bind it to `vslzr`.
# NOTE(review): COCOVisualizer is not imported in this cell, so it must already
# be in the kernel namespace (another cell imports it from util.visualizer).
vslzr = COCOVisualizer()


In [None]:
%cd /content/DINO

/content/DINO


*The cell below loads the model so that its architecture can be inspected.*

In [None]:
# Model configuration file and the fine-tuned checkpoint to inspect.
# NOTE(review): the config path is relative, so it presumably relies on the
# working directory having been set to /content/DINO by the `%cd` cell -- confirm.
model_config_path = "config/DINO/DINO_4scale.py"
model_checkpoint_path = "/content/drive/MyDrive/checkpoint36.pth"
# Read the configuration and request the CUDA device for model construction.
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'

# Build the model, read the checkpoint onto the CPU first (map_location='cpu'),
# then copy the weights stored under its 'model' key into the model.
model, criterion, postprocessors = build_model_main(args)
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
model.eval()  # Evaluation mode; as the cell's last expression its value (the model) is also displayed


In [None]:
import os
import random
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torch.nn.functional as F
from main import build_model_main
from util.slconfig import SLConfig
from datasets import build_dataset
from util.visualizer import COCOVisualizer
from util import box_ops
import datasets.transforms as T

# Config and paths
model_config_path = "config/DINO/DINO_4scale.py"
fine_tuned_checkpoint_path = "/content/drive/MyDrive/checkpoint12.pth"
image_dir = "/content/DINO/COCODIR/val2017"

# Choose the device once and use it both for building the model and for moving
# it. Previously args.device was hard-coded to 'cuda' while `device` could fall
# back to CPU, so the two could disagree on a machine without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
args = SLConfig.fromfile(model_config_path)
args.device = device.type  # 'cuda' or 'cpu', matching `device`

# Build the model, read the checkpoint onto the CPU first, load the weights
# stored under its 'model' key, then switch to eval mode on the chosen device.
model, criterion, postprocessors = build_model_main(args)
checkpoint = torch.load(fine_tuned_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
model.eval().to(device)

# Input preprocessing; called as transform(image, target) and returns a pair.
transform = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Filled by the forward hooks: module name -> list of captured numpy arrays.
attention_weights = {}
# Names of the decoder layers to hook; register_hooks compares them against
# the names yielded by model.named_modules().
decoder_layers = [f"transformer.decoder.layers.{i}" for i in range(6)]

def register_hooks(model):
    """Attach forward hooks to the `self_attn` submodule of selected layers.

    Every module of `model` whose name is listed in the notebook-level
    `decoder_layers` and which has a `self_attn` attribute gets a forward hook.
    On each forward pass the hook takes element 0 of that submodule's output,
    detaches it, moves it to the CPU, converts it to numpy and appends it to
    `attention_weights[<layer name>]`.

    NOTE(review): if `self_attn` is torch.nn.MultiheadAttention, element 0 of
    its output is the attention *output* and the weights are element 1 --
    confirm which one is intended here.

    Args:
        model: module exposing named_modules().
    """
    def _make_recorder(layer_name):
        # Bind the layer name per hook so each hook writes to its own key.
        def _record(mod, inp, out):
            attention_weights.setdefault(layer_name, []).append(out[0].detach().cpu().numpy())
        return _record

    for layer_name, layer in model.named_modules():
        if layer_name in decoder_layers and hasattr(layer, 'self_attn'):
            layer.self_attn.register_forward_hook(_make_recorder(layer_name))

register_hooks(model)

# Sample up to 5 images from the directory. Non-image entries are skipped and
# the sample size is capped, so a folder holding fewer than 5 images no longer
# makes random.sample raise ValueError.
image_candidates = sorted(
    entry for entry in os.listdir(image_dir)
    if entry.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))
)
image_files = random.sample(image_candidates, min(5, len(image_candidates)))
if not image_files:
    print(f"No image files found in {image_dir}")

for image_file in image_files:
    image_path = os.path.join(image_dir, image_file)

    # Keep the PIL image for display; the model receives the transformed tensor.
    image = Image.open(image_path).convert("RGB")
    transformed_image, _ = transform(image, None)

    # Forward pass; the registered hooks fill `attention_weights` as a side effect.
    with torch.no_grad():
        output = model(transformed_image[None].to(device))
        bbox_output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).to(device))[0]

    # Display original image
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.axis('off')
    plt.title(f"Original Image - {image_file}")
    plt.show()

    # One overlay per hooked decoder layer, in layer order.
    for layer in decoder_layers:
        if layer in attention_weights:
            # First array captured for this layer, indexed at [0, 0].
            # NOTE(review): what [0, 0] selects depends on the layout of the
            # captured array (the hook stores element 0 of self_attn's output);
            # confirm this really is a per-head attention row.
            attn_map = attention_weights[layer][0][0, 0]
            spatial_size = int(np.sqrt(attn_map.shape[-1]))
            if spatial_size * spatial_size == attn_map.shape[-1]:  # only perfect squares can be reshaped to a grid
                attn_map = attn_map.reshape(spatial_size, spatial_size)

                # Upsample to the original image size; PIL's size is
                # (width, height) while interpolate expects (height, width).
                attn_map = torch.tensor(attn_map).unsqueeze(0).unsqueeze(0)
                attn_map = F.interpolate(attn_map, size=(image.size[1], image.size[0]), mode='bilinear', align_corners=False)
                attn_map = attn_map.squeeze().numpy()

                # Draw the map semi-transparently on top of the image.
                plt.figure(figsize=(10, 10))
                plt.imshow(np.array(image) / 255.0, interpolation='none')
                plt.imshow(attn_map, cmap='viridis', alpha=0.6)
                plt.axis('off')
                plt.title(f"Attention Map - {layer} - {image_file}")
                plt.show()

    # Drop this image's captures so the next image starts from an empty dict.
    attention_weights.clear()


# **Output: ensemble bounding-box prediction with attention map**

**Run the cells below to visualize the bounding boxes and the attention map**

In [None]:
from PIL import Image
import datasets.transforms as T
import torch
from torchvision.ops import nms
import os
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F

from main import build_model_main
from util.slconfig import SLConfig
from util.visualizer import COCOVisualizer
from util import box_ops
from datasets import build_dataset


# Model configuration file (relative path) shared by every checkpoint below.
model_config_path = "config/DINO/DINO_4scale.py"


image_path = "/content/DINO/3276.jpg" # replace with the path of the image to analyze


# Checkpoints to compare: display label -> checkpoint file on Google Drive.
# The labels do not mirror the file names: "checkpoint36.pth" points at
# checkpoint36norm.pth, while the "withaug" labels point at the un-suffixed
# checkpointNN.pth files.
checkpoints = {
    "checkpoint.pth": "/content/drive/MyDrive/checkpoint.pth",
    "checkpoint36.pth": "/content/drive/MyDrive/checkpoint36norm.pth",
    "checkpoint12withaug.pth": "/content/drive/MyDrive/checkpoint12.pth",
    "checkpoint36withaug.pth": "/content/drive/MyDrive/checkpoint36.pth"
}


# Label id -> display name; only one class is mapped.
id2name = {1: "person"}


In [None]:

# Input preprocessing; called as transform(image, target) and returns a pair.
# NOTE(review): the mean/std triples look like the usual ImageNet statistics --
# confirm against the training pipeline.
transform = T.Compose([
    T.RandomResize([800], max_size=1333),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


# Load the image as RGB and transform it; no target is supplied (None), so the
# second element of the returned pair is discarded.
image = Image.open(image_path).convert("RGB")
image, _ = transform(image, None)


# Running "best so far" state, updated by the checkpoint loop below.
best_checkpoint = None
best_person_count = 0
best_pred_dict = None


def filter_duplicate_boxes(boxes, scores, iou_threshold=0.5):
    """Drop overlapping detections with non-maximum suppression.

    Args:
        boxes: detections to filter, passed to `nms` unchanged.
        scores: one confidence score per box.
        iou_threshold: overlap threshold handed to `nms` (default 0.5).

    Returns:
        (boxes, scores) restricted to the indices that `nms` keeps, in the
        order `nms` returns them.
    """
    keep = nms(boxes, scores, iou_threshold)
    deduped_boxes = boxes[keep]
    deduped_scores = scores[keep]
    return deduped_boxes, deduped_scores


# Score every checkpoint on the same image and remember the one that finds the
# largest number of distinct people (after confidence filtering and NMS).
threshold = 0.3  # minimum confidence for a detection to be counted

for title, checkpoint_path in checkpoints.items():
    # Build a fresh model from the shared config and load this checkpoint.
    args = SLConfig.fromfile(model_config_path)
    args.device = 'cuda'
    model, criterion, postprocessors = build_model_main(args)
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    model.eval().cuda()

    # Inference only: no_grad stops autograd from recording the forward pass,
    # which would otherwise hold GPU memory for every checkpoint evaluated.
    with torch.no_grad():
        output = model(image[None].cuda())
        output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

    scores = output['scores']
    labels = output['labels']
    boxes = output['boxes']  # kept as returned by the postprocessor for NMS

    # Keep only confident "person" (label id 1) detections.
    select_mask = (scores > threshold) & (labels == 1)
    selected_boxes = boxes[select_mask]
    selected_scores = scores[select_mask]

    # Collapse overlapping boxes so each person is counted once.
    unique_boxes, unique_scores = filter_duplicate_boxes(selected_boxes, selected_scores)
    detected_person_count = len(unique_boxes)

    print(f"Checkpoint: {title}, Detected People Count (unique): {detected_person_count}")

    # `best_checkpoint is None` makes the first checkpoint the provisional
    # winner even with zero detections, so the lookup after the loop cannot
    # hit KeyError(None) just because nobody was detected.
    if best_checkpoint is None or detected_person_count > best_person_count:
        best_person_count = detected_person_count
        best_checkpoint = title
        box_label = [id2name[1] for _ in range(detected_person_count)]
        best_pred_dict = {
            # Converted to cxcywh, like every other pred_dict in this notebook
            # that is handed to vslzr.visualize.
            'boxes': box_ops.box_xyxy_to_cxcywh(unique_boxes).cpu(),
            'size': torch.Tensor([image.shape[1], image.shape[2]]),
            'box_label': box_label
        }

if best_checkpoint is None:
    # Only reachable when the loop body never ran.
    raise RuntimeError("No checkpoints were evaluated; `checkpoints` is empty.")

print(f"Best performing checkpoint: {best_checkpoint} with detected people count: {best_person_count}")
model_checkpoint_path = checkpoints[best_checkpoint]


  checkpoint = torch.load(checkpoint_path, map_location='cpu')


Checkpoint: checkpoint.pth, Detected People Count (unique): 5
Checkpoint: checkpoint36.pth, Detected People Count (unique): 9
Checkpoint: checkpoint12withaug.pth, Detected People Count (unique): 8
Checkpoint: checkpoint36withaug.pth, Detected People Count (unique): 6
Best performing checkpoint: checkpoint36.pth with detected people count: 9


# **Visualizing bounding box detection**

In [None]:

# Reload the selected checkpoint (model_checkpoint_path) and draw its
# detections on the transformed image.
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
model, criterion, postprocessors = build_model_main(args)
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
model.eval().cuda()

# Inference only: no_grad stops autograd from recording the forward pass.
with torch.no_grad():
    output = model(image[None].cuda())
    output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

# Keep detections above the confidence threshold; boxes are converted from
# xyxy to cxcywh before being passed to the visualizer.
threshold = 0.3
scores = output['scores']
labels = output['labels']
boxes = box_ops.box_xyxy_to_cxcywh(output['boxes'])
select_mask = scores > threshold

# The mask does not filter by class, so a label id missing from id2name can
# reach this point; fall back to the numeric id instead of raising KeyError.
box_label = [id2name.get(int(item), str(int(item))) for item in labels[select_mask]]
pred_dict = {
    'boxes': boxes[select_mask],
    'size': torch.Tensor([image.shape[1], image.shape[2]]),
    'box_label': box_label
}


vslzr = COCOVisualizer()
vslzr.visualize(image, pred_dict, savedir=None, dpi=100)


# **Visualizing attention maps**

In [None]:
# Rebuild the model from the shared config and load the selected checkpoint
# (model_checkpoint_path), so the forward hooks are attached to a fresh model.
args = SLConfig.fromfile(model_config_path)
args.device = 'cuda'
model, criterion, postprocessors = build_model_main(args)
checkpoint = torch.load(model_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
model.eval().cuda()

# Filled by the forward hooks: module name -> list of captured numpy arrays.
attention_weights = {}
# Names of the decoder layers to hook; register_hooks compares them against
# the names yielded by model.named_modules().
decoder_layers = [f"transformer.decoder.layers.{i}" for i in range(6)]


def register_hooks(model):
    """Attach forward hooks to the `self_attn` submodule of selected layers.

    Every module of `model` whose name is listed in the notebook-level
    `decoder_layers` and which has a `self_attn` attribute gets a forward hook.
    On each forward pass the hook takes element 0 of that submodule's output,
    detaches it, moves it to the CPU, converts it to numpy and appends it to
    `attention_weights[<layer name>]`.

    NOTE(review): if `self_attn` is torch.nn.MultiheadAttention, element 0 of
    its output is the attention *output* and the weights are element 1 --
    confirm which one is intended here.

    Args:
        model: module exposing named_modules().
    """
    def _make_recorder(layer_name):
        # Bind the layer name per hook so each hook writes to its own key.
        def _record(mod, inp, out):
            attention_weights.setdefault(layer_name, []).append(out[0].detach().cpu().numpy())
        return _record

    for layer_name, layer in model.named_modules():
        if layer_name in decoder_layers and hasattr(layer, 'self_attn'):
            layer.self_attn.register_forward_hook(_make_recorder(layer_name))

register_hooks(model)

# Reload the untransformed image for display and build the model input from it.
image = Image.open(image_path).convert("RGB")
transformed_image, _ = transform(image, None)

# Forward pass; the registered hooks fill `attention_weights` as a side effect.
with torch.no_grad():
    output = model(transformed_image[None].cuda())
    bbox_output = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]

# Reference view of the input.
plt.figure(figsize=(8, 8))
plt.imshow(image)
plt.axis('off')
plt.title("Original Image")
plt.show()

# One panel per hooked decoder layer, laid out in a single row.
fig, axes = plt.subplots(1, 6, figsize=(20, 5))
for idx, layer in enumerate(decoder_layers):
    if layer not in attention_weights:
        continue

    # First array captured for this layer, indexed at [0, 0].
    attn_map = attention_weights[layer][0][0, 0]
    spatial_size = int(np.sqrt(attn_map.shape[-1]))
    if spatial_size * spatial_size != attn_map.shape[-1]:
        continue  # not a perfect square, cannot be laid out as a grid

    # Reshape to a grid and upsample to the image size; PIL's size is
    # (width, height) while interpolate expects (height, width).
    grid = torch.tensor(attn_map.reshape(spatial_size, spatial_size)).unsqueeze(0).unsqueeze(0)
    upsampled = F.interpolate(
        grid,
        size=(image.size[1], image.size[0]),
        mode='bilinear',
        align_corners=False,
    )
    attn_map = upsampled.squeeze().numpy()

    # Image and map are both drawn semi-transparently on the same panel.
    panel = axes[idx]
    panel.imshow(image, alpha=0.6)
    panel.imshow(attn_map, cmap='viridis', alpha=0.6)
    panel.axis('off')
    panel.set_title(f"Layer {idx}")

plt.show()
attention_weights.clear()
