In [None]:
import json
import os
import shutil
import subprocess
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import yaml
from tqdm.notebook import tqdm

In [None]:
# Dataset config whose 'test' entry points at the split analysed in this notebook.
yaml_path = '/home/data/configs/pace_v3_orig.yaml'

with open(yaml_path, 'r') as config_file:
	full_yaml = yaml.safe_load(config_file)

# The test split is expected to hold sibling "images" and "labels" folders.
test_root = full_yaml['test']
test_images_dir = test_root + "/images"
test_labels_dir = test_root + "/labels"

# Gather every .jpg first and then every .png, in that order.
image_files = [
	found
	for pattern in ("*.jpg", "*.png")
	for found in Path(test_images_dir).glob(pattern)
]
print(f"Found {len(image_files)} test images")

In [None]:
# Analyze occlusion, object count, and brightness for each image.
# Every readable image that has a label file contributes one dict to `image_stats`
# with the keys 'image', 'num_objects', 'occlusion_pct' and 'brightness'.
image_stats = []
unreadable_images = []

for img_path in tqdm(image_files):
	# Check for the label file first so unlabeled images are never decoded.
	label_path = Path(test_labels_dir) / f"{img_path.stem}.txt"
	if not label_path.exists():
		continue

	# cv2.imread signals failure by returning None instead of raising.
	img = cv2.imread(str(img_path))
	if img is None:
		unreadable_images.append(img_path.name)
		continue

	# Parse YOLO format labels: class id followed by x_center, y_center, width, height
	boxes = []
	with open(label_path, 'r') as f:
		for line in f:
			parts = line.split()
			if len(parts) < 5:
				# Blank or truncated line: nothing to unpack.
				continue
			class_id, x_center, y_center, width, height = map(float, parts[:5])
			boxes.append([x_center, y_center, width, height])

	# Calculate occlusion metric: summed pairwise intersection area of the boxes
	# relative to the summed area of all boxes.
	total_overlap = 0
	total_area = 0

	for i, (x1, y1, w1, h1) in enumerate(boxes):
		total_area += w1 * h1

		# Only look at later boxes so each unordered pair is counted once.
		for x2, y2, w2, h2 in boxes[i+1:]:
			# Calculate intersection
			x_left = max(x1 - w1/2, x2 - w2/2)
			x_right = min(x1 + w1/2, x2 + w2/2)
			y_top = max(y1 - h1/2, y2 - h2/2)
			y_bottom = min(y1 + h1/2, y2 + h2/2)

			if x_right > x_left and y_bottom > y_top:
				total_overlap += (x_right - x_left) * (y_bottom - y_top)

	# Occlusion percentage. Because every pair is summed, heavily stacked boxes can
	# push this above 100 (e.g. four identical boxes give 150).
	occlusion_pct = (total_overlap / total_area * 100) if total_area > 0 else 0

	# Calculate brightness (mean of the grayscale image)
	gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	brightness = np.mean(gray)

	image_stats.append({
		'image': img_path.name,
		'num_objects': len(boxes),
		# The binning, plotting and dataset cells all read this key.
		'occlusion_pct': occlusion_pct,
		'brightness': brightness
	})

print(f"Processed {len(image_stats)} images with labels")
if unreadable_images:
	print(f"Skipped {len(unreadable_images)} images that could not be read: {unreadable_images[:10]}")

In [None]:
# max() below would otherwise fail with an opaque "empty sequence" error.
if not image_stats:
	raise ValueError("image_stats is empty: no labelled test images were processed")

### most images have ~0 occlusion, bin them together; bin the rest into 5 equal-ish bins
epsilon = 1e-5
occ_pcts = [x['occlusion_pct'] for x in image_stats if x['occlusion_pct'] > epsilon]
occ_pcts.sort()
# A slice step of 0 raises ValueError, so clamp it when fewer than 5 images are occluded.
quantile_step = max(1, len(occ_pcts) // 5)
# The occlusion metric sums pairwise overlaps and is not capped at 100, so widen
# the top edge when the data exceeds it.
occ_upper = max([100] + occ_pcts)
bin_bounds = [0] + occ_pcts[::quantile_step] + [occ_upper]

# Filter out occlusion bin bounds which don't have any images
valid_bin_bounds = [bin_bounds[0]]
for i in range(1, len(bin_bounds)-1):
	count = sum(1 for x in image_stats if bin_bounds[i] < x['occlusion_pct'] <= bin_bounds[i+1])
	if count > 0:
		valid_bin_bounds.append(bin_bounds[i])
# Always keep the closing upper edge; without it the most occluded images fall
# outside every bin.
valid_bin_bounds.append(bin_bounds[-1])

bin_bounds = valid_bin_bounds
print("Occlusion bin bounds:", bin_bounds)

# One object-count bin per count that actually occurs, in ascending order.
max_objects = max(x['num_objects'] for x in image_stats)
object_bins = sorted({x['num_objects'] for x in image_stats})
print("Number of objects:", object_bins)

# Create brightness bins (5 equal-ish bins)
brightness_values = [x['brightness'] for x in image_stats]
brightness_values.sort()
brightness_bin_bounds = [brightness_values[i*len(brightness_values)//5] for i in range(5)]
# The last bin is half-open in the binning cell, so push its edge past the maximum.
brightness_bin_bounds.append(max(brightness_values) + 1)
print("Brightness bin bounds:", brightness_bin_bounds)

In [None]:
# Define bins for occlusion percentage
occlusion_bins = bin_bounds
occlusion_labels = [f'{occlusion_bins[i]:.2f}-{occlusion_bins[i+1]:.2f}%' for i in range(len(occlusion_bins)-1)]

# Define bins for number of objects (discrete counts)
object_labels = [str(i) for i in object_bins]

# Define bins for brightness
brightness_labels = [f'{brightness_bin_bounds[i]:.1f}-{brightness_bin_bounds[i+1]:.1f}' for i in range(len(brightness_bin_bounds)-1)]

# Bin the data. Each key is always (re)assigned, so re-running this cell after the
# bounds change cannot leave a stale label behind; None means "matched no bin".
for img_stat in image_stats:
	occ_pct = img_stat['occlusion_pct']
	# Both edges are inclusive; a value sitting on a shared edge goes to the lower bin.
	img_stat['occlusion_bin'] = next(
		(
			occlusion_labels[i]
			for i in range(len(occlusion_bins)-1)
			if occlusion_bins[i] <= occ_pct <= occlusion_bins[i+1]
		),
		None,
	)

	### Object count binning
	img_stat['object_bin'] = str(img_stat['num_objects'])

	### Brightness binning (half-open bins: lower edge inclusive, upper exclusive)
	brightness = img_stat['brightness']
	img_stat['brightness_bin'] = next(
		(
			brightness_labels[i]
			for i in range(len(brightness_bin_bounds)-1)
			if brightness_bin_bounds[i] <= brightness < brightness_bin_bounds[i+1]
		),
		None,
	)

# Count distributions
occlusion_counts = {}
object_counts = {}
brightness_counts = {}
unbinned_occlusion = 0
unbinned_brightness = 0

for img_stat in image_stats:
	occ_bin = img_stat.get('occlusion_bin')
	obj_bin = img_stat.get('object_bin')
	bright_bin = img_stat.get('brightness_bin')

	# Keep None out of the count dicts: mixing None with str keys makes sorted() raise.
	if occ_bin is None:
		unbinned_occlusion += 1
	else:
		occlusion_counts[occ_bin] = occlusion_counts.get(occ_bin, 0) + 1
	object_counts[obj_bin] = object_counts.get(obj_bin, 0) + 1
	if bright_bin is None:
		unbinned_brightness += 1
	else:
		brightness_counts[bright_bin] = brightness_counts.get(bright_bin, 0) + 1

# Order bin names by their numeric bounds (the order of the label lists) instead of
# sorting the label strings, which would put e.g. "100.0-..." before "95.0-...".
# dict.fromkeys drops duplicate labels while keeping that order.
occ_bin_names = list(dict.fromkeys(label for label in occlusion_labels if label in occlusion_counts))
obj_bin_names = sorted(object_counts.keys(), key=int)
brightness_bin_names = list(dict.fromkeys(label for label in brightness_labels if label in brightness_counts))

print("Distribution by Occlusion:")
for bin_name in occ_bin_names:
	print(f"{bin_name}: {occlusion_counts[bin_name]}")

print("\nDistribution by Number of Objects:")
for bin_name in obj_bin_names:
	print(f"{bin_name}: {object_counts[bin_name]}")

print("\nDistribution by Brightness:")
for bin_name in brightness_bin_names:
	print(f"{bin_name}: {brightness_counts[bin_name]}")

if unbinned_occlusion or unbinned_brightness:
	print(f"\nWarning: {unbinned_occlusion} images matched no occlusion bin, {unbinned_brightness} matched no brightness bin")

In [None]:
# Visualize distributions
def _bin_lower_bound(label):
	"""Return the numeric lower edge of a 'low-high' bin label, used as a sort key."""
	return float(label.split('-')[0])


fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5))

# Occlusion distribution. Bars are ordered by the numeric lower edge of each bin;
# sorting the label strings directly would put "10.00-..." before "5.00-...".
occ_bins = sorted((b for b in occlusion_counts if b is not None), key=_bin_lower_bound)
occ_bar_heights = [occlusion_counts[b] for b in occ_bins]
ax1.bar(range(len(occ_bins)), occ_bar_heights, color='skyblue')
ax1.set_title('Distribution of Images by Occlusion Level')
ax1.set_xlabel('Occlusion Percentage')
ax1.set_ylabel('Number of Images')
ax1.set_xticks(range(len(occ_bins)))
ax1.set_xticklabels(occ_bins, rotation=45)

# Object count distribution
obj_bins = sorted(object_counts.keys(), key=int)
obj_bar_heights = [object_counts[b] for b in obj_bins]
ax2.bar(range(len(obj_bins)), obj_bar_heights, color='lightcoral')
ax2.set_title('Distribution of Images by Object Count')
ax2.set_xlabel('Number of Objects')
ax2.set_ylabel('Number of Images')
ax2.set_xticks(range(len(obj_bins)))
ax2.set_xticklabels(obj_bins)

# Brightness distribution, ordered numerically like the occlusion panel
bright_bins = sorted((b for b in brightness_counts if b is not None), key=_bin_lower_bound)
bright_bar_heights = [brightness_counts[b] for b in bright_bins]
ax3.bar(range(len(bright_bins)), bright_bar_heights, color='lightgreen')
ax3.set_title('Distribution of Images by Brightness')
ax3.set_xlabel('Brightness Level')
ax3.set_ylabel('Number of Images')
ax3.set_xticks(range(len(bright_bins)))
ax3.set_xticklabels(bright_bins, rotation=45)

plt.tight_layout()
plt.show()

# Show sample statistics (per-image raw values, not the per-bin counts plotted above)
occ_values = [x['occlusion_pct'] for x in image_stats]
obj_values = [x['num_objects'] for x in image_stats]
bright_values = [x['brightness'] for x in image_stats]

print("\nOcclusion Statistics:")
print(f"Mean: {np.mean(occ_values):.2f}%")
print(f"Median: {np.median(occ_values):.2f}%")
print(f"Max: {max(occ_values):.2f}%")

print("\nObject Count Statistics:")
print(f"Mean: {np.mean(obj_values):.2f}")
print(f"Median: {np.median(obj_values):.2f}")
print(f"Max: {max(obj_values)}")

print("\nBrightness Statistics:")
print(f"Mean: {np.mean(bright_values):.2f}")
print(f"Median: {np.median(bright_values):.2f}")
print(f"Max: {max(bright_values):.2f}")

# Create binned datasets

In [None]:
# Root directory under which one dataset folder per bin is written by the cells below.
bin_datasets_dir = Path("/home/data/bin_datasets/")
# parents=True so a missing parent directory is created instead of raising FileNotFoundError.
bin_datasets_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Create datasets for occlusion bins: one folder per bin holding copies of the
# bin's images and labels plus a dataset.yaml that points at that folder.
for bin_name in occ_bin_names:
	bin_images = [x['image'] for x in image_stats if x.get('occlusion_bin') == bin_name]

	# Create dataset directory; '%', '.' and '-' are rewritten to keep the folder name plain.
	safe_bin_name = bin_name.replace('%', 'pct').replace('.', '_').replace('-', '_to_')
	dataset_dir = bin_datasets_dir / f"occlusion_{safe_bin_name}"

	images_dir = dataset_dir / "images"
	labels_dir = dataset_dir / "labels"
	# parents=True also creates dataset_dir (and bin_datasets_dir) when they are missing.
	images_dir.mkdir(parents=True, exist_ok=True)
	labels_dir.mkdir(parents=True, exist_ok=True)

	# Copy images and labels. shutil.copy is used instead of a shell `cp` string so
	# file names containing quotes or spaces work and a failed copy raises.
	for img_name in bin_images:
		src_img = Path(test_images_dir) / img_name
		src_label = Path(test_labels_dir) / f"{Path(img_name).stem}.txt"

		if src_img.exists():
			shutil.copy(src_img, images_dir)
		if src_label.exists():
			shutil.copy(src_label, labels_dir)

	# Create YAML file; every split points at this bin's folder and the class
	# names are taken from the original config.
	yaml_content = {
		'train': str(dataset_dir),
		'val': str(dataset_dir),
		'test': str(dataset_dir),
		'names': full_yaml['names']
	}

	yaml_file = dataset_dir / "dataset.yaml"
	with open(yaml_file, 'w') as f:
		yaml.dump(yaml_content, f)

	print(f"Created occlusion bin {bin_name}: {len(bin_images)} images")

In [None]:
# Create datasets for object count bins: one folder per object count holding copies
# of the matching images and labels plus a dataset.yaml that points at that folder.
for bin_name in obj_bin_names:
	bin_images = [x['image'] for x in image_stats if x.get('object_bin') == bin_name]

	# Create dataset directory
	dataset_dir = bin_datasets_dir / f"objects_{bin_name}"

	images_dir = dataset_dir / "images"
	labels_dir = dataset_dir / "labels"
	# parents=True also creates dataset_dir (and bin_datasets_dir) when they are missing.
	images_dir.mkdir(parents=True, exist_ok=True)
	labels_dir.mkdir(parents=True, exist_ok=True)

	# Copy images and labels. shutil.copy is used instead of a shell `cp` string so
	# file names containing quotes or spaces work and a failed copy raises.
	for img_name in bin_images:
		src_img = Path(test_images_dir) / img_name
		src_label = Path(test_labels_dir) / f"{Path(img_name).stem}.txt"

		if src_img.exists():
			shutil.copy(src_img, images_dir)
		if src_label.exists():
			shutil.copy(src_label, labels_dir)

	# Create YAML file; every split points at this bin's folder and the class
	# names are taken from the original config.
	yaml_content = {
		'train': str(dataset_dir),
		'val': str(dataset_dir),
		'test': str(dataset_dir),
		'names': full_yaml['names']
	}

	yaml_file = dataset_dir / "dataset.yaml"
	with open(yaml_file, 'w') as f:
		yaml.dump(yaml_content, f)

	print(f"Created object count bin {bin_name}: {len(bin_images)} images")

In [None]:
# Create datasets for brightness bins: one folder per bin holding copies of the
# bin's images and labels plus a dataset.yaml that points at that folder.
for bin_name in brightness_bin_names:
	bin_images = [x['image'] for x in image_stats if x.get('brightness_bin') == bin_name]

	# Create dataset directory; '.' and '-' are rewritten to keep the folder name plain.
	safe_bin_name = bin_name.replace('.', '_').replace('-', '_to_')
	dataset_dir = bin_datasets_dir / f"brightness_{safe_bin_name}"

	images_dir = dataset_dir / "images"
	labels_dir = dataset_dir / "labels"
	# parents=True also creates dataset_dir (and bin_datasets_dir) when they are missing.
	images_dir.mkdir(parents=True, exist_ok=True)
	labels_dir.mkdir(parents=True, exist_ok=True)

	# Copy images and labels. shutil.copy is used instead of a shell `cp` string so
	# file names containing quotes or spaces work and a failed copy raises.
	for img_name in bin_images:
		src_img = Path(test_images_dir) / img_name
		src_label = Path(test_labels_dir) / f"{Path(img_name).stem}.txt"

		if src_img.exists():
			shutil.copy(src_img, images_dir)
		if src_label.exists():
			shutil.copy(src_label, labels_dir)

	# Create YAML file; every split points at this bin's folder and the class
	# names are taken from the original config.
	yaml_content = {
		'train': str(dataset_dir),
		'val': str(dataset_dir),
		'test': str(dataset_dir),
		'names': full_yaml['names']
	}

	yaml_file = dataset_dir / "dataset.yaml"
	with open(yaml_file, 'w') as f:
		yaml.dump(yaml_content, f)

	print(f"Created brightness bin {bin_name}: {len(bin_images)} images")