In [49]:
from ultralytics import YOLO
import pandas as pd
import numpy as np
import os
import imagesize

In [50]:
class_names = {
    0:'Person',
    1:'Car',
    2:'Truck',
    3:'UAV',
    4:'Aircraft',
    5:'Ship'
}

In [51]:
def inspect_dataset(
        target_dataset_root,  # ../datasets/new_dataset
        target_dataset_slice,  # train,test,val
):
 info = []  # list of lists, each list corresponds to an instance [cls_id, x, y, w, h, img]

 target_labels_dir = os.path.join(target_dataset_root, 'labels', target_dataset_slice)

 # Iterate over all files in the original dataset labels folder
 for filename in os.listdir(target_labels_dir):
  if filename.endswith('.txt'):
   # Read file
   with open(os.path.join(target_labels_dir, filename), "r") as f:
    # Iterate over instances in image and get present class ids
    for line in f:
     line_data = []
     # label data
     line_data = line.split()
     # Image name
     line_data.append(os.path.splitext(filename)[0])
     # Image size: could be done at image level and not row level
     img_path = os.path.join(target_dataset_root, 'images', target_dataset_slice,
                             os.path.splitext(filename)[0] + '.jpg')
     img_w, img_h = imagesize.get(img_path)
     line_data.extend([img_w, img_h])
     line_data.extend([img_path, os.path.join(target_labels_dir, filename)])
     # Append line data to info
     info.append(line_data)

 df = pd.DataFrame(info, columns=['new_class_id', 'xcn', 'ycn', 'wn', 'hn', 'img', 'img_w', 'img_h', 'image_path', 'label_path'])
 df = df.astype(
  {'new_class_id': 'int32', 'xcn': 'float32', 'ycn': 'float32', 'wn': 'float32', 'hn': 'float32', 'img': 'int64',
   'img_w': 'float32', 'img_h': 'float32', 'image_path': 'string', 'label_path': 'string'})
 df['class_name'] = df['new_class_id'].map(class_names)
 return df

In [52]:
df_val = inspect_dataset('/home/johnny/Projects/datasets/custom_dataset_v2/', 'val')

In [53]:
df_val

Unnamed: 0,new_class_id,xcn,ycn,wn,hn,img,img_w,img_h,image_path,label_path,class_name
0,0,0.430208,0.599219,0.572917,0.801562,11122,480.0,640.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person
1,3,0.912500,0.265278,0.028125,0.026852,7103,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,UAV
2,3,0.554688,0.314583,0.151562,0.165278,5292,1280.0,720.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,UAV
3,0,0.357776,0.605787,0.009604,0.016954,7649,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person
4,0,0.434849,0.627046,0.009969,0.039713,7649,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person
...,...,...,...,...,...,...,...,...,...,...,...
95016,2,0.480581,0.544305,0.041685,0.056822,1302,1624.0,1200.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Truck
95017,1,0.537520,0.543896,0.023863,0.023301,1302,1624.0,1200.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Car
95018,2,0.566215,0.542057,0.019634,0.023710,1302,1624.0,1200.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Truck
95019,1,0.519231,0.718456,0.067308,0.065913,8507,624.0,531.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Car


In [54]:
print(f"The number of objects is {len(df_val)}")
print(f"The number of images is {len(df_val['img'].unique())}")

The number of objects is 95021
The number of images is 13205


In [55]:
bin_edges = [0, 16**2, 32**2, 96**2, float('inf')]
bin_labels = ['Tiny', 'Small', 'Medium', 'Large']
df_val['bbox_area'] = (df_val['wn']*df_val['img_w'])*(df_val['hn']*df_val['img_h'])
df_val['bbox_size_category'] = pd.cut(df_val['bbox_area'], bins=bin_edges, labels=bin_labels, right=False)

In [56]:
df_val

Unnamed: 0,new_class_id,xcn,ycn,wn,hn,img,img_w,img_h,image_path,label_path,class_name,bbox_area,bbox_size_category
0,0,0.430208,0.599219,0.572917,0.801562,11122,480.0,640.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,141075.000000,Large
1,3,0.912500,0.265278,0.028125,0.026852,7103,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,UAV,1566.000000,Medium
2,3,0.554688,0.314583,0.151562,0.165278,5292,1280.0,720.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,UAV,23086.000000,Large
3,0,0.357776,0.605787,0.009604,0.016954,7649,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,337.636353,Small
4,0,0.434849,0.627046,0.009969,0.039713,7649,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,820.914551,Small
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95016,2,0.480581,0.544305,0.041685,0.056822,1302,1624.0,1200.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Truck,4615.924316,Medium
95017,1,0.537520,0.543896,0.023863,0.023301,1302,1624.0,1200.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Car,1083.594849,Medium
95018,2,0.566215,0.542057,0.019634,0.023710,1302,1624.0,1200.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Truck,907.201843,Small
95019,1,0.519231,0.718456,0.067308,0.065913,8507,624.0,531.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Car,1470.000122,Medium


In [57]:
df_val['bbox_size_category'].value_counts()

bbox_size_category
Medium    35772
Small     21698
Large     20803
Tiny      16748
Name: count, dtype: int64

# 1) Filter by Image Size (≥ 1920x1080)

In [58]:
df_size = df_val[(df_val['img_w']>= 1920)&(df_val['img_h']>= 1080)]

In [59]:
df_size

Unnamed: 0,new_class_id,xcn,ycn,wn,hn,img,img_w,img_h,image_path,label_path,class_name,bbox_area,bbox_size_category
1,3,0.912500,0.265278,0.028125,0.026852,7103,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,UAV,1566.000000,Medium
3,0,0.357776,0.605787,0.009604,0.016954,7649,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,337.636353,Small
4,0,0.434849,0.627046,0.009969,0.039713,7649,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,820.914551,Small
5,0,0.478359,0.629602,0.015969,0.039630,7649,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,1312.248047,Medium
6,0,0.496073,0.630537,0.017589,0.045704,7649,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,1666.887085,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
94952,1,0.411180,0.435370,0.008818,0.011482,9527,1920.0,1080.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Car,209.932404,Tiny
94953,0,0.430362,0.376731,0.008640,0.017351,9863,3840.0,2160.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,1243.405396,Medium
94954,0,0.727029,0.421764,0.009267,0.010194,9863,3840.0,2160.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,783.578552,Small
94955,0,0.740905,0.426007,0.006809,0.006644,9863,3840.0,2160.0,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...,Person,375.276093,Small


In [60]:
print(f"The number of objects is {len(df_size)}")
print(f"The number of images is {len(df_size['img'].unique())}")

The number of objects is 23156
The number of images is 3154


In [61]:
df_size['bbox_size_category'].value_counts()

bbox_size_category
Small     7458
Medium    7212
Tiny      6449
Large     2037
Name: count, dtype: int64

In [62]:
df_size['class_name'].value_counts()

class_name
Person      15897
Car          4584
UAV          2206
Ship          321
Truck         147
Aircraft        1
Name: count, dtype: int64

# 2) Filter by maximum number of objects present (≤ 100)

In [63]:
object_counts = df_size['img'].value_counts()
selected_images = object_counts[object_counts <= 100].index
df_filtered = df_size[df_size['img'].isin(selected_images)]

In [64]:
df_filtered.to_csv('filtered_validation.csv', index=False)

In [65]:
print(f"The number of objects is {len(df_filtered)}")
print(f"The number of images is {len(df_filtered['img'].unique())}")

The number of objects is 19455
The number of images is 3133


In [66]:
df_filtered['bbox_size_category'].value_counts()

bbox_size_category
Medium    6689
Small     6556
Tiny      4190
Large     2020
Name: count, dtype: int64

In [None]:
import pandas as pd
import os
import shutil
import yaml

base_dir = '/home/johnny/Projects/datasets/Client_Validation_Set'

subfolders = ['train', 'val', 'test']
folders = ['images', 'labels']
for folder in folders:
    for subfolder in subfolders:
        os.makedirs(os.path.join(base_dir, folder, subfolder), exist_ok=True)

for index, row in df_filtered.iterrows():
    image_path = row['image_path']
    label_path = row['label_path']
    shutil.copy(image_path, os.path.join(base_dir, 'images', 'val'))
    shutil.copy(label_path, os.path.join(base_dir, 'labels', 'val'))
yaml_data = {
    'path': '../small-fast-detector/inference_tools/Evaluation/datasets/Client_Validation_Set',  # PUT HERE THE PATH
    'train': 'images/train',
    'val': 'images/val',
    'test': '',
    'names': {
        0: 'uav',
        1: 'airplane',
        2: 'boat',
        3: 'car',
        4: 'person',
        5: 'truck'
    }

}
with open(os.path.join(base_dir, 'data.yaml'), 'w') as file:
    yaml.dump(yaml_data, file, default_flow_style=False)

print("Dataset organized and YAML file created.")

# PART 2

In [68]:
import yaml
import platform
import pandas as pd
import glob
from PIL import Image
import brambox as bb
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

In [69]:
def check_os():
    os = platform.system()

    if os == 'Darwin':
        return "MacOS"
    elif os == 'Linux':
        return "Linux"
    else:
        return "Unknown OS"
    
operating_system = check_os()


if operating_system == "MacOS":
    root_path = "/Users/johnny/Projects/"
elif operating_system == "Linux":
    root_path = "/home/johnny/Projects/"

In [70]:
from brambox.io.parser.detection import CocoParser

# Load detections

det = bb.io.load(CocoParser, '../../small-fast-detector/runs/detect/val/predictions.json')
print('detections:')
det['image'] = det['image'].astype(str).str.lstrip('0').astype(int)
det['class_label'] = det['class_label'].astype(int)
label_mapping = {
    0: 'person',
    1: 'car',
    2: 'truck',
    3: 'uav',
    4: 'airplane',
    5: 'boat'
}

# Applying the mapping to the 'class_label' column
det['class_label'] = det['class_label'].map(label_mapping)

display(det.head())

detections:


Unnamed: 0,image,class_label,id,x_top_left,y_top_left,width,height,confidence
0,5009,airplane,,1249.232,504.652,1609.672,887.401,0.01988
1,5009,uav,,1134.438,506.22,1728.182,884.41,0.89394
2,5009,person,,1733.487,575.757,98.935,117.171,0.00298
3,5019,person,,3.357,46.26,927.604,2777.971,0.04554
4,5019,person,,6.387,1015.198,921.734,1812.106,0.01079


In [71]:
from brambox.io.parser.annotation import CocoParser
# Load annotations
anno = bb.io.load(CocoParser(add_image_dims=True), '../../small-fast-detector/inference_tools/Evaluation/datasets/Client_Validation_Set_3133/annotations/instances_val2017.json')
anno['image'] = anno['image'].astype(str).str.lstrip('0').astype(int)

print('annotations:')
display(anno.head())

# save dataframes
det.to_csv('../../small-fast-detector/runs/detect/val/detections.csv', index=False)

anno.to_csv('../../small-fast-detector/runs/detect/val/annotations.csv', index=False)

annotations:


Unnamed: 0,image,class_label,id,x_top_left,y_top_left,width,height,occluded,truncated,lost,difficult,ignore,image_width,image_height
0,10000,person,10389.0,3386.0,2147.0,23.0,13.0,0.0,0.0,False,False,False,3840,2160
1,10000,person,10390.0,2965.0,797.0,70.0,62.0,0.0,0.0,False,False,False,3840,2160
2,10000,person,10391.0,2153.0,843.0,32.0,28.0,0.0,0.0,False,False,False,3840,2160
3,10000,person,10392.0,889.0,1017.0,146.0,51.0,0.0,0.0,False,False,False,3840,2160
4,10000,boat,10393.0,454.0,241.0,104.0,33.0,0.0,0.0,False,False,False,3840,2160


In [72]:
from tqdm.notebook import tqdm


def calculate_area(row):
    return row['width'] * row['height']

def iou(box_a, box_b):
    xA = max(box_a[0], box_b[0])
    yA = max(box_a[1], box_b[1])
    xB = min(box_a[2], box_b[2])
    yB = min(box_a[3], box_b[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)

    boxAArea = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    boxBArea = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])

    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

def calculate_map(detected, actual, class_labels):
    aps = []
    if detected.empty or 'class_label' not in detected.columns:
        return 0

    for label in class_labels:
        if label not in detected['class_label'].values:
            aps.append(0)
            continue

        dc = detected[detected.class_label == label]
        ac = actual[actual.class_label == label]

        ap_coco = []
        for iou_threshold in range(50, 100, 5):
            if dc.empty:
                ap_coco.append(0)
                continue

            pr = bb.stat.pr(dc, ac, iou_threshold / 100, smooth=True)
            ap_coco.append(bb.stat.auc_interpolated(pr))

        aps.append(sum(ap_coco) / len(ap_coco))

    mAP_coco = sum(aps) / len(aps) if aps else 0
    return mAP_coco

def calculate_pr_curve(detected, actual, iou_threshold):
    """ Calcula la curva PR para un umbral de IoU específico. """
    matched_det = bb.stat.match_det(detected, actual, threshold=iou_threshold, 
                                    criteria=bb.stat.coordinates.iou, 
                                    ignore=bb.stat.IgnoreMethod.SINGLE)
    pr_curve = bb.stat.pr(matched_det, actual)
    return pr_curve

def calculate_recall_precision(tp, fn, fp):
    """ Calcula el recall y la precisión. """
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    return recall, precision

def calculate_ap(recalls, precisions):
    """ Calcula el Average Precision (AP) a partir de las curvas de recall y precision. """
    recalls = [0] + recalls + [1] 
    precisions = [0] + precisions + [0] 

    ap = np.sum((recalls[i] - recalls[i - 1]) * precisions[i] for i in range(1, len(recalls)))
    return ap

def calculate_metrics(detected, actual, class_labels, iou_threshold=0.5):
    precision = []
    recall = []

    for label in class_labels:
        tp_per_class = 0
        fp_per_class = 0
        fn_per_class = 0
        detected_class = detected[detected['class_label'] == label]
        actual_class = actual[actual['class_label'] == label]

        for _, det_row in detected_class.iterrows():
            box_det = [det_row['x_top_left'], det_row['y_top_left'],
                       det_row['x_top_left'] + det_row['width'], 
                       det_row['y_top_left'] + det_row['height']]

            best_iou = 0
            for _, act_row in actual_class.iterrows():
                box_act = [act_row['x_top_left'], act_row['y_top_left'],
                           act_row['x_top_left'] + act_row['width'], 
                           act_row['y_top_left'] + act_row['height']]
                current_iou = iou(box_det, box_act)
                best_iou = max(best_iou, current_iou)

            if best_iou >= iou_threshold:
                tp_per_class += 1
            else:
                fp_per_class += 1

        # Calcular FN
        fn_per_class = len(actual) - tp_per_class
    
        # Calcular recall y precisión
        recall_per_class, precision_per_class = calculate_recall_precision(tp_per_class, fn_per_class, fp_per_class)
        recall.append(recall_per_class)
        precision.append(precision_per_class)
    
    recall = np.mean(recall)
    precision = np.mean(precision)

    # Calcular AP por clase y luego calcular el promedio (mAP)
    mAP = calculate_map(detected, actual, class_labels)

    return recall, precision, mAP

image_stats = pd.DataFrame(columns=['name', 'width', 'height', 'num_of_gt_objects', 'lowest_area', 'biggest_area', 'num_of_predicted_objects', 'recall', 'precision', 'mAP'])

det_grouped = det.groupby('image', observed=True)
anno_grouped = anno.groupby('image', observed=True)

class_labels = anno['class_label'].unique().tolist()

total_images = set(anno['image'].unique().tolist() + det['image'].unique().tolist())
total_images = sorted(total_images)
for image_id in tqdm(total_images):
    width = height = num_of_gt_objects = lowest_area = biggest_area = num_of_predicted_objects = np.nan
    recall = precision = mAP = 0

    if image_id in anno_grouped.groups:
        image_data = anno_grouped.get_group(image_id).copy()
        width = image_data.iloc[0]['image_width']
        height = image_data.iloc[0]['image_height']
        num_of_gt_objects = len(image_data)
        image_data['area'] = image_data.apply(calculate_area, axis=1)
        lowest_area = image_data['area'].min() if not image_data['area'].empty else np.nan
        biggest_area = image_data['area'].max() if not image_data['area'].empty else np.nan

    if image_id in det_grouped.groups:
        det_data = det_grouped.get_group(image_id)
        num_of_predicted_objects = len(det_data)
        recall, precision, mAP = calculate_metrics(det_data, image_data, class_labels)

    new_row = {
        'name': image_id,
        'width': width,
        'height': height,
        'num_of_gt_objects': num_of_gt_objects,
        'lowest_area': lowest_area,
        'biggest_area': biggest_area,
        'num_of_predicted_objects': num_of_predicted_objects,
        'recall': recall,
        'precision': precision,
        'mAP': mAP,
        
    }
    image_stats = pd.concat([image_stats, pd.DataFrame([new_row])], ignore_index=True)


  0%|          | 0/3133 [00:00<?, ?it/s]

  image_stats = pd.concat([image_stats, pd.DataFrame([new_row])], ignore_index=True)


In [73]:
print(f"The number of images is {len(image_stats['name'].unique())}")

The number of images is 3133


In [74]:
grouped_df_filtered = df_filtered.groupby('img').agg(
    num_objects=('img', 'count'),
    image_path=('image_path', 'first'),  # Tomamos el primer 'image_path' para cada grupo
    label_path=('label_path', 'first')   # Tomamos el primer 'label_path' para cada grupo
).reset_index()

In [75]:
image_stats = pd.merge(image_stats, grouped_df_filtered[['img', 'image_path', 'label_path']], left_on='name', right_on='img', how='left')

In [76]:
image_stats.sort_values(by=['name'], ascending=True, inplace=True)
print(f"The number of images is {len(image_stats['name'].unique())}")

The number of images is 3133


In [77]:
image_stats.to_csv('../../small-fast-detector/runs/detect/val/image_stats.csv', index=False)

In [78]:
# check if there are images repeated
image_stats['name'].unique()
len(image_stats)

3133

In [126]:
df_filtered_2k = image_stats.sort_values(by=['recall'], ascending=False).head(2000)

In [121]:
df_filtered_2k

Unnamed: 0,name,width,height,num_of_gt_objects,lowest_area,biggest_area,num_of_predicted_objects,recall,precision,mAP,img,image_path,label_path
2749,9703,2249,2811,1,8880.0,8880.0,8,1.000000,0.125000,0.060692,9703,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...
72,5464,1920,1080,1,5828.0,5828.0,28,0.833333,0.037879,0.066044,5464,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...
678,6070,1920,1080,1,2448.0,2448.0,68,0.833333,0.030864,0.074300,6070,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...
603,5995,1920,1080,1,9225.0,9225.0,159,0.833333,0.005952,0.009493,5995,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...
2429,7911,3916,2842,1,23653.0,23653.0,75,0.833333,0.029762,0.045427,7911,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,6414,1920,1080,10,464.0,1632.0,138,0.300000,0.103383,0.169901,6414,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...
1598,6990,1920,1080,15,280.0,2079.0,117,0.300000,0.077887,0.156550,6990,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...
1231,6623,1920,1080,5,1050.0,6144.0,132,0.300000,0.043557,0.136234,6623,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...
427,5819,1920,1080,10,420.0,16960.0,60,0.300000,0.117665,0.287718,5819,/home/johnny/Projects/datasets/custom_dataset_...,/home/johnny/Projects/datasets/custom_dataset_...


In [114]:
import pandas as pd
import os
import shutil
import yaml
dataset = df_filtered_2k
print(f"The number of images is {len(dataset['name'].unique())}")
base_dir = '/home/johnny/Projects/datasets/Client_Validation_Set'
subfolders = ['train', 'val', 'test']
folders = ['images', 'labels']
for folder in folders:
    for subfolder in subfolders:
        os.makedirs(os.path.join(base_dir, folder, subfolder), exist_ok=True)

for index, row in dataset.iterrows():
    image_path = row['image_path']
    label_path = row['label_path']
    shutil.copy(image_path, os.path.join(base_dir, 'images', 'val'))
    shutil.copy(label_path, os.path.join(base_dir, 'labels', 'val'))
yaml_data = {
    'path': '../small-fast-detector/inference_tools/Evaluation/datasets/Client_Validation_Set',  # PUT HERE THE PATH
    'train': 'images/train',
    'val': 'images/val',
    'test': '',
    'names': {
        0: 'uav',
        1: 'airplane',
        2: 'boat',
        3: 'car',
        4: 'person',
        5: 'truck'
    }

}
with open(os.path.join(base_dir, 'data.yaml'), 'w') as file:
    yaml.dump(yaml_data, file, default_flow_style=False)

print("Dataset organized and YAML file created.")

The number of images is 2000
Dataset organized and YAML file created.


In [127]:
df_filtered[df_filtered['img'].isin(df_filtered_2k['img'])]['bbox_size_category'].value_counts()

bbox_size_category
Medium    6649
Small     6436
Tiny      4079
Large     2018
Name: count, dtype: int64

In [128]:
df_filtered[df_filtered['img'].isin(df_filtered_2k['img'])]['class_name'].value_counts()

class_name
Person      12127
Car          4494
UAV          2153
Ship          283
Truck         124
Aircraft        1
Name: count, dtype: int64

In [129]:
filtered_new = df_filtered[df_filtered['img'].isin(df_filtered_2k['img'])]
filtered_new[filtered_new['bbox_size_category'] == 'Tiny']['class_name'].value_counts()

class_name
Person    3831
UAV        164
Car         62
Ship        22
Name: count, dtype: int64