# Exploratory Data Analysis for YOLO Dataset

## Importing necessary libraries

In [47]:
import datetime
import json
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [48]:
# Path setup - Adjust the root directory accordingly
# The project root is taken to be two directory levels above the current
# working directory; the dataset and experiment folders hang off it.
cwd = os.getcwd()
project_path = os.path.join(cwd, "..", "..")
dataset_path = os.path.join(
    project_path, "data", "ships_v10i"
)  # Replace with your dataset folder
splits = ['train', 'valid', 'test']

# Every run gets its own output folder, named after the moment it started.
run_started_at = datetime.datetime.now()
date_str = run_started_at.strftime('%Y-%m-%d_%H-%M-%S')
experiments_path = os.path.join(project_path, "experiments", "eda", date_str)
os.makedirs(experiments_path, exist_ok=True)

# Dictionary to store all EDA results
eda_summary = {}

In [49]:
# Function to get all image and label paths from a given split
def get_image_label_paths(split, dataset_root=None, image_extensions=('.jpg', '.png')):
    """Collect the sorted image and label file paths of one dataset split.

    The split is expected to contain an ``images`` and a ``labels``
    sub-folder. File extensions are compared case-insensitively so that
    files such as ``photo.JPG`` are not silently dropped.

    Args:
        split: Name of the split sub-folder (e.g. ``'train'``).
        dataset_root: Folder that contains the split folders. When ``None``
            (the default) the module-level ``dataset_path`` is used.
        image_extensions: Extensions (including the dot) that identify an
            image file.

    Returns:
        A ``(image_files, label_files)`` tuple of lists holding the full
        paths, each sorted in ascending order. Label files are those ending
        in ``.txt``.

    Raises:
        FileNotFoundError: If the ``images`` or ``labels`` folder is missing.
    """
    root = dataset_path if dataset_root is None else dataset_root
    images_dir = os.path.join(root, split, "images")
    labels_dir = os.path.join(root, split, "labels")

    # str.endswith accepts a tuple, so one call covers every extension.
    image_suffixes = tuple(ext.lower() for ext in image_extensions)

    image_files = sorted(
        os.path.join(images_dir, name)
        for name in os.listdir(images_dir)
        if name.lower().endswith(image_suffixes)
    )
    label_files = sorted(
        os.path.join(labels_dir, name)
        for name in os.listdir(labels_dir)
        if name.lower().endswith('.txt')
    )
    return image_files, label_files

In [50]:
# Initialize dictionaries to store data
# split_data maps each split name to the image and label paths found for it.
split_data = {}
for split in splits:
    image_files, label_files = get_image_label_paths(split)
    split_data[split] = dict(images=image_files, labels=label_files)

In [51]:
# 1. Counting the number of images and labels per split
# A difference between the two counts of a split means some images and
# label files do not pair up one-to-one.
image_label_stats = {}
for split in splits:
    split_files = split_data[split]
    num_images = len(split_files['images'])
    num_labels = len(split_files['labels'])
    image_label_stats[split] = {'num_images': num_images, 'num_labels': num_labels}
eda_summary['image_label_stats'] = image_label_stats

In [None]:
# 2. Bounding box statistics
def parse_yolo_label(label_path):
    """Read a label file and return the numeric fields of every annotation.

    Each non-blank line is split on whitespace, its first token (the class
    ID) is dropped and the remaining tokens are converted to ``float``.
    Blank or whitespace-only lines are skipped so they do not show up as
    empty annotations.

    Args:
        label_path: Path of the ``.txt`` label file.

    Returns:
        A list with one list of floats per non-blank line, in file order.
        The caller in this file reads indices 2 and 3 of each entry as the
        normalized box width and height.

    Raises:
        ValueError: If a token after the class ID is not a valid float.
    """
    with open(label_path, 'r') as file:
        return [
            [float(token) for token in fields[1:]]  # Ignore class ID
            for fields in (line.split() for line in file)
            if fields  # an empty field list means the line was blank
        ]

def analyze_bounding_boxes(split):
    """Compute bounding-box count and size statistics for one split.

    Every image listed in ``split_data[split]['images']`` is matched to the
    label file that has the same base name (file name without extension),
    instead of relying on the two sorted lists lining up position by
    position. An image without a label file is counted as having zero
    boxes. Images that ``cv2.imread`` cannot read are skipped with a
    warning, mirroring the ``None`` check in ``analyze_image_sizes``.

    Args:
        split: Key into ``split_data`` naming the split to analyze.

    Returns:
        A dict with the keys ``mean_bboxes_per_image``,
        ``median_bboxes_per_image``, ``mean_bbox_size_pixels`` and
        ``median_bbox_size_pixels``. The two size entries are
        ``[width, height]`` lists in pixels. Values are NaN when there is
        nothing to average.
    """
    import warnings  # local import keeps this block self-contained

    image_paths = split_data[split]['images']

    # Index the label files by base name so each image finds its own label
    # even when the image and label lists differ in length or sort order.
    labels_by_stem = {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in split_data[split]['labels']
    }

    all_bboxes = []
    bboxes_per_image = []

    for img_path in tqdm(
        image_paths,
        desc=f"Processing BBoxes in {split}", total=len(image_paths)
    ):
        img = cv2.imread(img_path)
        if img is None:
            # Without the image dimensions the boxes cannot be scaled.
            warnings.warn(f"Skipping unreadable image: {img_path}")
            continue
        img_height, img_width = img.shape[:2]  # Get image dimensions

        stem = os.path.splitext(os.path.basename(img_path))[0]
        label_path = labels_by_stem.get(stem)
        bboxes = parse_yolo_label(label_path) if label_path is not None else []
        # Entries with fewer than four values have no width/height fields.
        bboxes = [bbox for bbox in bboxes if len(bbox) >= 4]

        # Scale bbox size to original pixel units
        corrected_bboxes = [
            (bbox[2] * img_width, bbox[3] * img_height) for bbox in bboxes
        ]  # width, height scaled
        bboxes_per_image.append(len(bboxes))
        all_bboxes.extend(corrected_bboxes)

    # Convert to DataFrame for easier calculations
    bboxes_df = pd.DataFrame(all_bboxes, columns=['width', 'height'])

    return {
        'mean_bboxes_per_image': np.mean(bboxes_per_image),
        'median_bboxes_per_image': np.median(bboxes_per_image),
        'mean_bbox_size_pixels': bboxes_df[['width', 'height']].mean().values.tolist(),
        'median_bbox_size_pixels': bboxes_df[['width', 'height']].median().values.tolist()
    }


# Run the bounding-box analysis once per split and record it in the summary.
bbox_stats = {}
for split_name in splits:
    bbox_stats[split_name] = analyze_bounding_boxes(split_name)
eda_summary['bbox_stats'] = bbox_stats

In [None]:
# 3. Image size statistics
def analyze_image_sizes(split):
    """Summarize the image resolutions of one split.

    Each path in ``split_data[split]['images']`` is loaded with
    ``cv2.imread``; paths for which it returns ``None`` are left out.

    Args:
        split: Key into ``split_data`` naming the split to analyze.

    Returns:
        A dict with ``mean_resolution`` and ``median_resolution``, each
        computed column-wise over the collected (height, width) pairs and
        converted with ``tolist()``.
    """
    shapes = []
    progress = tqdm(split_data[split]['images'], desc=f"Analyzing {split}")
    for path in progress:
        image = cv2.imread(path)
        if image is None:
            continue
        height_width = image.shape[:2]  # Height, Width
        shapes.append(height_width)

    size_array = np.array(shapes)
    mean_hw = np.mean(size_array, axis=0)
    median_hw = np.median(size_array, axis=0)
    return {
        'mean_resolution': mean_hw.tolist(),
        'median_resolution': median_hw.tolist(),
    }

# Run the image-size analysis once per split and record it in the summary.
image_stats = {}
for split_name in splits:
    image_stats[split_name] = analyze_image_sizes(split_name)
eda_summary['image_stats'] = image_stats

In [None]:
# 4. Save EDA summary to Markdown file
# The report is written into the experiment folder under a fixed file name.
summary_filename = 'eda_summary.md'
eda_summary_md_path = os.path.join(experiments_path, summary_filename)


def dict_to_markdown(eda_summary):
    """Render the EDA summary dictionary as a Markdown document.

    The document starts with a fixed level-1 title. Every top-level key
    becomes a level-2 heading (underscores replaced by spaces, title-cased).
    When its value is a dict, each entry becomes a level-3 heading
    (capitalized key) followed by one bullet per statistic; an entry whose
    value is not itself a dict is printed as plain text under its heading.
    A top-level value that is not a dict is printed as plain text.

    Args:
        eda_summary: Mapping of section name to either a dict of
            per-split statistics or a single value.

    Returns:
        The Markdown text as one string.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = ["# Exploratory Data Analysis Summary\n\n"]
    for section, stats in eda_summary.items():
        parts.append(f"## {section.replace('_', ' ').title()}\n\n")
        if not isinstance(stats, dict):
            parts.append(f"{stats}\n\n")
            continue
        for split, split_stats in stats.items():
            parts.append(f"### {split.capitalize()}\n\n")
            if isinstance(split_stats, dict):
                for key, value in split_stats.items():
                    # Build the label first: a replacement field that spans
                    # several lines is a syntax error before Python 3.12.
                    label = key.replace('_', ' ').title()
                    parts.append(f"- **{label}**: {value}\n")
            else:
                parts.append(f"{split_stats}\n")
            parts.append("\n")
    return "".join(parts)


# Render the collected statistics and write them to the summary file.
with open(eda_summary_md_path, 'w') as report_file:
    markdown_report = dict_to_markdown(eda_summary)
    report_file.write(markdown_report)

print(f"EDA summary saved to {eda_summary_md_path}")