In [None]:
###==========================================
# Section ~ 0: Importing Libraries
###==========================================

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import os
from pathlib import Path
import shutil
import random
from tqdm import tqdm

In [3]:
from collections import Counter, defaultdict
from itertools import combinations

In [4]:
from PIL import Image
import yaml

In [None]:
###==========================================
# Section ~ 1: Define Paths
###==========================================

In [5]:
# Project root = parent of the current working directory.
# NOTE(review): assumes the kernel starts in a direct subfolder of the project — confirm.
working_dir = Path.cwd()
project_root = working_dir.parent
project_root

WindowsPath('C:/Users/ADITHYA/OneDrive/Kesari')

In [19]:
# --- Raw data and report figure locations ---
dataset_dir = project_root.joinpath('dataset')
raw_image_dir = dataset_dir.joinpath('raw/images')
raw_label_dir = dataset_dir.joinpath('raw/labels')
reports_figures_dir = project_root.joinpath('reports/figures/2_data preparation')

# --- Train split ---
train_dir = dataset_dir.joinpath('train')
train_raw_images_dir = train_dir.joinpath('raw_images')
train_raw_labels_dir = train_dir.joinpath('raw_labels')

# --- Validation split ---
val_dir = dataset_dir.joinpath('val')
val_images_dir = val_dir.joinpath('images')
val_labels_dir = val_dir.joinpath('labels')

# --- Test split ---
test_dir = dataset_dir.joinpath('test')
test_images_dir = test_dir.joinpath('images')
test_labels_dir = test_dir.joinpath('labels')

In [13]:
# One report sub-folder per entry below, created under reports_figures_dir.
subfolders = ["train", "valid", "test"]

# Missing parent folders are created; folders that already exist are left as-is.
for split_name in subfolders:
    split_report_dir = reports_figures_dir / split_name
    split_report_dir.mkdir(parents=True, exist_ok=True)

In [15]:
# Handles to the per-split report folders under reports_figures_dir.
train_report_dir, valid_report_dir, test_report_dir = (
    reports_figures_dir / split_name for split_name in ("train", "valid", "test")
)

In [None]:
###==========================================
# Section ~ 2: Create Directories
###==========================================

In [21]:
# Folders to create up front: split image/label folders plus the figures folder.
all_dirs = [
    train_raw_images_dir,
    train_raw_labels_dir,
    val_images_dir,
    val_labels_dir,
    test_images_dir,
    test_labels_dir,
    reports_figures_dir,
]

# parents=True builds missing ancestors; exist_ok=True makes re-running the cell harmless.
for directory in all_dirs:
    directory.mkdir(parents=True, exist_ok=True)

In [None]:
###==========================================
# Section ~ 3: Load Dataset
###==========================================

In [23]:
def load_class_names_from_yaml(yaml_path: Path):
    """Read the class names stored under the ``names`` key of a YAML file.

    Parameters
    ----------
    yaml_path : Path
        Location of the YAML file to parse.

    Returns
    -------
    The value stored under ``names``. An empty list is returned when the
    file is empty, the key is absent, or its value is null.

    Raises
    ------
    OSError
        If the file cannot be opened.
    yaml.YAMLError
        If the content is not valid YAML.
    """
    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) can
    # mis-decode non-ASCII class names.
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; fall back to an
        # empty mapping so the lookup below does not raise AttributeError.
        data = yaml.safe_load(f) or {}
    class_names = data.get("names")
    # A present-but-null `names:` entry would otherwise leak None to callers
    # that immediately take len() of the result.
    return [] if class_names is None else class_names

In [25]:
# Load the class names from the config file under the project root.
data_yaml_path = project_root.joinpath("config/data.yaml")
CLASS_NAMES = load_class_names_from_yaml(yaml_path=data_yaml_path)

In [27]:
# Class count, then the names themselves on the following line.
NUM_CLASSES = len(CLASS_NAMES)
print(f"Loaded {NUM_CLASSES} classes:", CLASS_NAMES, sep="\n")

Loaded 17 classes:
['tree 1-2', 'tree 10-11', 'tree 11-12', 'tree 12-13', 'tree 13-14', 'tree 14-15', 'tree 15-16', 'tree 16-17', 'tree 17-18', 'tree 18-19', 'tree 2-3', 'tree 3-4', 'tree 4-5', 'tree 5-6', 'tree 6-7', 'tree 7-8', 'tree 9-10']


In [29]:
# Regular files only: glob('*') also yields sub-directories, which would
# otherwise be counted as images / labels in the report.
image_files = sorted(p for p in raw_image_dir.glob('*') if p.is_file())
label_files = sorted(p for p in raw_label_dir.glob('*') if p.is_file())

# Pairing is done on the stem (filename without its extension), so files that
# differ only by extension collapse to a single name.
image_names = {img.stem for img in image_files}
label_names = {lbl.stem for lbl in label_files}

# Compare the two sets of names
labeled = sorted(image_names & label_names)       # image and label both present
unlabeled = sorted(image_names - label_names)     # image without a label file
extra_labels = sorted(label_names - image_names)  # label file without an image

# Totals used by the report (unique stems, not raw file counts)
total_images = len(image_names)
total_labels = len(label_names)

In [33]:
# Percentages are relative to the number of images. When no images were found
# (empty or missing raw folder) report 0.00% instead of raising ZeroDivisionError.
labeled_pct = (len(labeled) / total_images) * 100 if total_images else 0.0
unlabeled_pct = (len(unlabeled) / total_images) * 100 if total_images else 0.0

print("----- Labeling Report -----\n")
print(f"  -- Total images: {total_images}")
print(f"  -- Total label files: {total_labels}")
print(f"  -- Images with labels: {len(labeled)} ({labeled_pct:.2f}%)")
print(f"  -- Images without labels: {len(unlabeled)} ({unlabeled_pct:.2f}%)")
print(f"  -- Labels without corresponding images: {len(extra_labels)}")

----- Labeling Report -----

  -- Total images: 302
  -- Total label files: 301
  -- Images with labels: 301 (99.67%)
  -- Images without labels: 1 (0.33%)
  -- Labels without corresponding images: 0
