In [3]:
from pathlib import Path

dataset_path = Path("./dataset/ivyqo_augment_dataset")  
labels = sorted(dataset_path.rglob("*labels/*.txt"))  # all data in 'labels'

In [4]:
labels

[PosixPath('dataset/ivyqo_augment_dataset/test/labels/1_20210810_123700_jpg.rf.b14665f77247b730a2072ac97d208b32.txt'),
 PosixPath('dataset/ivyqo_augment_dataset/test/labels/1_20211206_123725_jpg.rf.b448d9643aa2cdf94a967ac5c3033bd0.txt'),
 PosixPath('dataset/ivyqo_augment_dataset/test/labels/20240324_100644_jpg.rf.95d42e450587185ecc9a681152022dc6.txt'),
 PosixPath('dataset/ivyqo_augment_dataset/test/labels/20240324_101131_jpg.rf.fdd6a9420df596f8f8112fab3496668b.txt'),
 PosixPath('dataset/ivyqo_augment_dataset/test/labels/20240324_103222_jpg.rf.569fa4192ab32e3b3479efce78f85d9e.txt'),
 PosixPath('dataset/ivyqo_augment_dataset/test/labels/20240324_105626_jpg.rf.3066d48a65eaa1e1c245b5f25caac7bc.txt'),
 PosixPath('dataset/ivyqo_augment_dataset/test/labels/20240324_110129_jpg.rf.f4d73bf7f46aedea0e232c1c74d35557.txt'),
 PosixPath('dataset/ivyqo_augment_dataset/test/labels/20240324_110719_jpg.rf.3a7ae731064fb7e2cba61109a9e9e03c.txt'),
 PosixPath('dataset/ivyqo_augment_dataset/test/labels/202403

In [5]:
import yaml 

yaml_file = "dataset/ivyqo_augment_dataset/data.yaml"  # data YAML with data directories and names dictionary
with open(yaml_file, "r", encoding="utf8") as y:
    classes = yaml.safe_load(y)["names"]
cls_idx = sorted(classes.keys())

In [6]:
cls_idx

[0, 1, 2, 3]

In [7]:
import pandas as pd

indx = [l.stem for l in labels]  # uses base filename as ID (no extension)
labels_df = pd.DataFrame([], columns=cls_idx, index=indx)

In [8]:
from collections import Counter

for label in labels:
    lbl_counter = Counter()

    with open(label, "r") as lf:
        lines = lf.readlines()

    for l in lines:
        # classes for YOLO label uses integer at first position of each line
        lbl_counter[int(l.split(" ")[0])] += 1

    labels_df.loc[label.stem] = lbl_counter

labels_df = labels_df.fillna(0.0)  # replace `nan` values with `0.0`

  labels_df = labels_df.fillna(0.0)  # replace `nan` values with `0.0`


In [10]:
from sklearn.model_selection import KFold

ksplit = 8
kf = KFold(n_splits=ksplit, shuffle=True, random_state=20)  # setting random_state for repeatable results

kfolds = list(kf.split(labels_df))

In [11]:
folds = [f"split_{n}" for n in range(1, ksplit + 1)]
folds_df = pd.DataFrame(index=indx, columns=folds)

for idx, (train, val) in enumerate(kfolds, start=1):
    folds_df[f"split_{idx}"].loc[labels_df.iloc[train].index] = "train"
    folds_df[f"split_{idx}"].loc[labels_df.iloc[val].index] = "val"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  folds_df[f"split_{idx}"].loc[labels_df.iloc[train].index] = "train"
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to upda

In [12]:
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)

for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
    train_totals = labels_df.iloc[train_indices].sum()
    val_totals = labels_df.iloc[val_indices].sum()

    # To avoid division by zero, we add a small value (1E-7) to the denominator
    ratio = val_totals / (train_totals + 1e-7)
    fold_lbl_distrb.loc[f"split_{n}"] = ratio

In [17]:
import datetime

supported_extensions = [".jpg", ".jpeg", ".png"]

# Initialize an empty list to store image file paths
images = []

# Loop through supported extensions and gather image files
for ext in supported_extensions:
    images.extend(sorted((dataset_path).rglob(f"*{ext}")))

# Create the necessary directories and dataset YAML files (unchanged)
save_path = Path(dataset_path / f"{datetime.date.today().isoformat()}_{ksplit}-Fold_Cross-val")
save_path.mkdir(parents=True, exist_ok=True)
ds_yamls = []

for split in folds_df.columns:
    # Create directories
    split_dir = save_path / split
    split_dir.mkdir(parents=True, exist_ok=True)
    (split_dir / "train" / "images").mkdir(parents=True, exist_ok=True)
    (split_dir / "train" / "labels").mkdir(parents=True, exist_ok=True)
    (split_dir / "val" / "images").mkdir(parents=True, exist_ok=True)
    (split_dir / "val" / "labels").mkdir(parents=True, exist_ok=True)

    # Create dataset YAML files
    dataset_yaml = split_dir / f"{split}_dataset.yaml"
    ds_yamls.append(dataset_yaml)

    with open(dataset_yaml, "w") as ds_y:
        yaml.safe_dump(
            {
                "train": "../train/images",
                "val": "../val/images",
                "names": classes,
            },
            ds_y,
        )

In [18]:
import shutil

for image, label in zip(images, labels):
    for split, k_split in folds_df.loc[image.stem].items():
        # Destination directory
        img_to_path = save_path / split / k_split / "images"
        lbl_to_path = save_path / split / k_split / "labels"

        # Copy image and label files to new directory (SamefileError if file already exists)
        shutil.copy(image, img_to_path / image.name)
        shutil.copy(label, lbl_to_path / label.name)

In [17]:
folds_df.to_csv(save_path / "kfold_datasplit.csv")
fold_lbl_distrb.to_csv(save_path / "kfold_label_distribution.csv")