# 資料整理

In [23]:
from pathlib import Path

dataset_path = Path("./my_dataset/mangoV5")
labels = sorted(dataset_path.rglob("*labels/*.txt"))
print(labels)

[WindowsPath('my_dataset/mangoV5/labels/004e5e0a-02083.txt'), WindowsPath('my_dataset/mangoV5/labels/00d96cda-01371.txt'), WindowsPath('my_dataset/mangoV5/labels/02dea6b1-01887.txt'), WindowsPath('my_dataset/mangoV5/labels/03b5da5a-01080.txt'), WindowsPath('my_dataset/mangoV5/labels/04919bb7-01758.txt'), WindowsPath('my_dataset/mangoV5/labels/04b688e2-00989.txt'), WindowsPath('my_dataset/mangoV5/labels/04e011b0-01732.txt'), WindowsPath('my_dataset/mangoV5/labels/059fcd8e-01406.txt'), WindowsPath('my_dataset/mangoV5/labels/069acead-01617.txt'), WindowsPath('my_dataset/mangoV5/labels/0790454d-01661.txt'), WindowsPath('my_dataset/mangoV5/labels/08921ecd-00382.txt'), WindowsPath('my_dataset/mangoV5/labels/08a5c76b-00753.txt'), WindowsPath('my_dataset/mangoV5/labels/08e469ca-01536.txt'), WindowsPath('my_dataset/mangoV5/labels/0ac01643-00411.txt'), WindowsPath('my_dataset/mangoV5/labels/0add070f-01872.txt'), WindowsPath('my_dataset/mangoV5/labels/0c686987-00360.txt'), WindowsPath('my_dataset

In [24]:
import yaml
yaml_file = "config/dataset_for_cross_vaildation.yaml"

with open (yaml_file,encoding="utf-8") as y:
    classes = yaml.safe_load(y)["names"] #讀取yaml 的類別名稱
cls_idx = sorted(classes.keys())
print(cls_idx)

[0, 1, 2]


In [25]:
#初始化dataframe
import pandas as pd

index  = [label.stem for label in labels] #抓取檔案名稱
# print(index)
labels_df = pd.DataFrame([],columns=cls_idx,index=index)

In [26]:
#計算每個類別的標籤數量
from collections import Counter
for label in labels:
    lbl_counter = Counter()
    
    with open(label) as lf:
        lines = lf.readlines()
    for line in lines:
        #YOLO 格式的 label 類別在每一列第一個位置
        lbl_counter[int(line.split(" ",1)[0])] =+ 1
    labels_df.loc[label.stem] = lbl_counter
    
labels_df = labels_df.fillna(0.0) #把無效的值填0
print(labels_df)

                  0    1    2
004e5e0a-02083  1.0  1.0  0.0
00d96cda-01371  1.0  1.0  1.0
02dea6b1-01887  1.0  1.0  1.0
03b5da5a-01080  1.0  1.0  1.0
04919bb7-01758  1.0  0.0  1.0
...             ...  ...  ...
fde4ffd8-01788  1.0  0.0  1.0
fe135301-01563  1.0  1.0  1.0
fe5ad3e9-00689  1.0  1.0  1.0
ff1bb6ba-01054  1.0  1.0  1.0
ff378991-01314  1.0  1.0  1.0

[332 rows x 3 columns]


  labels_df = labels_df.fillna(0.0) #把無效的值填0


# 數據拆分

In [27]:
import random

from sklearn.model_selection import KFold

random.seed(0)  # for reproducibility
ksplit = 5
kf = KFold(n_splits=ksplit, shuffle=True, random_state=20)  # setting random_state for repeatable results

kfolds = list(kf.split(labels_df))
for kfold in kfolds:
    print(kfold)
    print(len(list(kfold)))
# print(kfolds)

(array([  1,   2,   3,   4,   5,   6,   9,  10,  11,  12,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  48,  49,  50,  51,  52,  54,  55,  56,  57,  58,  59,  60,  64,  65,  66,  67,  68,  69,  70,  71,  73,  74,
        75,  76,  77,  78,  79,  80,  82,  83,  85,  86,  88,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 103, 105, 106, 107, 109, 111, 113, 114, 116, 117, 118, 119, 120, 121, 123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
       150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 166, 167, 168, 169, 170, 171, 173, 176, 177, 178, 180, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 197, 200, 201, 202, 203, 206, 208, 211, 212, 213, 214, 215, 216, 218, 221, 222, 223, 224, 226, 227, 228, 229, 230,
       231, 232, 233, 234, 235, 236, 237, 238, 2

In [28]:
folds = [f"split_{n}" for n in range(1, ksplit + 1)]
folds_df = pd.DataFrame(index=index, columns=folds)

for i, (train, test) in enumerate(kfolds, start=1):
    folds_df[f"split_{i}"].loc[labels_df.iloc[train].index] = "train"
    folds_df[f"split_{i}"].loc[labels_df.iloc[test].index] = "test"
    
print(folds_df)

               split_1 split_2 split_3 split_4 split_5
004e5e0a-02083    test   train   train   train   train
00d96cda-01371   train   train    test   train   train
02dea6b1-01887   train   train    test   train   train
03b5da5a-01080   train   train   train   train    test
04919bb7-01758   train   train   train   train    test
...                ...     ...     ...     ...     ...
fde4ffd8-01788   train    test   train   train   train
fe135301-01563    test   train   train   train   train
fe5ad3e9-00689   train    test   train   train   train
ff1bb6ba-01054   train   train   train    test   train
ff378991-01314   train    test   train   train   train

[332 rows x 5 columns]


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  folds_df[f"split_{i}"].loc[labels_df.iloc[train].index] = "train"
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update

In [29]:
#計算分佈
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)

for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
    train_totals = labels_df.iloc[train_indices].sum()
    val_totals = labels_df.iloc[val_indices].sum()

    # 避免除以0
    ratio = val_totals / (train_totals + 1e-7)
    fold_lbl_distrb.loc[f"split_{n}"] = ratio

In [None]:
#創建資料夾跟 datasets
import datetime

supported_extensions = [".jpg", ".jpeg", ".png"]

# Initialize an empty list to store image file paths
images = []

# Loop through supported extensions and gather image files
for ext in supported_extensions:
    images.extend(sorted((dataset_path / "images").rglob(f"*{ext}")))

date = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=8))).strftime("%Y-%m-%d-%H.%M.%S")


# Create the necessary directories and dataset YAML files
child_path = f"{date}_{ksplit}-Fold_Cross-val"
save_path = Path(f"../datasets/{child_path}")
save_path.mkdir(parents=True, exist_ok=True)
ds_yamls = []

for split in folds_df.columns:
    # Create directories
    split_dir = save_path / split
    split_dir.mkdir(parents=True, exist_ok=True)
    (split_dir / "train" / "images").mkdir(parents=True, exist_ok=True)
    (split_dir / "train" / "labels").mkdir(parents=True, exist_ok=True)
    (split_dir / "test" / "images").mkdir(parents=True, exist_ok=True)
    (split_dir / "test" / "labels").mkdir(parents=True, exist_ok=True)

    # Create dataset YAML files
    dataset_yaml = split_dir / f"{split}_dataset.yaml"
    ds_yamls.append(dataset_yaml)

    with open(dataset_yaml, "w") as ds_y:
        yaml.safe_dump(
            {
                "path": f"{child_path}/{split}/train",
                "train": "train/autosplit_train.txt",
                "val": "train/autosplit_val.txt",
                # "test": "test",
                "names": classes,
            },
            ds_y,
        )

In [31]:
#將照片分類放入
import shutil

from tqdm import tqdm

for image, label in tqdm(zip(images, labels), total=len(images), desc="Copying files"):
    for split, k_split in folds_df.loc[image.stem].items():
        # Destination directory
        img_to_path = save_path / split / k_split / "images"
        lbl_to_path = save_path / split / k_split / "labels"

        # Copy image and label files to new directory (SamefileError if file already exists)
        shutil.copy(image, img_to_path / image.name)
        shutil.copy(label, lbl_to_path / label.name)

Copying files: 100%|██████████| 332/332 [00:03<00:00, 110.36it/s]


In [32]:
#autosplit val train
from ultralytics.data.split import autosplit
for i in range(1,ksplit+1):
    autosplit(f"{save_path}/split_{i}/train/images",weights=(0.8, 0.2, 0.0), annotated_only=True)


Autosplitting images from datasets\2025-08-20-17.41.05_5-Fold_Cross-val\split_1\train\images, using *.txt labeled images only


100%|██████████| 265/265 [00:00<00:00, 6299.97it/s]

Autosplitting images from datasets\2025-08-20-17.41.05_5-Fold_Cross-val\split_2\train\images, using *.txt labeled images only



100%|██████████| 265/265 [00:00<00:00, 6371.07it/s]

Autosplitting images from datasets\2025-08-20-17.41.05_5-Fold_Cross-val\split_3\train\images, using *.txt labeled images only



100%|██████████| 266/266 [00:00<00:00, 6811.47it/s]

Autosplitting images from datasets\2025-08-20-17.41.05_5-Fold_Cross-val\split_4\train\images, using *.txt labeled images only



100%|██████████| 266/266 [00:00<00:00, 7346.80it/s]

Autosplitting images from datasets\2025-08-20-17.41.05_5-Fold_Cross-val\split_5\train\images, using *.txt labeled images only



100%|██████████| 266/266 [00:00<00:00, 6323.88it/s]


In [33]:
#儲存紀錄
folds_df.to_csv(save_path / "kfold_datasplit.csv")
fold_lbl_distrb.to_csv(save_path / "kfold_label_distribution.csv")