In [2]:
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split, KFold

In [13]:
# Root folder of the YOLO dataset; the later cells read images/ and labels/ beneath it.
dataset_path = "D:\\TA_Lalu_Lintas\\TA-Lalu-Lintas\\dataset_yolo"

# Folder that receives the split output.
output_path = "D:\\TA_Lalu_Lintas\\TA-Lalu-Lintas\\split_dataset"

# Fraction of the images held out as the test set (20%).
test_ratio = 0.2

# Number of folds used for cross-validation.
k_folds = 4

# Random seed, so the splits are reproducible.
seed = 40

In [6]:
def create_dir(path):
    """Create the directory ``path`` (and any missing parents) if it does not exist.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permission denied).
    """
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the same directory in between.
    os.makedirs(path, exist_ok=True)

In [7]:
def copy_files(file_list, src_dir, dest_dir):
    """Copy each named file from ``src_dir`` into ``dest_dir`` under the same name.

    Args:
        file_list: Iterable of file names, joined onto both directories.
        src_dir: Directory the files are read from.
        dest_dir: Directory the files are written to.
    """
    for name in file_list:
        source = os.path.join(src_dir, name)
        target = os.path.join(dest_dir, name)
        shutil.copy(source, target)

In [14]:
# Collect every image file (.jpg/.png) found in the dataset's images/ folder.
# os.listdir returns names in arbitrary order, so sort them first; otherwise
# the seeded split below could differ between machines or file systems.
image_files = sorted(
    f for f in os.listdir(f"{dataset_path}/images") if f.endswith(('.jpg', '.png'))
)
image_files = np.array(image_files)

# Train/test split; random_state=seed makes the shuffle repeatable.
train_files, test_files = train_test_split(
    image_files,
    test_size=test_ratio,
    random_state=seed
)

# Output directories for the train and test subsets.
dirs_to_create = [
    f"{output_path}/train/images",
    f"{output_path}/train/labels",
    f"{output_path}/test/images",
    f"{output_path}/test/labels"
]

# Named out_dir rather than dir so the built-in dir() is not shadowed.
for out_dir in dirs_to_create:
    create_dir(out_dir)

# Derive each label file name by swapping only the final extension for .txt.
# (A chained str.replace would also rewrite '.jpg'/'.png' occurring earlier in
# the name and point at a label file that does not exist.)
# NOTE(review): presumably every image has a matching .txt in labels/ --
# copy_files is called for each derived name; confirm against the dataset.
train_labels = [os.path.splitext(f)[0] + '.txt' for f in train_files]
test_labels = [os.path.splitext(f)[0] + '.txt' for f in test_files]

# Copy the training images and their labels.
copy_files(train_files, f"{dataset_path}/images", f"{output_path}/train/images")
copy_files(train_labels, f"{dataset_path}/labels", f"{output_path}/train/labels")

# Copy the test images and their labels.
copy_files(test_files, f"{dataset_path}/images", f"{output_path}/test/images")
copy_files(test_labels, f"{dataset_path}/labels", f"{output_path}/test/labels")

In [15]:
# Shuffled K-fold splitter; random_state=seed makes the fold assignment repeatable.
kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed)

# Split the training data into K folds, each with its own train/val folders.
for fold, (train_idx, val_idx) in enumerate(kf.split(train_files)):
    # Fold folders are numbered from 1.
    fold_dir = f"{output_path}/cross_val/fold_{fold+1}"

    # Create the fold's directory layout.
    create_dir(f"{fold_dir}/train/images")
    create_dir(f"{fold_dir}/train/labels")
    create_dir(f"{fold_dir}/val/images")
    create_dir(f"{fold_dir}/val/labels")

    # Select this fold's train/val image names by index.
    train_fold_files = train_files[train_idx]
    val_fold_files = train_files[val_idx]

    # Derive label names by swapping only the final extension for .txt.
    # (A chained str.replace would also rewrite '.jpg'/'.png' occurring
    # earlier in the file name.)
    train_fold_labels = [os.path.splitext(f)[0] + '.txt' for f in train_fold_files]
    val_fold_labels = [os.path.splitext(f)[0] + '.txt' for f in val_fold_files]

    # Copy images and labels into the fold.
    copy_files(train_fold_files, f"{dataset_path}/images", f"{fold_dir}/train/images")
    copy_files(train_fold_labels, f"{dataset_path}/labels", f"{fold_dir}/train/labels")

    copy_files(val_fold_files, f"{dataset_path}/images", f"{fold_dir}/val/images")
    copy_files(val_fold_labels, f"{dataset_path}/labels", f"{fold_dir}/val/labels")

print("Split dataset selesai!")

Split dataset selesai!


In [20]:
import glob

# Folder whose entries should be counted.
folder_path = "D:\\TA_Lalu_Lintas\\TA-Lalu-Lintas\\split_dataset\\cross_val\\fold_1\\val\\labels"

# Every entry matched by the "*" pattern directly inside the folder is counted.
matched_entries = glob.glob(f"{folder_path}/*")
file_count = len(matched_entries)

print(f"Jumlah file dalam folder '{folder_path}': {file_count}")

Jumlah file dalam folder 'D:\TA_Lalu_Lintas\TA-Lalu-Lintas\split_dataset\cross_val\fold_1\val\labels': 202
