## Splitting Data Notebook

This notebook splits the raw labelled dataset (images and their matching label files) into training, validation and test sets.

In [1]:
import os
import random
import shutil

In [4]:
# Root of the raw dataset. It must contain an 'images' folder, a 'labels' folder
# and a 'classes.txt' file. Set the PPE_DATA_DIR environment variable to point at
# your own copy instead of editing this cell; the literal below is only the fallback.
DATA_DIR = os.environ.get("PPE_DATA_DIR", r"C:\Users\lapt1\Downloads\Hailo AI\ppe-detection")
IMAGES_DIR = os.path.join(DATA_DIR, 'images')  # source images
LABELS_DIR = os.path.join(DATA_DIR, 'labels')  # one '<image stem>.txt' label file per image

In [6]:
# Destination root for the split dataset. Set the PPE_OUTPUT_DIR environment
# variable to redirect it without editing this cell.
OUTPUT_BASE_DIR = os.environ.get("PPE_OUTPUT_DIR", r'C:\Users\lapt1\Downloads\Hailo AI\ppe-dataset')
TRAIN_RATIO = 0.8   # 80% for training
VAL_RATIO = 0.15    # 15% for validation
TEST_RATIO = 0.05   # 5% for testing (set to 0 to skip the test split)

# Fail early on a mistyped ratio instead of silently producing a skewed split.
# A small tolerance is used because the float sum is not exactly 1.0.
if min(TRAIN_RATIO, VAL_RATIO, TEST_RATIO) < 0:
    raise ValueError("TRAIN_RATIO, VAL_RATIO and TEST_RATIO must be non-negative")
if abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) > 1e-9:
    raise ValueError(
        f"Split ratios must sum to 1.0, got {TRAIN_RATIO + VAL_RATIO + TEST_RATIO}"
    )

In [10]:
# Create the <OUTPUT_BASE_DIR>/<images|labels>/<split> folder tree.
# exist_ok=True makes this cell safe to re-run.
for _kind in ('images', 'labels'):
    for _split in ('train', 'val'):
        os.makedirs(os.path.join(OUTPUT_BASE_DIR, _kind, _split), exist_ok=True)

# The test folders are only needed when a test split was requested.
if TEST_RATIO > 0:
    for _kind in ('images', 'labels'):
        os.makedirs(os.path.join(OUTPUT_BASE_DIR, _kind, 'test'), exist_ok=True)

In [11]:
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the exact same split
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')

# os.listdir() returns entries in arbitrary order, so sort first; otherwise the
# seeded shuffle below would still start from a platform-dependent ordering.
# Extensions are matched case-insensitively so files such as 'IMG_01.JPG' are kept.
image_files = sorted(
    f for f in os.listdir(IMAGES_DIR) if f.lower().endswith(IMAGE_EXTENSIONS)
)

# Shuffle with a private RNG so the global `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(image_files)

In [12]:
num_images = len(image_files)
num_train = int(num_images * TRAIN_RATIO)
if TEST_RATIO > 0:
    num_val = int(num_images * VAL_RATIO)
else:
    # With no test split, the images left over from int() truncation must go to
    # validation; otherwise they would land in test_files, which is never copied.
    num_val = num_images - num_train
num_test = num_images - num_train - num_val  # whatever remains goes to test

# Consecutive, non-overlapping slices of the shuffled list; together they cover every image.
train_files = image_files[:num_train]
val_files = image_files[num_train:num_train + num_val]
test_files = image_files[num_train + num_val:]

In [13]:
# Report the size of the whole dataset and of each split, one line per entry.
print(
    f"Total gambar: {num_images}",
    f"Train: {len(train_files)} gambar",
    f"Validation: {len(val_files)} gambar",
    f"Test: {len(test_files)} gambar",
    sep="\n",
)

Total gambar: 768
Train: 614 gambar
Validation: 115 gambar
Test: 39 gambar


In [14]:
def copy_files(file_list, target_image_dir, target_label_dir,
               source_image_dir=None, source_label_dir=None):
    """Copy images and their same-named ``.txt`` label files into a split folder.

    Args:
        file_list: Image file names (with extension) located in the source image directory.
        target_image_dir: Destination directory for the images (created if missing).
        target_label_dir: Destination directory for the label files (created if missing).
        source_image_dir: Directory holding the images. Defaults to the
            notebook-level ``IMAGES_DIR``.
        source_label_dir: Directory holding the label files. Defaults to the
            notebook-level ``LABELS_DIR``.

    Raises:
        FileNotFoundError: If any listed image has no matching label file. This is
            checked before anything is copied, so a failure never leaves a
            half-populated split behind.
    """
    if source_image_dir is None:
        source_image_dir = IMAGES_DIR
    if source_label_dir is None:
        source_label_dir = LABELS_DIR

    # Pair every image with its label name: same stem, '.txt' extension.
    pairs = [(fname, os.path.splitext(fname)[0] + '.txt') for fname in file_list]

    # Validate up front so that all missing labels are reported in one go.
    missing = [
        label_fname for _, label_fname in pairs
        if not os.path.isfile(os.path.join(source_label_dir, label_fname))
    ]
    if missing:
        raise FileNotFoundError(
            f"{len(missing)} label file(s) missing in {source_label_dir}: "
            + ", ".join(missing)
        )

    os.makedirs(target_image_dir, exist_ok=True)
    os.makedirs(target_label_dir, exist_ok=True)

    for fname, label_fname in pairs:
        # Copy the image
        shutil.copy(os.path.join(source_image_dir, fname),
                    os.path.join(target_image_dir, fname))
        # Copy the label
        shutil.copy(os.path.join(source_label_dir, label_fname),
                    os.path.join(target_label_dir, label_fname))

# --- Copy each split into its own directory ---
# Each job is (progress message, split folder name, files belonging to that split).
_copy_jobs = [
    ("Menyalin file training...", 'train', train_files),
    ("Menyalin file validation...", 'val', val_files),
]
# The test split is optional and only copied when a test ratio was configured.
if TEST_RATIO > 0:
    _copy_jobs.append(("Menyalin file testing...", 'test', test_files))

for _message, _split, _files in _copy_jobs:
    print(_message)
    copy_files(_files,
               os.path.join(OUTPUT_BASE_DIR, 'images', _split),
               os.path.join(OUTPUT_BASE_DIR, 'labels', _split))

Menyalin file training...
Menyalin file validation...
Menyalin file testing...


In [15]:
# Ship the class-name list alongside the split dataset, then report completion.
_classes_src = os.path.join(DATA_DIR, 'classes.txt')
_classes_dst = os.path.join(OUTPUT_BASE_DIR, 'classes.txt')
shutil.copy(_classes_src, _classes_dst)
print("Dataset berhasil dibagi dan disalin!")

Dataset berhasil dibagi dan disalin!
