<a href="https://colab.research.google.com/github/Triniti0/klasifikasi-penyakit-daun-cabai/blob/main/Preprocessing_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PREPROCESSING DAN SPLITTING DATASET



In [1]:
import os
import shutil
import random
import numpy as np
import cv2
from tqdm import tqdm

from google.colab import drive
# Mount Google Drive at /content/drive so the dataset folders referenced in the
# configuration cell (under /content/drive/MyDrive) are reachable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


konfigurasi awal

In [2]:
# Source images: one sub-folder per class, each containing that class's image files.
ORIGINAL_DATASET = "/content/drive/MyDrive/Seminar Hasil/dataset_original"
# Output root: receives train/val/test sub-folders, each with one folder per class.
PROCESSED_DATASET = "/content/drive/MyDrive/Seminar Hasil/dataset_processed"

# Per-class split fractions. The split cell cuts at TRAIN_RATIO and
# TRAIN_RATIO + VAL_RATIO and gives the remainder to the test set, so
# TEST_RATIO is only honoured when the three values sum to 1.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15

assert abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) < 1e-9, \
    "TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1"

# Side length (pixels) of the square images written to PROCESSED_DATASET.
IMG_SIZE = 224
# Seed shared by `random` (shuffling, augmentation) and NumPy.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)

direktori baru pada drive

In [3]:
def create_directory_structure(original_dir=None, processed_dir=None,
                               splits=('train', 'val', 'test')):
    """Recreate an empty ``<processed>/<split>/<class>`` folder tree.

    WARNING: any existing ``processed_dir`` is deleted first.

    Parameters
    ----------
    original_dir : str, optional
        Folder whose sub-directories are the class names.
        Defaults to the notebook-level ``ORIGINAL_DATASET``.
    processed_dir : str, optional
        Output root to wipe and rebuild.
        Defaults to the notebook-level ``PROCESSED_DATASET``.
    splits : iterable of str, optional
        Names of the split folders to create under ``processed_dir``.

    Raises
    ------
    OSError
        If ``original_dir`` cannot be listed (e.g. Drive not mounted). This is
        raised *before* anything is deleted.
    """
    if original_dir is None:
        original_dir = ORIGINAL_DATASET
    if processed_dir is None:
        processed_dir = PROCESSED_DATASET

    # Discover classes before touching the output: if the source is missing we
    # fail here and leave any previously processed dataset intact.
    # Only directories are classes; stray files (e.g. .DS_Store) are ignored.
    class_names = sorted(
        entry for entry in os.listdir(original_dir)
        if os.path.isdir(os.path.join(original_dir, entry))
    )

    if os.path.exists(processed_dir):
        shutil.rmtree(processed_dir)

    for split in splits:
        for class_name in class_names:
            os.makedirs(os.path.join(processed_dir, split, class_name), exist_ok=True)

# Runs immediately: removes any existing PROCESSED_DATASET tree and recreates
# the empty split/class folders, so re-running this cell discards prior output.
create_directory_structure()

# fungsi augmentasi rotasi, flip, zoom

In [4]:
def augment_image(image):
    """Return a randomly rotated, flipped and zoomed copy of ``image``.

    The transformations are drawn from the global ``random`` generator, in
    this order: rotation angle in [-25, 25] degrees, horizontal flip with
    probability 0.5, zoom factor in [0.8, 1.2].

    Parameters
    ----------
    image : numpy.ndarray
        Image array (height x width, optionally with a channel axis), e.g. as
        returned by ``cv2.imread``. The input array is not modified.

    Returns
    -------
    numpy.ndarray
        Augmented image with the same height and width as the input.
    """
    img = image.copy()
    h, w = img.shape[:2]

    # Random rotation about the image centre; corners uncovered by the
    # rotation are left black (warpAffine's default constant border).
    angle = random.uniform(-25, 25)
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1)
    img = cv2.warpAffine(img, M, (w, h))

    # Random horizontal flip
    if random.random() > 0.5:
        img = cv2.flip(img, 1)

    # Random zoom, applied about the image centre so the subject stays in
    # frame (anchoring at the top-left corner would shift it instead).
    zoom_factor = random.uniform(0.8, 1.2)
    zoomed = cv2.resize(img, None, fx=zoom_factor, fy=zoom_factor)
    zh, zw = zoomed.shape[:2]

    if zh >= h and zw >= w:
        # Zoom in: keep the central h x w window.
        top = (zh - h) // 2
        left = (zw - w) // 2
        return zoomed[top:top + h, left:left + w]

    # Zoom out: paste the shrunken image in the middle of a black canvas.
    canvas = np.zeros_like(img)
    top = (h - zh) // 2
    left = (w - zw) // 2
    canvas[top:top + zh, left:left + zw] = zoomed
    return canvas

# splitting (stratified) dan copy data ke direktori baru

In [5]:
# Number of training images actually written per class; used later as the
# basis for the balancing target.
train_counts = {}

# os.listdir returns entries in arbitrary order, so sort before shuffling:
# otherwise the seeded shuffle would not give a reproducible split.
for class_name in sorted(os.listdir(ORIGINAL_DATASET)):

    class_path = os.path.join(ORIGINAL_DATASET, class_name)
    if not os.path.isdir(class_path):
        # Stray files at the dataset root are not classes.
        continue

    images = sorted(os.listdir(class_path))
    random.shuffle(images)

    # Per-class (stratified) cut points; the test set takes the remainder.
    total = len(images)
    train_end = int(TRAIN_RATIO * total)
    val_end = int((TRAIN_RATIO + VAL_RATIO) * total)

    train_imgs = images[:train_end]
    val_imgs = images[train_end:val_end]
    test_imgs = images[val_end:]

    for split_name, split_data in zip(
        ['train', 'val', 'test'],
        [train_imgs, val_imgs, test_imgs]
    ):

        saved = 0
        for img_name in tqdm(split_data, desc=f"{class_name} - {split_name}"):

            img_path = os.path.join(class_path, img_name)
            image = cv2.imread(img_path)
            if image is None:
                # cv2.imread returns None for missing/corrupt/non-image files.
                tqdm.write(f"Skipping unreadable file: {img_path}")
                continue

            image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
            # Keep the image as 8-bit [0, 255]. Scaling to float [0, 1] before
            # cv2.imwrite would be cast back to uint8 by the 8-bit encoders,
            # leaving only values 0/1 (an almost black file). Normalise when
            # the images are loaded for training instead.

            save_path = os.path.join(PROCESSED_DATASET, split_name, class_name, img_name)
            if not cv2.imwrite(save_path, image):
                tqdm.write(f"Failed to write: {save_path}")
                continue
            saved += 1

        if split_name == 'train':
            train_counts[class_name] = saved

Anthracnose - train: 100%|██████████| 113/113 [00:59<00:00,  1.88it/s]
Anthracnose - val: 100%|██████████| 24/24 [00:11<00:00,  2.05it/s]
Anthracnose - test: 100%|██████████| 25/25 [00:14<00:00,  1.67it/s]
White spot - train: 100%|██████████| 136/136 [00:57<00:00,  2.38it/s]
White spot - val: 100%|██████████| 29/29 [00:17<00:00,  1.67it/s]
White spot - test: 100%|██████████| 30/30 [00:14<00:00,  2.03it/s]
Bacterial Spot - train: 100%|██████████| 106/106 [00:58<00:00,  1.80it/s]
Bacterial Spot - val: 100%|██████████| 23/23 [00:13<00:00,  1.74it/s]
Bacterial Spot - test: 100%|██████████| 23/23 [00:13<00:00,  1.69it/s]
Healthy Leaf - train: 100%|██████████| 320/320 [01:22<00:00,  3.86it/s]
Healthy Leaf - val: 100%|██████████| 69/69 [00:15<00:00,  4.52it/s]
Healthy Leaf - test: 100%|██████████| 69/69 [00:10<00:00,  6.81it/s]
yellow disease - train: 100%|██████████| 1352/1352 [02:22<00:00,  9.46it/s]
yellow disease - val: 100%|██████████| 290/290 [00:16<00:00, 17.11it/s]
yellow disease - te

# target balancing data terpusat pada training

In [6]:
# Balancing target: the size of the largest training class recorded in
# train_counts; every other class is oversampled up to this number.
max_train_count = max(count for count in train_counts.values())
print("Target per class (train):", max_train_count)

Target per class (train): 1352


# oversampling dengan augmentasi

In [7]:
# Oversample every training class up to max_train_count by writing augmented
# copies of its images next to the originals (prefix "aug_bal_").
train_root = os.path.join(PROCESSED_DATASET, 'train')

for class_name in sorted(os.listdir(train_root)):

    class_path = os.path.join(train_root, class_name)
    if not os.path.isdir(class_path):
        continue

    existing = sorted(os.listdir(class_path))
    # Only augment original images: after an interrupted run, previously
    # generated "aug_bal_" files must not be augmented a second time.
    images = [name for name in existing if not name.startswith("aug_bal_")]

    current_count = len(existing)
    deficit = max_train_count - current_count

    print(f"{class_name}: current={current_count}, need={deficit}")

    # Cycle through the source images until the class reaches the target.
    i = 0
    while deficit > 0 and images:
        img_name = images[i % len(images)]
        img_path = os.path.join(class_path, img_name)

        image = cv2.imread(img_path)
        if image is None:
            # Unreadable source: drop it so the loop cannot spin on it forever.
            print(f"Skipping unreadable file: {img_path}")
            images.remove(img_name)
            continue

        # Save the augmented image as 8-bit [0, 255]. Dividing by 255 before
        # cv2.imwrite would be cast back to uint8 by the 8-bit encoders and
        # produce an almost black file.
        aug_img = augment_image(image)

        # `deficit` is unique per generated file, so names never collide.
        new_name = f"aug_bal_{deficit}_{img_name}"
        cv2.imwrite(os.path.join(class_path, new_name), aug_img)

        deficit -= 1
        i += 1

    if deficit > 0:
        # Reached only when the class has no readable source images.
        print(f"{class_name}: could not balance, still missing {deficit} images")

Anthracnose: current=113, need=1239
White spot: current=136, need=1216
Bacterial Spot: current=106, need=1246
Healthy Leaf: current=320, need=1032
yellow disease: current=1352, need=0
Curl Virus: current=296, need=1056
Cercospora Leaf Spot: current=122, need=1230
Nutrition Deficiency: current=310, need=1042
