In [1]:
import os
import shutil
from PIL import Image
from tqdm import tqdm


In [2]:
# 📂 Project root. The default is the original author's checkout; set the
# PLANT_DISEASE_ROOT environment variable to run this notebook on another
# machine without editing the cell.
PROJECT_ROOT = os.environ.get(
    "PLANT_DISEASE_ROOT", r"C:\Users\tiwar\plant_disease_detection"
)

# 📂 Original dataset locations: one sub-folder of <root>/Dataset per source dataset
original_datasets = {
    dataset_name: os.path.join(PROJECT_ROOT, "Dataset", dataset_name)
    for dataset_name in ("plantdoc", "agri_india", "wheat_disease")
}

# 📂 Final cleaned dataset location
processed_path = os.path.join(PROJECT_ROOT, "processed_dataset")

# 📏 Set standard image size for all inputs (used in DL models)
standard_size = (224, 224)


In [3]:
def clean_and_resize_images(source_path, target_path, size=(224, 224)):
    """
    Convert, resize and copy images from source to target path.

    ``source_path`` is scanned for class sub-folders; entries that are not
    directories are ignored. Every file inside a class folder is opened with
    PIL, converted to RGB, resized to ``size`` and saved into the class folder
    of the same name under ``target_path``. Files that fail to open, convert or
    save are skipped and reported.

    Output files are named ``<dataset>_<class>_<n>.jpg`` (spaces replaced by
    underscores), where ``<dataset>`` is the name of the source folder and
    ``<n>`` is a running counter for this call. The dataset prefix matters when
    several source datasets are written into the same ``target_path``: the
    counter restarts at 0 on every call, so without the prefix two datasets
    sharing a class name would overwrite each other's files.

    Args:
        source_path: Folder containing one sub-folder per class.
        target_path: Folder receiving the processed class sub-folders
            (created on demand).
        size: Target size passed to ``Image.resize``.

    Returns:
        None. A summary (saved count, skipped count, skipped files) is printed.
    """
    total_images = 0
    skipped = []  # (source path, exception) for every file that could not be processed

    # abspath also normalises trailing separators and "." so basename is meaningful
    dataset_tag = os.path.basename(os.path.abspath(source_path)).replace(' ', '_')
    name_prefix = f"{dataset_tag}_" if dataset_tag else ""

    # Sorted listings keep the numbering stable, so a re-run rewrites the same
    # file names instead of depending on the filesystem's listing order.
    for class_folder in sorted(os.listdir(source_path)):
        class_path = os.path.join(source_path, class_folder)
        if not os.path.isdir(class_path):
            continue

        # Create same class folder in target
        target_class_path = os.path.join(target_path, class_folder)
        os.makedirs(target_class_path, exist_ok=True)

        # Process images in each class
        for img_file in tqdm(sorted(os.listdir(class_path)), desc=f"📁 Processing {class_folder}"):
            src_img_path = os.path.join(class_path, img_file)
            try:
                with Image.open(src_img_path) as img:
                    resized = img.convert("RGB").resize(size)

                    # Uniform name: dataset + class (spaces -> underscores) + counter
                    clean_name = (
                        f"{name_prefix}{class_folder.replace(' ', '_')}_{total_images}.jpg"
                    )
                    save_path = os.path.join(target_class_path, clean_name)
                    resized.save(save_path)
                    total_images += 1

            except Exception as err:
                # Best effort: one unreadable file must not abort the whole run,
                # but remember it so it can be reported below.
                skipped.append((src_img_path, err))
                continue

    print(f"\n✅ Done: {total_images} images saved.")
    print(f"⚠️ Corrupted/Skipped: {len(skipped)}")
    for skipped_path, reason in skipped:
        print(f"   - {skipped_path}: {reason}")


In [4]:
# Make sure final target folder exists
os.makedirs(processed_path, exist_ok=True)

# Loop through and process each dataset into the shared target folder.
# The configured standard_size is passed explicitly so that changing it in the
# config cell actually takes effect (otherwise the function default is used).
for name, path in original_datasets.items():
    print(f"\n🔄 Processing Dataset: {name}")
    clean_and_resize_images(path, processed_path, size=standard_size)



🔄 Processing Dataset: plantdoc


📁 Processing Apple_leaf: 100%|████████████████████████████████████████████████████████| 88/88 [00:04<00:00, 18.46it/s]
📁 Processing Apple_rust_leaf: 100%|█████████████████████████████████████████████████| 106/106 [00:05<00:00, 20.70it/s]
📁 Processing Apple_Scab_Leaf: 100%|███████████████████████████████████████████████████| 93/93 [00:03<00:00, 26.73it/s]
📁 Processing Bell_pepper_leaf: 100%|██████████████████████████████████████████████████| 42/42 [00:01<00:00, 27.03it/s]
📁 Processing Bell_pepper_leaf_spot: 100%|█████████████████████████████████████████████| 83/83 [00:04<00:00, 18.02it/s]
📁 Processing Blueberry_leaf: 100%|██████████████████████████████████████████████████| 117/117 [00:06<00:00, 17.71it/s]
📁 Processing Cherry_leaf: 100%|███████████████████████████████████████████████████████| 57/57 [00:04<00:00, 13.83it/s]
📁 Processing Corn_Gray_leaf_spot: 100%|███████████████████████████████████████████████| 67/67 [00:02<00:00, 25.93it/s]
📁 Processing Corn_leaf_blight: 100%|████████████


✅ Done: 2922 images saved.
⚠️ Corrupted/Skipped: 0

🔄 Processing Dataset: agri_india


📁 Processing American Bollworm on Cotton: 100%|███████████████████████████████████████| 56/56 [00:03<00:00, 17.17it/s]
📁 Processing Anthracnose on Cotton: 100%|█████████████████████████████████████████████| 29/29 [00:01<00:00, 16.07it/s]
📁 Processing Army worm: 100%|█████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 40.30it/s]
📁 Processing bacterial_blight in Cotton: 100%|██████████████████████████████████████| 489/489 [00:11<00:00, 41.00it/s]
📁 Processing Becterial Blight in Rice: 100%|██████████████████████████████████████| 1584/1584 [00:26<00:00, 60.41it/s]
📁 Processing bollrot on Cotton: 100%|███████████████████████████████████████████████████| 2/2 [00:00<00:00, 53.64it/s]
📁 Processing bollworm on Cotton: 100%|████████████████████████████████████████████████| 22/22 [00:00<00:00, 25.56it/s]
📁 Processing Brownspot: 100%|█████████████████████████████████████████████████████| 1640/1640 [00:24<00:00, 66.25it/s]
📁 Processing Common_Rust: 100%|█████████████████


✅ Done: 15341 images saved.
⚠️ Corrupted/Skipped: 1

🔄 Processing Dataset: wheat_disease


📁 Processing Aphid: 100%|███████████████████████████████████████████████████████████| 903/903 [00:17<00:00, 50.39it/s]
📁 Processing Black Rust: 100%|██████████████████████████████████████████████████████| 576/576 [01:59<00:00,  4.82it/s]
📁 Processing Blast: 100%|███████████████████████████████████████████████████████████| 647/647 [00:35<00:00, 18.31it/s]
📁 Processing Brown Rust: 100%|████████████████████████████████████████████████████| 1271/1271 [02:05<00:00, 10.17it/s]
📁 Processing Common Root Rot: 100%|█████████████████████████████████████████████████| 614/614 [00:14<00:00, 41.47it/s]
📁 Processing Fusarium Head Blight: 100%|████████████████████████████████████████████| 611/611 [00:10<00:00, 57.06it/s]
📁 Processing Healthy: 100%|███████████████████████████████████████████████████████| 1000/1000 [01:34<00:00, 10.53it/s]
📁 Processing Leaf Blight: 100%|█████████████████████████████████████████████████████| 842/842 [00:34<00:00, 24.16it/s]
📁 Processing Mildew: 100%|██████████████████████


✅ Done: 13104 images saved.
⚠️ Corrupted/Skipped: 0


In [5]:
def count_processed_images(processed_dir):
    """
    Print the number of images in each class folder of ``processed_dir`` and
    the overall total.

    Only sub-directories of ``processed_dir`` are treated as classes. Within a
    class folder only regular files with a common image extension are counted,
    so nested folders and stray files (e.g. ``Thumbs.db``) do not inflate the
    numbers. Classes are reported in case-insensitive alphabetical order so the
    output does not depend on the filesystem's listing order.

    Args:
        processed_dir: Folder containing one sub-folder per class.

    Returns:
        None. The report is printed.
    """
    image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp")
    total = 0
    class_wise = {}

    for cls in sorted(os.listdir(processed_dir), key=str.casefold):
        cls_path = os.path.join(processed_dir, cls)
        if not os.path.isdir(cls_path):
            continue

        with os.scandir(cls_path) as entries:
            count = sum(
                1
                for entry in entries
                if entry.is_file() and entry.name.lower().endswith(image_exts)
            )
        class_wise[cls] = count
        total += count

    print("📊 Image count per class:\n")
    for k, v in class_wise.items():
        print(f"🟢 {k}: {v} images")
    
    print(f"\n✅ TOTAL IMAGES: {total}")

# ✅ Print the per-class and total image counts for the processed dataset folder
count_processed_images(processed_path)


📊 Image count per class:

🟢 American Bollworm on Cotton: 56 images
🟢 Anthracnose on Cotton: 29 images
🟢 Aphid: 903 images
🟢 Apple_leaf: 88 images
🟢 Apple_rust_leaf: 106 images
🟢 Apple_Scab_Leaf: 93 images
🟢 Army worm: 40 images
🟢 bacterial_blight in Cotton: 489 images
🟢 Becterial Blight in Rice: 1584 images
🟢 Bell_pepper_leaf: 42 images
🟢 Bell_pepper_leaf_spot: 83 images
🟢 Black Rust: 576 images
🟢 Blast: 647 images
🟢 Blueberry_leaf: 117 images
🟢 bollrot on Cotton: 2 images
🟢 bollworm on Cotton: 22 images
🟢 Brown Rust: 1271 images
🟢 Brownspot: 1640 images
🟢 Cherry_leaf: 57 images
🟢 Common Root Rot: 614 images
🟢 Common_Rust: 1306 images
🟢 Corn_Gray_leaf_spot: 67 images
🟢 Corn_leaf_blight: 194 images
🟢 Corn_rust_leaf: 117 images
🟢 Cotton Aphid: 39 images
🟢 cotton mealy bug: 93 images
🟢 cotton whitefly: 55 images
🟢 Flag Smut: 179 images
🟢 Fusarium Head Blight: 611 images
🟢 grape_leaf: 75 images
🟢 grape_leaf_black_rot: 79 images
🟢 Gray_Leaf_Spot: 574 images
🟢 Healthy: 1000 images
🟢 Healthy 