In [8]:
import os
import shutil
import random
from tqdm import tqdm

# ✅ Raw dataset path (already merged and cleaned)
raw_dataset_path = r"C:\Users\tiwar\plant_disease_detection\Dataset"

# ✅ Preprocessed output folder (train/test will be created here)
preprocessed_path = r"C:\Users\tiwar\plant_disease_detection\preprocessed_dataset"

train_dir = os.path.join(preprocessed_path, "train")
test_dir = os.path.join(preprocessed_path, "test")

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

print("✅ Folder setup complete")


✅ Folder setup complete


In [9]:
all_image_paths = []

for dataset_subfolder in os.listdir(raw_dataset_path):
    dataset_path = os.path.join(raw_dataset_path, dataset_subfolder)

    if not os.path.isdir(dataset_path):
        continue

    for class_name in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_name)
        if not os.path.isdir(class_path):
            continue

        for img_name in os.listdir(class_path):
            img_path = os.path.join(class_path, img_name)
            all_image_paths.append((img_path, class_name))

print(f"🧠 Total images found: {len(all_image_paths)}")


🧠 Total images found: 31368


In [10]:
# Shuffle to avoid bias
random.shuffle(all_image_paths)

# 80-20 split
split_ratio = 0.8
split_index = int(len(all_image_paths) * split_ratio)

train_data = all_image_paths[:split_index]
test_data = all_image_paths[split_index:]

print(f"✅ Training images: {len(train_data)}")
print(f"✅ Testing images: {len(test_data)}")


✅ Training images: 25094
✅ Testing images: 6274


In [11]:
def copy_data(data, target_dir):
    for img_path, class_name in tqdm(data, desc=f"📦 Copying to {target_dir}"):
        class_folder = os.path.join(target_dir, class_name)
        os.makedirs(class_folder, exist_ok=True)

        file_name = os.path.basename(img_path)
        target_path = os.path.join(class_folder, file_name)

        try:
            shutil.copy(img_path, target_path)
        except Exception as e:
            print(f"⚠️ Error copying {img_path}: {e}")

# Copying data
copy_data(train_data, train_dir)
copy_data(test_data, test_dir)

print("✅ All files copied successfully.")


📦 Copying to C:\Users\tiwar\plant_disease_detection\preprocessed_dataset\train: 100%|█| 25094/25094 [05:06<00:00, 81.8
📦 Copying to C:\Users\tiwar\plant_disease_detection\preprocessed_dataset\test:   1%| | 57/6274 [00:01<01:48, 57.14it/s

⚠️ Error copying C:\Users\tiwar\plant_disease_detection\Dataset\agri_india\Healthy Wheat\Wheat___Healthy: [Errno 13] Permission denied: 'C:\\Users\\tiwar\\plant_disease_detection\\Dataset\\agri_india\\Healthy Wheat\\Wheat___Healthy'


📦 Copying to C:\Users\tiwar\plant_disease_detection\preprocessed_dataset\test: 100%|█| 6274/6274 [01:20<00:00, 78.23it

✅ All files copied successfully.





In [14]:
import os

train_path = r"C:\Users\tiwar\plant_disease_detection\preprocessed_dataset\train"
test_path = r"C:\Users\tiwar\plant_disease_detection\preprocessed_dataset\test"

def verify_dataset_structure(train_dir, test_dir):
    if not os.path.exists(train_dir):
        print("❌ Train folder not found!")
        return
    if not os.path.exists(test_dir):
        print("❌ Test folder not found!")
        return

    train_classes = sorted([d for d in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, d))])
    test_classes = sorted([d for d in os.listdir(test_dir) if os.path.isdir(os.path.join(test_dir, d))])

    print(f"✅ Train Classes Found: {len(train_classes)}")
    print(f"✅ Test Classes Found:  {len(test_classes)}")

    if train_classes != test_classes:
        print("🚨 Mismatch in train and test class names!")
        print(f"🔍 Train-only classes: {set(train_classes) - set(test_classes)}")
        print(f"🔍 Test-only classes:  {set(test_classes) - set(train_classes)}")
    else:
        print("✅ Train and Test classes are perfectly aligned.")

    # Count total images
    total_train_images = sum([len(files) for r, d, files in os.walk(train_dir)])
    total_test_images = sum([len(files) for r, d, files in os.walk(test_dir)])

    print(f"🖼️ Total Train Images: {total_train_images}")
    print(f"🖼️ Total Test Images:  {total_test_images}")
    print("🏁 Dataset verification complete!")

# Run verification
verify_dataset_structure(train_path, test_path)


✅ Train Classes Found: 83
✅ Test Classes Found:  83
✅ Train and Test classes are perfectly aligned.
🖼️ Total Train Images: 25092
🖼️ Total Test Images:  6271
🏁 Dataset verification complete!
