In [1]:
import os
import pandas as pd
from pathlib import Path
import random

In [17]:
base_dir = Path("../dataset/raw")
output_dir = Path("../dataset/dataset_metadata")
PERCENT_USED = 0.2

In [18]:
data = []
for category_dir in base_dir.iterdir():
    if category_dir.is_dir():
        image_files = sorted(list(category_dir.glob("*.jpg")))
        sample_files = image_files[:int(len(image_files) * PERCENT_USED)]
        
        for file_path in sample_files:
            filename = file_path.name
            
            parts = category_dir.name.split("_")
            plant_type = parts[0]
            disease_type = parts[-1].split(".")[0].lower()  

            data.append({
                "filename": filename,
                "category": category_dir.name,
                "plant_type": plant_type,
                "disease_type": disease_type
            })

df = pd.DataFrame(data)
# Fix some disease category names
df.loc[df['disease_type'] == 'measles)', 'disease_type'] = 'measles'
df.loc[df['disease_type'] == 'greening)', 'disease_type'] = 'Citrus greening'
df.loc[df['disease_type'] == 'spot)', 'disease_type'] = 'spot'
df.loc[df['disease_type'] == '', 'disease_type'] = 'rust'

train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

output_dir.mkdir(exist_ok=True) 
train_df.to_csv(output_dir / "train_set.csv", index=False)
test_df.to_csv(output_dir / "test_set.csv", index=False)

In [19]:
print('All Disease Types:')
print(df['disease_type'].unique())
print('\n')

print('All Plant Types:')
print(df['plant_type'].unique())
print('\n')

All Disease Types:
['scab' 'rot' 'rust' 'healthy' 'mildew' 'spot' 'blight' 'measles'
 'Citrus greening' 'scorch' 'mold' 'mite' 'virus']


All Plant Types:
['Apple' 'Blueberry' 'Cherry' 'Corn' 'Grape' 'Orange' 'Peach' 'Pepper,'
 'Potato' 'Raspberry' 'Soybean' 'Squash' 'Strawberry' 'Tomato']


