In [2]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import sys

# Make the parent directory importable. Nothing in this cell imports from it,
# so this is presumably needed by later cells -- TODO confirm it is still used.
sys.path.append('..')

# Set the path to your images_reshaped directory
base_path = '../images_reshaped'

# One record (dict) per image file; converted to a DataFrame afterwards.
data = []

# Walk <base_path>/<category>/<species folder>/<image file>.
# Both directory listings are sorted because os.listdir() returns entries in
# arbitrary order; a stable order keeps the row order of the saved metadata
# and image array reproducible between runs and machines.
for category in ['deadly', 'edible', 'poisonous', 'conditionally_edible']:
    category_path = os.path.join(base_path, category)
    for species_folder in sorted(os.listdir(category_path)):
        species_path = os.path.join(category_path, species_folder)
        if not os.path.isdir(species_path):
            continue  # skip stray files sitting next to the species folders

        for image_file in sorted(os.listdir(species_path)):
            if not image_file.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue  # not an image we know how to read

            image_path = os.path.join(species_path, image_file)

            # Load and preprocess the image.
            # - `with` closes the underlying file as soon as the pixels are read.
            # - convert('RGB') forces three channels, so a grayscale, palette or
            #   RGBA file cannot produce an array of a different shape and break
            #   the np.stack() call used when the images are saved.
            with Image.open(image_path) as img:
                img_array = np.array(img.convert('RGB')) / 255.0  # Normalize to [0, 1]

            data.append({
                'image_path': image_path,
                'category': category,
                'species': species_folder,
                'image': img_array
            })

# Build one row per loaded image from the collected records.
df = pd.DataFrame(data)

# Binary target: 1 when the category is exactly 'edible', 0 for every other category.
df['edible'] = df['category'].eq('edible').astype(int)

# Fit one encoder per label column, then map the string labels to integer codes.
# The fitted encoders stay available for turning codes back into labels.
le_category = LabelEncoder().fit(df['category'])
le_species = LabelEncoder().fit(df['species'])
df['category_encoded'] = le_category.transform(df['category'])
df['species_encoded'] = le_species.transform(df['species'])

# Metadata: every column except the pixel arrays, written as a pickled DataFrame.
df_save = df.drop('image', axis=1)
df_save.to_pickle('mushroom_metadata.pkl')

# Pixels: stack the per-row arrays along a new leading axis and write a single
# .npy file. Both files follow the row order of `df`, so row i of the metadata
# corresponds to entry i of the array.
np.save('mushroom_images.npy', np.stack(df['image'].tolist()))

# Summarise what was written and a few headline counts.
total_images = len(df)
edible_count = df['edible'].sum()  # 'edible' is 0/1, so the sum is the number of edible rows

print("Data preprocessing completed.")
print("Metadata saved as 'mushroom_metadata.pkl'.")
print("Image data saved as 'mushroom_images.npy'.")
print(f"Total images processed: {total_images}")
print(f"Number of edible mushrooms: {edible_count}")
print(f"Number of non-edible mushrooms: {total_images - edible_count}")
print(f"Number of unique species: {df['species'].nunique()}")
# Shape of the first image only; the other rows are not inspected here.
print(f"Image shape: {df['image'].iloc[0].shape}")

Data preprocessing completed.
Metadata saved as 'mushroom_metadata.pkl'.
Image data saved as 'mushroom_images.npy'.
Total images processed: 8781
Number of edible mushrooms: 2475
Number of non-edible mushrooms: 6306
Number of unique species: 247
Image shape: (256, 256, 3)


In [11]:
# Sanity check: the number of distinct integer codes should equal the number
# of unique species reported by the preprocessing cell.
df.species_encoded.nunique()

247