In [None]:
import os
from PIL import Image
import shutil

In [None]:

# Dataset roots: raw images are read from the first path, cleaned copies
# are written beneath the second.
input_dir, output_dir = "./raw_dataset", "./cleaned_dataset"

In [None]:

# Ensure the destination root exists; exist_ok makes re-running this cell harmless.
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
def _quarantine(file_path, error_dir, soil_type):
    """Move ``file_path`` into ``error_dir`` under a name that is not taken yet.

    Returns:
        True when the file was moved, False when the move itself failed.
    """
    filename = os.path.basename(file_path)
    try:
        os.makedirs(error_dir, exist_ok=True)
        target = os.path.join(error_dir, filename)
        attempt = 0
        # The same file name can occur under several soil types; moving onto
        # an existing name could overwrite a previously quarantined file.
        while os.path.exists(target):
            attempt += 1
            prefix = soil_type if attempt == 1 else f"{soil_type}_{attempt}"
            target = os.path.join(error_dir, f"{prefix}_{filename}")
        shutil.move(file_path, target)
    except OSError as move_error:
        # Keep the run going: one file that cannot be moved should not abort
        # the cleaning of everything else.
        print(f"Could not move {file_path} to {error_dir}: {move_error}")
        return False
    return True


def clean_dataset(input_dir, output_dir, size=(256, 256), error_dir="./error_files"):
    """Validate, normalise and re-save every image of a folder-per-class dataset.

    Each immediate subdirectory of ``input_dir`` is treated as one soil type.
    Every regular file inside it is opened with Pillow, converted to RGB,
    resized to ``size`` and written to ``output_dir/<soil_type>/`` as
    ``<soil_type>_<n>.jpg``, where ``n`` is the 1-based position of the file in
    the sorted directory listing (skipped or failed entries leave gaps).

    Files that cannot be read as images are moved out of ``input_dir`` into
    ``error_dir``. Failures while writing the cleaned copy are only reported;
    the source file stays where it is.

    Args:
        input_dir: Root folder containing one subdirectory per soil type.
        output_dir: Root folder for the cleaned images; created as needed.
        size: ``(width, height)`` passed to ``Image.resize``.
        error_dir: Folder that receives unreadable files; created on first use.

    Raises:
        OSError: If ``input_dir`` or one of its subdirectories cannot be
            listed, or an output directory cannot be created.
    """
    saved = quarantined = 0

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # generated file numbers stable between runs and machines.
    for soil_type in sorted(os.listdir(input_dir)):
        soil_path = os.path.join(input_dir, soil_type)

        # Stray files next to the soil-type folders are ignored.
        if not os.path.isdir(soil_path):
            continue

        print(f"Processing soil type: {soil_type}")

        cleaned_soil_path = os.path.join(output_dir, soil_type)
        os.makedirs(cleaned_soil_path, exist_ok=True)

        for index, filename in enumerate(sorted(os.listdir(soil_path)), start=1):
            file_path = os.path.join(soil_path, filename)

            # Nested folders are not images; trying to open them would fail
            # and the whole folder would be moved into error_dir.
            if not os.path.isfile(file_path):
                print(f"Skipping non-file entry: {file_path}")
                continue

            try:
                with Image.open(file_path) as source:
                    # convert() and resize() return new images, so the result
                    # stays usable after the source file has been closed.
                    cleaned = source.convert("RGB").resize(size)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                if _quarantine(file_path, error_dir, soil_type):
                    quarantined += 1
                continue

            cleaned_path = os.path.join(cleaned_soil_path, f"{soil_type}_{index}.jpg")
            try:
                cleaned.save(cleaned_path, "JPEG")
            except Exception as e:
                # A write failure (full disk, permissions) says nothing about
                # the source image, so the source is left in place.
                print(f"Error saving file {cleaned_path}: {e}")
                continue
            saved += 1

    print(f"Saved {saved} image(s); moved {quarantined} unreadable file(s) to {error_dir}.")
    print("Data cleaning completed successfully.")

In [None]:
# Kick off the cleaning pass over the configured input and output folders.
clean_dataset(input_dir=input_dir, output_dir=output_dir)