# In [None]:  (Jupyter notebook cell marker left over from the export)
import os
import numpy as np
import cv2
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

def calculate_dataset_statistics(image_paths, sample_size=None, num_workers=8, verbose=True):
    """
    Calculate the global per-channel mean and standard deviation of a dataset.

    Pixel values are scaled to [0, 1]. Every readable image contributes with
    equal weight, regardless of its resolution. The standard deviation is
    derived from the averaged first and second moments
    (std = sqrt(E[x^2] - E[x]^2)), so it accounts for brightness differences
    between images as well as the spread inside each image.

    Args:
        image_paths: Sequence of paths to all images in the dataset.
                     The sequence is copied and never modified.
        sample_size: Number of images to sample (to speed up calculation).
                     If None, uses all images. Sampling uses numpy's global
                     random state, so np.random.seed() makes it reproducible.
        num_workers: Number of parallel worker threads for processing
        verbose: Whether to print detailed logs

    Returns:
        tuple: (mean, std) where each is a list of three floats [r, g, b]

    Raises:
        ValueError: If no paths are given, if sample_size is not positive,
                    or if none of the images could be processed.
    """
    # Work on a private copy so the caller's sequence is never reordered.
    image_paths = [] if image_paths is None else list(image_paths)
    if not image_paths:
        raise ValueError("No image paths provided")

    if sample_size is not None and sample_size <= 0:
        raise ValueError(f"sample_size must be a positive integer or None, got {sample_size}")

    print(f"Calculating global statistics from {len(image_paths)} images...")

    # Sample images if needed (random subset, drawn without replacement)
    if sample_size is not None and sample_size < len(image_paths):
        chosen = np.random.permutation(len(image_paths))[:sample_size]
        image_paths = [image_paths[i] for i in chosen]
        print(f"Sampling {sample_size} images for calculation")

    # Try to read the first image to verify paths are correct
    if verbose:
        test_path = image_paths[0]
        print(f"Testing image reading with first image: {test_path}")
        try:
            test_img = cv2.imread(test_path)
            if test_img is None:
                print(f"WARNING: First image could not be read. Check if path exists: {os.path.exists(test_path)}")
                print(f"Full absolute path: {os.path.abspath(test_path)}")
            else:
                print(f"Successfully read test image with shape {test_img.shape}")
        except Exception as e:
            # Purely diagnostic: a failure here must not abort the real run.
            print(f"Error reading first image: {e}")

    # Running sums of the per-image channel moments (float64 for accuracy)
    sum_of_means = np.zeros(3, dtype=np.float64)
    sum_of_mean_squares = np.zeros(3, dtype=np.float64)

    # Both counters are only touched by this (the consuming) thread, so the
    # worker threads share no mutable state.
    error_count = 0
    processed_count = 0
    max_reported_errors = 5  # Limit error messages to avoid flooding console

    # Process images in parallel
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_path = {
            executor.submit(_image_channel_moments, path): path
            for path in image_paths
        }

        # Process results as they complete
        for future in tqdm(as_completed(future_to_path), total=len(image_paths)):
            img_path = future_to_path[future]
            try:
                moments = future.result()
            except Exception as e:
                # Best effort: one broken file must not stop the whole scan.
                error_count += 1
                if verbose and error_count <= max_reported_errors:
                    print(f"Error processing {img_path}: {e}")
                continue

            if moments is None:
                error_count += 1
                if verbose and error_count <= max_reported_errors:
                    print(f"Warning: Could not read {img_path}")
                    print(f"  Path exists: {os.path.exists(img_path)}")
                    print(f"  File size: {os.path.getsize(img_path) if os.path.exists(img_path) else 'N/A'} bytes")
                continue

            channel_mean, channel_mean_sq = moments
            sum_of_means += channel_mean
            sum_of_mean_squares += channel_mean_sq
            processed_count += 1

    print(f"Processed {processed_count}/{len(image_paths)} images successfully")
    print(f"Encountered errors with {error_count} images")

    if processed_count == 0:
        raise ValueError(f"No valid images found for calculation. All {len(image_paths)} images failed to process.")

    # Global mean is the mean of all individual image means
    global_mean = sum_of_means / processed_count

    # Global variance is E[x^2] - E[x]^2 over the whole sample. Averaging the
    # per-image stds instead would ignore how much the image means differ
    # from each other and underestimate the spread.
    global_variance = sum_of_mean_squares / processed_count - np.square(global_mean)
    # Rounding can push a near-zero variance slightly below zero.
    global_std = np.sqrt(np.clip(global_variance, 0.0, None))

    # Plain Python floats print cleanly when pasted into model code.
    return global_mean.tolist(), global_std.tolist()


def _image_channel_moments(img_path):
    """Return per-channel (mean, mean of squares) of one image in RGB order, or None if it cannot be read."""
    img = cv2.imread(img_path)
    if img is None:
        return None

    # Convert to RGB (OpenCV loads in BGR)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Normalize to [0, 1]
    pixels = img.astype(np.float32) / 255.0

    # Reduce over height and width, accumulating in float64
    channel_mean = pixels.mean(axis=(0, 1), dtype=np.float64)
    channel_mean_sq = np.square(pixels).mean(axis=(0, 1), dtype=np.float64)
    return channel_mean, channel_mean_sq

def get_image_paths(data_dir, extensions=('.jpg', '.jpeg', '.png')):
    """
    Recursively collect the paths of all image files below a directory.

    A file counts as an image when its lower-cased name ends with one of
    the given extensions.

    Args:
        data_dir: Root directory to scan (subdirectories included)
        extensions: Tuple of accepted file extensions, lower-case

    Returns:
        list: Paths of the matching files, in os.walk order

    Raises:
        ValueError: If data_dir does not exist or is not a directory
    """
    if not os.path.isdir(data_dir):
        # Tell a missing path apart from one that exists but is not a directory.
        if not os.path.exists(data_dir):
            raise ValueError(f"Data directory does not exist: {data_dir}")
        raise ValueError(f"Data path is not a directory: {data_dir}")

    found = [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(data_dir)
        for filename in filenames
        if filename.lower().endswith(extensions)
    ]

    print(f"Found {len(found)} images with extensions {extensions}")
    if found:
        print(f"First few images: {found[:3]}")

    return found

# Example usage: scan a dataset directory, compute the channel statistics on a
# sample of its images, then print them and save them as .npy files.
if __name__ == "__main__":
    # Set your dataset directory
    DATASET_DIR = r'C:\Users\SIMON\Desktop\AMLS2_Cassava_CV\cassava-leaf-disease-classification'
    
    print(f"Using dataset directory: {os.path.abspath(DATASET_DIR)}")
    
    # Get all image paths
    image_paths = get_image_paths(DATASET_DIR)
    
    if not image_paths:
        print("No images found. Please check the directory path and image extensions.")
        # `exit()` is a convenience injected by the `site` module for
        # interactive sessions; raising SystemExit needs nothing in scope
        # and ends the script with the same exit status.
        raise SystemExit(1)
    
    # Calculate statistics (using a sample for faster calculation)
    # Increase sample_size for more accurate statistics
    global_mean, global_std = calculate_dataset_statistics(
        image_paths, 
        sample_size=5000,  # Sample 5000 images (adjust as needed)
        num_workers=8,     # Use 8 parallel workers (adjust based on your CPU)
        verbose=True       # Enable detailed logging
    )
    
    # Print the results
    print(f"Global Mean: {global_mean}")
    print(f"Global Std: {global_std}")
    
    # Save to file (written to the current working directory)
    np.save('global_mean.npy', global_mean)
    np.save('global_std.npy', global_std)
    
    print("Statistics saved to global_mean.npy and global_std.npy")
    
    # Example of how to use these values in your model
    print("\nIn your model code, set these values:")
    print(f"GLOBAL_MEAN = {global_mean}")
    print(f"GLOBAL_STD = {global_std}")

# Recorded notebook output (from a run where DATASET_DIR was 'path/to/your/dataset'):
#   Using dataset directory: c:\Users\SIMON\Desktop\AMLS2_Cassava_CV\notebooks\path\to\your\dataset
#
#   ValueError: Data directory does not exist: path/to/your/dataset