In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split
import logging


In [2]:
def organize_data(data_dir, output_dir, test_size=0.2, val_size=0.2, random_state=42):
    """Copy class images from ``data_dir`` into train/val/test folders.

    Input layout read by this function::

        data_dir/<category>/<class_name>/<image file>

    where ``<category>`` is ``digits`` or ``symbols`` and image files are
    those ending in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive).

    Output layout written by this function::

        output_dir/<split>/<class_name>/<image file>

    with ``<split>`` in ``train``, ``val``, ``test``. The category level is
    not kept in the output, so a class name that appears under both
    categories is written to the same output folders.

    Args:
        data_dir: Root directory holding the category folders.
        output_dir: Root directory for the split folders; created if needed.
        test_size: Fraction of each class held out as the test split.
        val_size: Fraction of the remaining (non-test) images used for
            validation.
        random_state: Seed forwarded to ``train_test_split``.

    Raises:
        ValueError: If ``data_dir`` does not exist.

    Any exception raised while splitting or copying a single class is logged
    and that class is skipped; processing continues with the next class.
    """
    # NOTE: basicConfig does nothing if the root logger already has handlers,
    # e.g. because a logging call was made before this function ran.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    if not os.path.exists(data_dir):
        raise ValueError(f"Data directory does not exist: {data_dir}")

    split_names = ('train', 'val', 'test')
    image_suffixes = ('.png', '.jpg', '.jpeg')

    for split in split_names:
        os.makedirs(os.path.join(output_dir, split), exist_ok=True)

    for category in ['digits', 'symbols']:
        category_dir = os.path.join(data_dir, category)
        # isdir (not exists): a stray regular file with the category's name
        # would otherwise make os.listdir raise.
        if not os.path.isdir(category_dir):
            logging.warning(f"Category directory not found: {category_dir}")
            continue

        class_dirs = sorted(
            d for d in os.listdir(category_dir)
            if os.path.isdir(os.path.join(category_dir, d))
        )

        if not class_dirs:
            logging.warning(f"No class directories found in {category_dir}")
            continue

        for class_name in class_dirs:
            class_dir = os.path.join(category_dir, class_name)

            # os.listdir returns entries in arbitrary order. Sorting makes the
            # seeded split reproducible, so a re-run cannot place an image in
            # a different split than the previous run did. Only regular files
            # are kept so a directory named like an image cannot break the copy.
            image_files = sorted(
                f for f in os.listdir(class_dir)
                if f.lower().endswith(image_suffixes)
                and os.path.isfile(os.path.join(class_dir, f))
            )

            if not image_files:
                logging.warning(f"No images found in class directory: {class_dir}")
                continue

            logging.info(f"Processing {len(image_files)} images for class {class_name}")

            try:
                # First carve off the test set, then split the remainder into
                # train and validation.
                train_val, test = train_test_split(
                    image_files, test_size=test_size, random_state=random_state
                )
                train, val = train_test_split(
                    train_val, test_size=val_size, random_state=random_state
                )

                for split, files in zip(split_names, [train, val, test]):
                    split_dir = os.path.join(output_dir, split, class_name)
                    os.makedirs(split_dir, exist_ok=True)

                    for file in files:
                        src = os.path.join(class_dir, file)
                        dst = os.path.join(split_dir, file)
                        shutil.copy(src, dst)

                logging.info(f"Successfully processed class {class_name}: "
                             f"train={len(train)}, val={len(val)}, test={len(test)}")

            except Exception as e:
                # Best effort per class: e.g. train_test_split raises when a
                # class has too few images to populate every split.
                logging.error(f"Error processing class {class_name}: {str(e)}")
                continue


In [3]:
def verify_data_structure(data_dir):
    """Count the images available per class under ``data_dir``.

    Looks for the ``digits`` and ``symbols`` category folders directly under
    ``data_dir`` and, for every sub-directory (class) inside them, counts the
    entries whose name ends in ``.png``, ``.jpg`` or ``.jpeg``
    (case-insensitive).

    Returns:
        A dict ``{'total_images': int, 'categories': {category: {class: count}}}``.
        A category that is missing on disk is logged as a warning and left out
        of ``categories``. If ``data_dir`` itself does not exist, an error is
        logged and the empty summary is returned.
    """
    summary = {'total_images': 0, 'categories': {}}

    if not os.path.exists(data_dir):
        logging.error(f"Data directory does not exist: {data_dir}")
        return summary

    image_suffixes = ('.png', '.jpg', '.jpeg')

    for category in ('digits', 'symbols'):
        category_path = os.path.join(data_dir, category)

        if os.path.exists(category_path):
            per_class = {}
            summary['categories'][category] = per_class

            for entry in os.listdir(category_path):
                entry_path = os.path.join(category_path, entry)
                # Only sub-directories are treated as classes.
                if not os.path.isdir(entry_path):
                    continue

                n_images = sum(
                    1 for name in os.listdir(entry_path)
                    if name.lower().endswith(image_suffixes)
                )
                per_class[entry] = n_images
                summary['total_images'] += n_images
        else:
            logging.warning(f"Category directory not found: {category_path}")

    return summary


In [4]:
if __name__ == "__main__":
    # Configure logging before the first log call. A module-level logging
    # call made on an unconfigured root logger installs a default
    # WARNING-level handler, which would drop the INFO line below and turn
    # any later basicConfig call into a no-op.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Build the paths with os.path.join: the literal '..\data' contains the
    # invalid escape sequence '\d' and is only a valid path on Windows.
    data_dir = os.path.join('..', 'data')
    output_dir = os.path.join('..', 'dataset')

    # First verify the data structure
    stats = verify_data_structure(data_dir)
    logging.info(f"Data structure statistics: {stats}")

    # Only build the split folders when there is something to copy.
    if stats['total_images'] > 0:
        organize_data(data_dir, output_dir)
    else:
        logging.error("No images found in the data directory structure")