In [1]:
import os
import shutil
import datasets
from PIL import Image
from tqdm import tqdm
from collections import defaultdict
import random


In [3]:
#--- Configuration ---
# Each root can be overridden with an environment variable so the notebook is
# runnable on machines other than the one it was authored on; the defaults are
# the original hard-coded locations.

# Directory the source images are read from.
RAW_DATASET_ROOT = os.environ.get(
    "MSL_RAW_DATASET_ROOT", r"C:\Users\msi\Desktop\msl-images\calibrated"
)
# Directory holding the train/val/test "*-calibrated-shuffled.txt" listing files.
TXT_FILES_ROOT = os.environ.get(
    "MSL_TXT_FILES_ROOT", r"C:\Users\msi\Desktop\msl-images"
)
# Directory the processed <split>/<label>/ image folders are written to.
PROJECT_DATA_ROOT = os.environ.get(
    "MSL_PROJECT_DATA_ROOT",
    r"C:\Users\msi\Desktop\TECH_UP\accelerated_network_SoC\data",
)

# Master label mapping: the position of a name in this list is the integer
# label id used in the listing files, so the order must not be changed.
label_names = [
    "apxs", "apxs cal target", "chemcam cal target", "chemin inlet open",
    "drill", "drill holes", "drt front", "drt side", "ground", "horizon",
    "inlet", "mahli", "mahli cal target", "mastcam", "mastcam cal target",
    "observation tray", "portion box", "portion tube", "portion tube opening",
    "rems uv sensor", "rover rear deck", "scoop", "sun", "turret", "wheel"
]

def _read_split_listing(txt_path):
    """Parse one ``<image path> <label id>`` listing into (basename, label id) pairs."""
    samples = []
    with open(txt_path, 'r') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. an extra newline at end of file).
                continue
            # Split from the right so an image path containing spaces stays intact.
            parts = line.rsplit(maxsplit=1)
            if len(parts) != 2:
                raise ValueError(
                    f"{txt_path}:{line_no}: expected '<image path> <label id>', got {line!r}"
                )
            img_rel_path, label_id_str = parts
            samples.append((os.path.basename(img_rel_path), int(label_id_str)))
    return samples


def process_and_rebalance(seed=42, train_frac=0.70, val_frac=0.15,
                          raw_root=None, txt_root=None, out_root=None, labels=None):
    """Pool the three listing files and re-split every class into train/validation/test.

    All samples named in the train/val/test ``*-calibrated-shuffled.txt`` files
    are merged, grouped by label id, shuffled, and divided per class with the
    given fractions (the remainder goes to ``test``). Each image that exists
    under ``raw_root`` is converted to 64x64 grayscale and saved as
    ``<out_root>/<split>/<label name>/<label name>_<i>.jpg``, where ``i`` is the
    image's position inside its split.

    Note that this makes the split *ratios* uniform across classes; it does not
    equalise the number of images per class.

    Args:
        seed: Seed for the shuffle. A fixed seed makes the split reproducible, so
            re-running over the same inputs rewrites the same files instead of
            moving images between splits. Pass ``None`` for a non-deterministic
            shuffle.
        train_frac: Fraction of each class assigned to ``train``.
        val_frac: Fraction of each class assigned to ``validation``.
        raw_root: Directory containing the source images
            (defaults to ``RAW_DATASET_ROOT``).
        txt_root: Directory containing the listing files
            (defaults to ``TXT_FILES_ROOT``).
        out_root: Directory the split folders are written to
            (defaults to ``PROJECT_DATA_ROOT``).
        labels: Sequence mapping label id -> label name
            (defaults to ``label_names``).

    Returns:
        None. Results are written to disk and a summary is printed.

    Raises:
        ValueError: If the fractions are negative or sum to more than 1, if a
            listing line is malformed, or if a label id has no entry in ``labels``.
        OSError: If a listing file cannot be opened or an image cannot be
            read or written.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError(
            f"train_frac and val_frac must be non-negative and sum to at most 1, "
            f"got {train_frac} and {val_frac}"
        )

    # Resolve defaults at call time so edits to the configuration cell are honoured.
    raw_root = RAW_DATASET_ROOT if raw_root is None else raw_root
    txt_root = TXT_FILES_ROOT if txt_root is None else txt_root
    out_root = PROJECT_DATA_ROOT if out_root is None else out_root
    labels = label_names if labels is None else labels

    # 1. Gather every available sample from all three listings so the new split
    #    is drawn from the full pool.
    all_txt_files = ["train-calibrated-shuffled.txt", "val-calibrated-shuffled.txt", "test-calibrated-shuffled.txt"]
    all_data = []
    for txt_file in all_txt_files:
        all_data.extend(_read_split_listing(os.path.join(txt_root, txt_file)))

    # 2. Group by class so each class is split independently.
    class_groups = defaultdict(list)
    for img_name, label_id in all_data:
        # Reject ids outside the label table explicitly: a negative id would
        # otherwise index from the end of the list and pick the wrong class.
        if not 0 <= label_id < len(labels):
            raise ValueError(
                f"label id {label_id} for {img_name!r} is outside 0..{len(labels) - 1}"
            )
        class_groups[label_id].append(img_name)

    # 3. Shuffle, split and save with fixed percentages.
    # A private generator keeps the split reproducible without reseeding the
    # global `random` state used elsewhere in the notebook.
    rng = random.Random(seed)
    missing_count = 0
    for label_id, images in class_groups.items():
        rng.shuffle(images)
        label_name = labels[label_id]

        # Calculate split indices; whatever is left after train and validation
        # (including rounding remainders) goes to test.
        n = len(images)
        train_end = int(n * train_frac)
        val_end = train_end + int(n * val_frac)

        split_map = {
            "train": images[:train_end],
            "validation": images[train_end:val_end],
            "test": images[val_end:]
        }

        for split_name, split_images in split_map.items():
            save_dir = os.path.join(out_root, split_name, label_name)
            os.makedirs(save_dir, exist_ok=True)

            for i, img_name in enumerate(split_images):
                img_path = os.path.join(raw_root, img_name)
                if not os.path.exists(img_path):
                    # Best effort: keep going, but report the total afterwards
                    # instead of dropping samples silently.
                    missing_count += 1
                    continue
                # Standardizing for FPGA: 64x64 Grayscale.
                # The context manager closes the source file handle promptly;
                # convert() has already loaded the pixel data by then.
                with Image.open(img_path) as src:
                    img = src.convert("L").resize((64, 64))
                img.save(os.path.join(save_dir, f"{label_name}_{i}.jpg"))

    if missing_count:
        print(f"WARNING: skipped {missing_count} listed image(s) not found under {raw_root}")
    print("\n✅ Data Re-organized and Balanced!")

# Run the re-split: reads the listing files under TXT_FILES_ROOT and the images
# under RAW_DATASET_ROOT, and writes the resized copies under PROJECT_DATA_ROOT.
process_and_rebalance()


✅ Data Re-organized and Balanced!
