In [1]:
import os
import shutil
import random
from math import floor

def _list_wav_files(directory):
    """Return the sorted full paths of the '.wav' files directly inside `directory`."""
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.wav')
    )


def _copy_files(files, target_dir):
    """Create `target_dir` if it is missing and copy every path in `files` into it."""
    os.makedirs(target_dir, exist_ok=True)
    for path in files:
        shutil.copy(path, os.path.join(target_dir, os.path.basename(path)))


def distribute_noniid_clients(
    good_dir,
    bad_dir,
    output_base_dir,
    num_clients=5,
    skew=None,
    seed=42
):
    """
    Copy 'good' and 'bad' .wav files into per-client folders with an optional class skew.

    Every client receives the same total number of files,
    ``min(#good, #bad) // num_clients``; only the good/bad mix differs between
    clients. Files are shuffled with ``seed`` and handed out in consecutive,
    non-overlapping slices, so no file is given to two clients. Files are
    copied to ``<output_base_dir>/client_<k>/good`` and ``.../bad`` (k starts
    at 1). Existing client folders are reused as-is; files already in them are
    not removed.

    Args:
        good_dir: Directory holding the 'good' class .wav files.
        bad_dir: Directory holding the 'bad' class .wav files.
        output_base_dir: Directory under which the client folders are created.
        num_clients: Number of client folders to produce (must be >= 1).
        skew: Optional flat sequence of ``(good_ratio, bad_ratio)`` pairs, one
            pair per client. Only the good ratio (the even-indexed entry) is
            read; the bad share is always ``1 - good_ratio``. When omitted or
            empty, every client gets a 50/50 split.
        seed: Seed passed to ``random.seed`` before shuffling (this reseeds the
            module-level random generator).

    Raises:
        ValueError: If ``num_clients`` is less than 1, ``skew`` has no good
            ratio for some client, or a good ratio lies outside [0, 1].
    """
    if num_clients < 1:
        raise ValueError(f"num_clients must be at least 1, got {num_clients}")

    # Resolve and validate every client's ratio up front so a bad `skew`
    # fails before any folder has been written.
    if skew:
        # The highest index read is 2 * (num_clients - 1): the good ratio of
        # the last client.
        if len(skew) < 2 * num_clients - 1:
            raise ValueError(
                f"skew must hold a (good, bad) ratio pair for each of the "
                f"{num_clients} clients, got {len(skew)} values"
            )
        good_ratios = [skew[2 * i] for i in range(num_clients)]
    else:
        good_ratios = [0.5] * num_clients

    for client_no, ratio in enumerate(good_ratios, start=1):
        if not 0 <= ratio <= 1:
            raise ValueError(
                f"good ratio for client {client_no} must be within [0, 1], got {ratio}"
            )

    random.seed(seed)

    good_files = _list_wav_files(good_dir)
    bad_files = _list_wav_files(bad_dir)

    # Ensure even partitioning: size every client by the scarcer class so the
    # consecutive slices below can never run past the end of either list.
    per_client_total = min(len(good_files), len(bad_files)) // num_clients
    random.shuffle(good_files)
    random.shuffle(bad_files)

    print(f"[INFO] Total Good Files: {len(good_files)}")
    print(f"[INFO] Total Bad Files: {len(bad_files)}")

    print(f"[INFO] Target per client total: {per_client_total} samples (mix of good & bad)")

    good_index = 0
    bad_index = 0

    for i, g_ratio in enumerate(good_ratios):
        # Round away binary floating-point noise before flooring: 100 * 0.29
        # evaluates to 28.999999999999996 and would otherwise truncate to 28.
        g_count = floor(round(per_client_total * g_ratio, 9))
        b_count = per_client_total - g_count  # keeps good + bad == per_client_total

        g_samples = good_files[good_index:good_index + g_count]
        b_samples = bad_files[bad_index:bad_index + b_count]

        good_index += g_count
        bad_index += b_count

        # Setup client folders and copy this client's share into them
        client_dir = os.path.join(output_base_dir, f'client_{i+1}')
        _copy_files(g_samples, os.path.join(client_dir, 'good'))
        _copy_files(b_samples, os.path.join(client_dir, 'bad'))

        print(f"\n📦 Client {i+1} Total: {len(g_samples) + len(b_samples)}")
        print(f"   └─ Good files: {len(g_samples)}")
        print(f"   └─ Bad  files: {len(b_samples)}")

# Build the skewed five-client split of the augmented material-tap recordings.
# Each tuple below is one client's (good ratio, bad ratio); the tuples are
# flattened into the flat list layout that `skew` expects.
distribute_noniid_clients(
    "resources/material/train-data/augmented-good-material-taps",
    "resources/material/train-data/augmented-bad-material-taps",
    "resources/material/train-data/federated/skewed",
    num_clients=5,
    seed=42,
    skew=[
        ratio
        for client_pair in (
            (0.7, 0.3),  # Client 1
            (0.5, 0.5),  # Client 2
            (0.3, 0.7),  # Client 3
            (0.6, 0.4),  # Client 4
            (0.4, 0.6),  # Client 5
        )
        for ratio in client_pair
    ],
)


[INFO] Total Good Files: 11025
[INFO] Total Bad Files: 11619
[INFO] Target per client total: 2205 samples (mix of good & bad)

📦 Client 1 Total: 2205
   └─ Good files: 1543
   └─ Bad  files: 662

📦 Client 2 Total: 2205
   └─ Good files: 1102
   └─ Bad  files: 1103

📦 Client 3 Total: 2205
   └─ Good files: 661
   └─ Bad  files: 1544

📦 Client 4 Total: 2205
   └─ Good files: 1323
   └─ Bad  files: 882

📦 Client 5 Total: 2205
   └─ Good files: 882
   └─ Bad  files: 1323
