In [1]:
import os
import shutil
import random

# Source dataset root and the folder that receives the per-client split
dataset_path = "D:/Major Project/Final Proper/Dataset"
output_path = "D:/Major Project/Final Proper/Split_Dataset"

# Paths of the two origin folders under the dataset root
american_path, chinese_path = (
    os.path.join(dataset_path, origin) for origin in ("American", "Chinese")
)

# For every client: fraction of each origin's files that is copied to it
clients = dict(
    C1=dict(Chinese=0.8, American=0.2),
    C2=dict(Chinese=0.2, American=0.8),
)

# Names of the class sub-folders processed for each origin
categories = ["Drowsy", "NonDrowsy"]

# Function to split and copy data
def split_data(source_path, dest_path, split_ratio, category_names=None):
    """Copy a random fraction of each category's files from source to destination.

    For every category, the regular files found in ``source_path/<category>``
    are shuffled with the global ``random`` generator and the first
    ``int(n * split_ratio)`` of them are copied to ``dest_path/<category>``,
    which is created when missing.

    Args:
        source_path: Folder containing one sub-folder per category.
        dest_path: Folder that receives one sub-folder per category.
        split_ratio: Fraction of each category's files to copy.
        category_names: Optional iterable of category folder names. When
            omitted, the module-level ``categories`` list is used.
    """
    names = categories if category_names is None else category_names
    for category in names:
        category_path = os.path.join(source_path, category)
        dest_category_path = os.path.join(dest_path, category)
        os.makedirs(dest_category_path, exist_ok=True)

        # os.listdir yields entries in arbitrary order and also returns
        # sub-directories. Keep regular files only (shutil.copy cannot copy a
        # directory) and sort them so that a seeded run picks the same files.
        files = sorted(
            entry
            for entry in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, entry))
        )
        random.shuffle(files)
        split_count = int(len(files) * split_ratio)

        for file in files[:split_count]:
            shutil.copy(os.path.join(category_path, file), os.path.join(dest_category_path, file))

# Fill every client's folder: American files are copied first, then Chinese.
# NOTE(review): split_data reshuffles on every call, so the selections made
# for different clients are independent and may contain the same files.
for client, proportions in clients.items():
    client_path = os.path.join(output_path, client)
    for origin_path, origin_key in ((american_path, "American"), (chinese_path, "Chinese")):
        split_data(origin_path, client_path, proportions[origin_key])

print("Dataset split completed successfully!")


Dataset split completed successfully!


In [1]:
import os
import shutil
import random

# Source dataset root and the output folder for the per-round split
dataset_path = "D:/Major Project/Final Proper/Dataset"
output_path = "D:/Major Project/Final Proper/Split_Dataset_with_rounds"

# Paths of the two origin folders under the dataset root
american_path, chinese_path = (
    os.path.join(dataset_path, origin) for origin in ("American", "Chinese")
)

# For every client: share of a round's per-class quota taken from each origin
clients = dict(
    C1=dict(Chinese=0.6, American=0.4),
    C2=dict(Chinese=0.4, American=0.6),
)

# Names of the class sub-folders
categories = ["Drowsy", "NonDrowsy"]

# Round layout: images requested per class in one round, and number of rounds
per_class_size = 1000
rounds = 5

# Function to shuffle and split data
def prepare_shuffled_data(source_path):
    """Return a dict mapping each category to a shuffled list of its entries.

    The entry names come from ``os.listdir(source_path/<category>)`` for every
    name in the module-level ``categories`` list, and each list is shuffled
    once with the global ``random`` generator.
    """
    listing_by_category = {}
    for label in categories:
        names = os.listdir(os.path.join(source_path, label))
        random.shuffle(names)
        listing_by_category[label] = names
    return listing_by_category

# Build `rounds` round folders for every client, each holding one sub-folder
# per category filled from the Chinese and American lists.
for client, proportions in clients.items():
    print(f"\nProcessing {client}...")
    client_base_path = os.path.join(output_path, client)

    # Fresh shuffle of both origins for this client.
    # NOTE(review): reshuffling per client means the same image can be given
    # to more than one client - confirm that this overlap is acceptable.
    shuffled_american = prepare_shuffled_data(american_path)
    shuffled_chinese = prepare_shuffled_data(chinese_path)
    shuffled_by_origin = {"Chinese": shuffled_chinese, "American": shuffled_american}

    # Next unread position in every shuffled list, per category and origin
    index_tracker = {category: {"Chinese": 0, "American": 0} for category in categories}

    for round_num in range(1, rounds + 1):
        round_path = os.path.join(client_base_path, f"Round_{round_num}")
        os.makedirs(round_path, exist_ok=True)

        for category in categories:
            # Requested number of images per origin for this class and round
            split_chinese = int(per_class_size * proportions["Chinese"])
            split_american = int(per_class_size * proportions["American"])

            # Number of entries not yet handed out
            remaining_chinese = len(shuffled_chinese[category]) - index_tracker[category]["Chinese"]
            remaining_american = len(shuffled_american[category]) - index_tracker[category]["American"]

            # Never take more than what is still unread
            split_chinese = min(split_chinese, remaining_chinese)
            split_american = min(split_american, remaining_american)

            # Consecutive windows starting at the current read positions
            start_c = index_tracker[category]["Chinese"]
            end_c = start_c + split_chinese
            start_a = index_tracker[category]["American"]
            end_a = start_a + split_american

            round_chinese = shuffled_chinese[category][start_c:end_c]
            round_american = shuffled_american[category][start_a:end_a]

            # Advance the read positions past the windows just taken
            index_tracker[category]["Chinese"] = end_c
            index_tracker[category]["American"] = end_a

            # Both origins are copied into the same category folder
            dest_category_path = os.path.join(round_path, category)
            os.makedirs(dest_category_path, exist_ok=True)

            for file in round_chinese:
                shutil.copy(os.path.join(chinese_path, category, file), os.path.join(dest_category_path, file))

            for file in round_american:
                shutil.copy(os.path.join(american_path, category, file), os.path.join(dest_category_path, file))

            print(f"  Round {round_num}: {category} - {len(round_chinese)} Chinese, {len(round_american)} American")

        # Stop early as soon as ANY category/origin list is used up, not only
        # the "Drowsy" ones; otherwise later rounds would silently come out
        # short for the depleted class. After the final round there is nothing
        # left to stop, so no warning is printed then.
        exhausted = any(
            index_tracker[category][origin] >= len(shuffled_by_origin[origin][category])
            for category in categories
            for origin in ("Chinese", "American")
        )
        if exhausted and round_num < rounds:
            print(f"⚠️ WARNING: Not enough images left for further rounds after Round {round_num}. Stopping early!")
            break

# Report the configured round count instead of a hard-coded 5
print(f"✅ Dataset successfully split into {rounds} rounds with proper shuffling and correct proportions!")



Processing C1...
  Round 1: Drowsy - 600 Chinese, 400 American
  Round 1: NonDrowsy - 600 Chinese, 400 American
  Round 2: Drowsy - 600 Chinese, 400 American
  Round 2: NonDrowsy - 600 Chinese, 400 American
  Round 3: Drowsy - 600 Chinese, 400 American
  Round 3: NonDrowsy - 600 Chinese, 400 American
  Round 4: Drowsy - 600 Chinese, 400 American
  Round 4: NonDrowsy - 600 Chinese, 400 American
  Round 5: Drowsy - 600 Chinese, 400 American
  Round 5: NonDrowsy - 600 Chinese, 400 American

Processing C2...
  Round 1: Drowsy - 400 Chinese, 600 American
  Round 1: NonDrowsy - 400 Chinese, 600 American
  Round 2: Drowsy - 400 Chinese, 600 American
  Round 2: NonDrowsy - 400 Chinese, 600 American
  Round 3: Drowsy - 400 Chinese, 600 American
  Round 3: NonDrowsy - 400 Chinese, 600 American
  Round 4: Drowsy - 400 Chinese, 600 American
  Round 4: NonDrowsy - 400 Chinese, 600 American
  Round 5: Drowsy - 400 Chinese, 600 American
  Round 5: NonDrowsy - 400 Chinese, 600 American
✅ Dataset succ

In [1]:
import os
import shutil
import random

# Source dataset root and the output folder for the per-round split
dataset_path = "D:/Major Project/Final Proper/Dataset"
output_path = "D:/Major Project/Final Proper/Split_Dataset_with_rounds"

# Paths of the two origin folders under the dataset root
american_path, chinese_path = (
    os.path.join(dataset_path, origin) for origin in ("American", "Chinese")
)

# For every client: share of a round's per-class quota taken from each origin
clients = dict(
    C1=dict(Chinese=0.6, American=0.4),
    C2=dict(Chinese=0.4, American=0.6),
)

# Names of the class sub-folders
categories = ["Drowsy", "NonDrowsy"]

# Size budget, derived step by step from the overall image count
total_images = 5000
per_class_total = total_images // 2  # 2,500: cap on entries kept per category and origin
per_round = per_class_total // 5  # 500
per_category_per_round = per_round // 2  # 250: quota per class and round before the origin split

# Function to get a limited, shuffled subset of images
def get_limited_shuffled_data(source_path, limit=None):
    """Return, per category, a shuffled list of entry names capped at ``limit``.

    For every name in the module-level ``categories`` list, the entries of
    ``source_path/<category>`` are listed, shuffled with the global ``random``
    generator, and truncated to the first ``limit`` names.

    Args:
        source_path: Folder containing one sub-folder per category.
        limit: Maximum number of names kept per category. When omitted, the
            module-level ``per_class_total`` is used, which matches the
            original one-argument behaviour.

    Returns:
        Dict mapping each category name to its capped, shuffled list.

    Raises:
        ValueError: If the effective cap is negative (a negative slice bound
            would silently drop names from the end instead of capping).
    """
    cap = per_class_total if limit is None else limit
    if cap < 0:
        raise ValueError(f"limit must be non-negative, got {cap}")

    shuffled_data = {}
    for category in categories:
        category_path = os.path.join(source_path, category)
        files = os.listdir(category_path)
        random.shuffle(files)  # Shuffle before truncating so the kept subset is random
        shuffled_data[category] = files[:cap]
    return shuffled_data

# Build five round folders for every client
for client, proportions in clients.items():
    print(f"\nProcessing {client}...")
    client_base_path = os.path.join(output_path, client)

    # Fresh capped shuffle of each origin for this client (American first)
    shuffled_american = get_limited_shuffled_data(american_path)
    shuffled_chinese = get_limited_shuffled_data(chinese_path)

    # Read position per category and origin; every list starts at the front
    index_tracker = {category: {"Chinese": 0, "American": 0} for category in categories}

    for round_num in range(1, 6):
        round_path = os.path.join(client_base_path, f"Round_{round_num}")
        os.makedirs(round_path, exist_ok=True)

        for category in categories:
            cursor = index_tracker[category]

            # Part of the per-round class quota that each origin contributes
            split_chinese = int(per_category_per_round * proportions["Chinese"])
            split_american = int(per_category_per_round * proportions["American"])

            # Consecutive windows beginning at the current read positions
            first_c, first_a = cursor["Chinese"], cursor["American"]
            last_c, last_a = first_c + split_chinese, first_a + split_american
            round_chinese = shuffled_chinese[category][first_c:last_c]
            round_american = shuffled_american[category][first_a:last_a]

            # Move the read positions past the windows just taken
            cursor["Chinese"] = last_c
            cursor["American"] = last_a

            dest_category_path = os.path.join(round_path, category)
            os.makedirs(dest_category_path, exist_ok=True)

            # Chinese files are copied before American ones, into the same folder
            for origin_path, batch in ((chinese_path, round_chinese), (american_path, round_american)):
                for file in batch:
                    shutil.copy(os.path.join(origin_path, category, file), os.path.join(dest_category_path, file))

            print(f"  Round {round_num}: {category} - {len(round_chinese)} Chinese, {len(round_american)} American")

print("✅ Dataset successfully split into 5 rounds using only 5,000 images!")



Processing C1...
  Round 1: Drowsy - 400 Chinese, 100 American
  Round 1: NonDrowsy - 400 Chinese, 100 American
  Round 2: Drowsy - 400 Chinese, 100 American
  Round 2: NonDrowsy - 400 Chinese, 100 American
  Round 3: Drowsy - 400 Chinese, 100 American
  Round 3: NonDrowsy - 400 Chinese, 100 American
  Round 4: Drowsy - 400 Chinese, 100 American
  Round 4: NonDrowsy - 400 Chinese, 100 American
  Round 5: Drowsy - 400 Chinese, 100 American
  Round 5: NonDrowsy - 400 Chinese, 100 American

Processing C2...
  Round 1: Drowsy - 100 Chinese, 400 American
  Round 1: NonDrowsy - 100 Chinese, 400 American
  Round 2: Drowsy - 100 Chinese, 400 American
  Round 2: NonDrowsy - 100 Chinese, 400 American
  Round 3: Drowsy - 100 Chinese, 400 American
  Round 3: NonDrowsy - 100 Chinese, 400 American
  Round 4: Drowsy - 100 Chinese, 400 American
  Round 4: NonDrowsy - 100 Chinese, 400 American
  Round 5: Drowsy - 100 Chinese, 400 American
  Round 5: NonDrowsy - 100 Chinese, 400 American
✅ Dataset succ

In [2]:
import os
import shutil
import random

# Source dataset root, the output folder, and the test-set folder inside it
dataset_path = "D:/Major Project/Final Proper/Dataset"
output_path = "D:/Major Project/Final Proper/Split_Dataset_with_rounds_with_test"
test_output_path = os.path.join(output_path, "Test_Set")

# Paths of the two origin folders under the dataset root
american_path, chinese_path = (
    os.path.join(dataset_path, origin) for origin in ("American", "Chinese")
)

# For every client: share of a round's per-class quota taken from each origin
clients = dict(
    C1=dict(Chinese=0.6, American=0.4),
    C2=dict(Chinese=0.4, American=0.6),
)

# Names of the class sub-folders
categories = ["Drowsy", "NonDrowsy"]

# Overall budget, halved into a training part and a test part
total_images = 10000
train_size = total_images // 2  # 5,000
test_size = total_images // 2  # 5,000

# Each part halved again for the two classes
per_class_train = train_size // 2  # 2,500
per_class_test = test_size // 2  # 2,500

# Training budget divided over five rounds, then halved once more
per_round_train = per_class_train // 5  # 500
per_category_per_round = per_round_train // 2  # 250: quota per class and round before the origin split

# Function to get a limited, shuffled subset of images
def get_limited_shuffled_data(source_path, limit):
    """Return, per category, a shuffled list of entry names capped at ``limit``.

    Every name in the module-level ``categories`` list is looked up as a
    sub-folder of ``source_path``; its entries are shuffled with the global
    ``random`` generator and only the first ``limit`` names are kept.
    """
    def _shuffled_head(folder):
        # List, shuffle in place, then truncate
        names = os.listdir(folder)
        random.shuffle(names)
        return names[:limit]

    return {label: _shuffled_head(os.path.join(source_path, label)) for label in categories}

# Shuffle each origin once; the training rounds of all clients and the test
# set are taken from these same lists.
shuffled_american = get_limited_shuffled_data(american_path, total_images // 2)
shuffled_chinese = get_limited_shuffled_data(chinese_path, total_images // 2)

# The test files start at position `per_class_train` of every list. The slice
# is capped so that both origins together contribute at most `per_class_test`
# files per class; an open-ended slice would take every remaining entry and
# could yield twice the configured test-set size.
test_slice = slice(per_class_train, per_class_train + per_class_test // 2)
test_american = {category: shuffled_american[category][test_slice] for category in categories}
test_chinese = {category: shuffled_chinese[category][test_slice] for category in categories}

# Next unread training position per category and origin
index_tracker = {category: {"Chinese": 0, "American": 0} for category in categories}

# Build five training round folders for every client.
# index_tracker is created once before this loop, so each client continues
# reading the shuffled lists where the previous client stopped.
for client, proportions in clients.items():
    print(f"\nProcessing {client} (Training)...")
    client_base_path = os.path.join(output_path, client)

    for round_num in range(1, 6):
        round_path = os.path.join(client_base_path, f"Round_{round_num}")
        os.makedirs(round_path, exist_ok=True)

        for category in categories:
            positions = index_tracker[category]

            # Part of the per-round class quota that each origin contributes
            split_chinese = int(per_category_per_round * proportions["Chinese"])
            split_american = int(per_category_per_round * proportions["American"])

            # Window bounds, starting at the current read positions
            start_c, start_a = positions["Chinese"], positions["American"]
            end_c, end_a = start_c + split_chinese, start_a + split_american

            round_chinese = shuffled_chinese[category][start_c:end_c]
            round_american = shuffled_american[category][start_a:end_a]

            # Remember where the next window begins
            positions.update(Chinese=end_c, American=end_a)

            dest_category_path = os.path.join(round_path, category)
            os.makedirs(dest_category_path, exist_ok=True)

            # Chinese files are copied before American ones, into the same folder
            for origin_path, picked in ((chinese_path, round_chinese), (american_path, round_american)):
                for file in picked:
                    shutil.copy(os.path.join(origin_path, category, file), os.path.join(dest_category_path, file))

            print(f"  Round {round_num}: {category} - {len(round_chinese)} Chinese, {len(round_american)} American")

# Copy the held-out files of both origins into one test folder per category
print("\nProcessing Test Set...")
for category in categories:
    test_category_path = os.path.join(test_output_path, category)
    os.makedirs(test_category_path, exist_ok=True)

    # Chinese files first, then American; both keep their original file names.
    # NOTE(review): identical file names in the two origins would overwrite
    # each other here - confirm the names are unique across origins.
    for origin_path, held_out in ((chinese_path, test_chinese), (american_path, test_american)):
        for file in held_out[category]:
            shutil.copy(os.path.join(origin_path, category, file), os.path.join(test_category_path, file))

    print(f"  Test Set: {category} - {len(test_chinese[category])} Chinese, {len(test_american[category])} American")

print("✅ Dataset successfully split into 5 training rounds and 1 test set!")



Processing C1 (Training)...
  Round 1: Drowsy - 150 Chinese, 100 American
  Round 1: NonDrowsy - 150 Chinese, 100 American
  Round 2: Drowsy - 150 Chinese, 100 American
  Round 2: NonDrowsy - 150 Chinese, 100 American
  Round 3: Drowsy - 150 Chinese, 100 American
  Round 3: NonDrowsy - 150 Chinese, 100 American
  Round 4: Drowsy - 150 Chinese, 100 American
  Round 4: NonDrowsy - 150 Chinese, 100 American
  Round 5: Drowsy - 150 Chinese, 100 American
  Round 5: NonDrowsy - 150 Chinese, 100 American

Processing C2 (Training)...
  Round 1: Drowsy - 100 Chinese, 150 American
  Round 1: NonDrowsy - 100 Chinese, 150 American
  Round 2: Drowsy - 100 Chinese, 150 American
  Round 2: NonDrowsy - 100 Chinese, 150 American
  Round 3: Drowsy - 100 Chinese, 150 American
  Round 3: NonDrowsy - 100 Chinese, 150 American
  Round 4: Drowsy - 100 Chinese, 150 American
  Round 4: NonDrowsy - 100 Chinese, 150 American
  Round 5: Drowsy - 100 Chinese, 150 American
  Round 5: NonDrowsy - 100 Chinese, 150 A