## Extracting the previously processed dataset

In [None]:
import zipfile
import pandas as pd

# Unpack the raw image archive stored on Drive into the Colab workspace.
with zipfile.ZipFile(
    "/content/drive/MyDrive/Student/Mohit/Dataset/Skin_Cancer_Malignant_vs_Benign/archive.zip",
    mode='r',
) as zip_ref:
    zip_ref.extractall(path="/content")


# Locations of the previously processed split CSVs.
train_load_path = '/content/drive/MyDrive/Student/Mohit/Dataset/Federated_Learning_Project_2/Processed_CSV/train_df.csv'
test_load_path = '/content/drive/MyDrive/Student/Mohit/Dataset/Federated_Learning_Project_2/Processed_CSV/test_df.csv'
val_load_path = '/content/drive/MyDrive/Student/Mohit/Dataset/Federated_Learning_Project_2/Processed_CSV/val_df.csv'

# Read the splits in the order train, test, validation.
train_df, test_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_load_path, test_load_path, val_load_path)
)

# Show each frame in turn, preceded by its heading.
for heading, frame in (
    ("Train DataFrame:", train_df),
    ("\nTest DataFrame:", test_df),
    ("\nValidation DataFrame:", val_df),
):
    print(heading)
    display(frame)

Train DataFrame:


Unnamed: 0,file_path,label
0,/content/data/train/benign/183.jpg,benign
1,/content/data/train/benign/852.jpg,benign
2,/content/data/train/benign/1515.jpg,benign
3,/content/data/train/benign/807.jpg,benign
4,/content/data/train/benign/1580.jpg,benign
...,...,...
2632,/content/data/train/malignant/685.jpg,malignant
2633,/content/data/train/malignant/1481.jpg,malignant
2634,/content/data/train/malignant/1247.jpg,malignant
2635,/content/data/train/malignant/550.jpg,malignant



Test DataFrame:


Unnamed: 0,file_path,label
0,/content/data/test/benign/878.jpg,benign
1,/content/data/test/malignant/1026.jpg,malignant
2,/content/data/test/malignant/1499.jpg,malignant
3,/content/data/test/benign/1350.jpg,benign
4,/content/data/test/benign/894.jpg,benign
...,...,...
325,/content/data/test/benign/626.jpg,benign
326,/content/data/test/benign/824.jpg,benign
327,/content/data/test/benign/1412.jpg,benign
328,/content/data/test/malignant/237.jpg,malignant



Validation DataFrame:


Unnamed: 0,file_path,label
0,/content/data/test/malignant/317.jpg,malignant
1,/content/data/test/malignant/1074.jpg,malignant
2,/content/data/test/benign/1655.jpg,benign
3,/content/data/test/malignant/706.jpg,malignant
4,/content/data/test/benign/1479.jpg,benign
...,...,...
325,/content/data/test/malignant/1333.jpg,malignant
326,/content/data/test/benign/317.jpg,benign
327,/content/data/test/benign/1208.jpg,benign
328,/content/data/test/malignant/1056.jpg,malignant


## Assign a "client_id" to each sample using stratified splitting

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Drive folder that holds the processed split CSVs
base_path = '/content/drive/MyDrive/Student/Mohit/Dataset/Federated_Learning_Project_2/Processed_CSV/'

# Build each split's CSV path from the shared folder prefix
train_path = f"{base_path}train_df.csv"
val_path = f"{base_path}val_df.csv"
test_path = f"{base_path}test_df.csv"

# Read the three splits (train, then validation, then test)
train_df, val_df, test_df = map(pd.read_csv, (train_path, val_path, test_path))

# -----------------------------
# ✅ Function to split & assign client IDs
# -----------------------------
def stratified_split_and_assign_client_id(df, num_clients=10, random_state=42, label_col='label'):
    """
    Partition a dataset into `num_clients` label-stratified parts and tag
    every row with the part it belongs to.

    Parameters
    ----------
    df : pandas.DataFrame
        Input samples. Must contain the column named by `label_col`.
        The input frame itself is not modified.
    num_clients : int, default 10
        Number of clients to create. With 1, every row is assigned to
        client 0; with 2 or more, StratifiedKFold produces the partition.
    random_state : int, default 42
        Seed used both for the row shuffle and for StratifiedKFold.
    label_col : str, default 'label'
        Column whose values are used for stratification.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of `df` with a fresh 0..n-1 index and an added
        integer `client_id` column holding values in 0..num_clients-1.

    Raises
    ------
    KeyError
        If `label_col` is not a column of `df`.
    ValueError
        If `num_clients` is smaller than 1 (StratifiedKFold may raise
        further ValueErrors, e.g. when there are fewer samples than clients).
    """
    if label_col not in df.columns:
        raise KeyError(f"Label column {label_col!r} not found in DataFrame")
    if num_clients < 1:
        raise ValueError(f"num_clients must be at least 1, got {num_clients}")

    # Shuffle into a new frame so the caller's DataFrame is left untouched.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    client_ids = np.zeros(len(shuffled), dtype=int)
    # StratifiedKFold rejects n_splits < 2, so a single client simply keeps
    # the all-zero assignment created above.
    if num_clients > 1:
        skf = StratifiedKFold(n_splits=num_clients, shuffle=True, random_state=random_state)
        # The held-out indices of each fold are disjoint and together cover
        # every row, so each row receives exactly one client id.
        for client_id, (_, idx) in enumerate(skf.split(shuffled, shuffled[label_col])):
            client_ids[idx] = client_id

    shuffled['client_id'] = client_ids
    return shuffled

# -----------------------------
# Apply stratified splitting
# -----------------------------
# Single source of truth for the client count, so the split calls and the
# summary message below cannot drift apart.
NUM_CLIENTS = 10

train_df_with_client = stratified_split_and_assign_client_id(train_df, num_clients=NUM_CLIENTS)
val_df_with_client = stratified_split_and_assign_client_id(val_df, num_clients=NUM_CLIENTS)
test_df_with_client = stratified_split_and_assign_client_id(test_df, num_clients=NUM_CLIENTS)

# -----------------------------
# Save new CSVs with client IDs
# -----------------------------
# index=False keeps the row index out of the file.
train_df_with_client.to_csv(base_path + 'train_df_with_client.csv', index=False)
val_df_with_client.to_csv(base_path + 'val_df_with_client.csv', index=False)
test_df_with_client.to_csv(base_path + 'test_df_with_client.csv', index=False)

print(f"✅ Stratified split complete — {NUM_CLIENTS} clients assigned for each dataset.")
print(f"Train samples: {len(train_df_with_client)} | Val: {len(val_df_with_client)} | Test: {len(test_df_with_client)}")

# -----------------------------
# Check distribution summary
# -----------------------------
for name, df in zip(["Train", "Validation", "Test"],
                    [train_df_with_client, val_df_with_client, test_df_with_client]):
    print(f"\n📊 {name} Set Client Distributions:")
    # groupby yields the client ids in ascending order, one sub-frame each.
    for cid, subset in df.groupby('client_id'):
        dist = subset['label'].value_counts().to_dict()
        total = len(subset)
        print(f"  Client {cid}: {dist} (Total={total})")


✅ Stratified split complete — 10 clients assigned for each dataset.
Train samples: 2637 | Val: 330 | Test: 330

📊 Train Set Client Distributions:
  Client 0: {'benign': 144, 'malignant': 120} (Total=264)
  Client 1: {'benign': 144, 'malignant': 120} (Total=264)
  Client 2: {'benign': 144, 'malignant': 120} (Total=264)
  Client 3: {'benign': 144, 'malignant': 120} (Total=264)
  Client 4: {'benign': 144, 'malignant': 120} (Total=264)
  Client 5: {'benign': 144, 'malignant': 120} (Total=264)
  Client 6: {'benign': 144, 'malignant': 120} (Total=264)
  Client 7: {'benign': 144, 'malignant': 119} (Total=263)
  Client 8: {'benign': 144, 'malignant': 119} (Total=263)
  Client 9: {'benign': 144, 'malignant': 119} (Total=263)

📊 Validation Set Client Distributions:
  Client 0: {'benign': 19, 'malignant': 14} (Total=33)
  Client 1: {'benign': 19, 'malignant': 14} (Total=33)
  Client 2: {'benign': 19, 'malignant': 14} (Total=33)
  Client 3: {'benign': 19, 'malignant': 14} (Total=33)
  Cl