In [None]:
from pathlib import Path

import numpy as np

# Resolve the dataset directory relative to the project root
PROJECT_PATH = Path("..")  # the scripts directory sits one level below the project root
DATASET_PATH = PROJECT_PATH.joinpath("dataset", "marketing", "processed")

# Report the resolved location and whether it is present on disk
print(
    f"Dataset path: {DATASET_PATH}",
    f"Dataset path exists: {DATASET_PATH.exists()}",
    sep="\n",
)


def load_data_partition(client_id, data_type):
    """Return the training arrays stored for one client partition.

    Args:
        client_id: Client identifier; converted with ``str`` to form the
            directory name under ``DATASET_PATH``.
        data_type: Sub-directory to read from (this notebook uses "mock"
            and "private").

    Returns:
        Tuple ``(X, y)`` loaded from ``X_train.npy`` and ``y_train.npy``.
    """
    partition_dir = DATASET_PATH.joinpath(str(client_id), data_type)
    features, labels = (
        np.load(partition_dir / filename)
        for filename in ("X_train.npy", "y_train.npy")
    )
    return features, labels


def load_server_test():
    """Return the server-side evaluation arrays.

    Returns:
        Tuple ``(X, y)`` loaded from ``X_test.npy`` and ``y_test.npy`` in the
        ``server_test`` directory under ``DATASET_PATH``.
    """
    test_dir = DATASET_PATH.joinpath("server_test")
    features, labels = (
        np.load(test_dir / filename) for filename in ("X_test.npy", "y_test.npy")
    )
    return features, labels


# Read every partition into memory; the names below are used by later cells
print("Loading all data partitions...")

# Client 0: mock sample and private data
X0_mock, y0_mock = load_data_partition(client_id=0, data_type="mock")
X0_private, y0_private = load_data_partition(client_id=0, data_type="private")

# Client 1: mock sample and private data
X1_mock, y1_mock = load_data_partition(client_id=1, data_type="mock")
X1_private, y1_private = load_data_partition(client_id=1, data_type="private")

# Evaluation set held by the server
X_test, y_test = load_server_test()

print("✅ All data loaded successfully!")

# Bank Marketing Dataset Exploration

This notebook explores the processed bank marketing dataset to verify the structure and content of:
- Client 0: mock vs private data
- Client 1: mock vs private data  
- Server test data

## Dataset Structure
```
dataset/marketing/processed/
├── 0/                                  # first data owner
│   ├── mock/ (10 rows sample)
│   └── private/ (full client 0 data)
├── 1/                                  # second data owner
│   ├── mock/ (10 rows sample)
│   └── private/ (full client 1 data)
└── server_test/                        # (test set for evaluation - aggregation server / data scientist)
```


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt

# Resolve the dataset directory relative to the project root
PROJECT_PATH = Path("..")  # the scripts directory sits one level below the project root
DATASET_PATH = PROJECT_PATH.joinpath("dataset", "marketing", "processed")

# Report the resolved location and whether it is present on disk
print(
    f"Dataset path: {DATASET_PATH}",
    f"Dataset path exists: {DATASET_PATH.exists()}",
    sep="\n",
)

## Load All Data Files


In [None]:
def load_data_partition(client_id, data_type):
    """Return the training arrays stored for one client partition.

    Args:
        client_id: Client identifier; converted with ``str`` to form the
            directory name under ``DATASET_PATH``.
        data_type: Sub-directory to read from (this notebook uses "mock"
            and "private").

    Returns:
        Tuple ``(X, y)`` loaded from ``X_train.npy`` and ``y_train.npy``.
    """
    partition_dir = DATASET_PATH.joinpath(str(client_id), data_type)
    features, labels = (
        np.load(partition_dir / filename)
        for filename in ("X_train.npy", "y_train.npy")
    )
    return features, labels


def load_server_test():
    """Return the server-side evaluation arrays.

    Returns:
        Tuple ``(X, y)`` loaded from ``X_test.npy`` and ``y_test.npy`` in the
        ``server_test`` directory under ``DATASET_PATH``.
    """
    test_dir = DATASET_PATH.joinpath("server_test")
    features, labels = (
        np.load(test_dir / filename) for filename in ("X_test.npy", "y_test.npy")
    )
    return features, labels


# Read every partition into memory; the names below are used by later cells
print("Loading all data partitions...")

# Client 0: mock sample and private data
X0_mock, y0_mock = load_data_partition(client_id=0, data_type="mock")
X0_private, y0_private = load_data_partition(client_id=0, data_type="private")

# Client 1: mock sample and private data
X1_mock, y1_mock = load_data_partition(client_id=1, data_type="mock")
X1_private, y1_private = load_data_partition(client_id=1, data_type="private")

# Evaluation set held by the server
X_test, y_test = load_server_test()

print("✅ All data loaded successfully!")

## Data Shape Verification

In [None]:
def print_data_info(name, X, y):
    """Print a short summary of one dataset split.

    Args:
        name: Heading printed above the summary.
        X: Feature array with at least two dimensions (``X.shape[1]`` is
            reported as the number of features).
        y: Label array; values are cast to ``int`` for counting. Both 1-D
            arrays and column vectors are accepted.

    The label distribution always lists a count for class 0 and class 1,
    even when the split contains only one of them.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Flatten so column-vector labels (n, 1) are counted like 1-D labels;
    # minlength=2 keeps both classes visible when a small split (e.g. a
    # 10-row mock sample) happens to contain a single class.
    label_counts = np.bincount(y.astype(int).ravel(), minlength=2)

    # Avoid the mean-of-empty RuntimeWarning; an empty split has no rate.
    positive_rate = y.mean() if y.size else float("nan")

    print(f"\n{name}:")
    print(f"  Features shape: {X.shape}")
    print(f"  Labels shape: {y.shape}")
    print(f"  Number of samples: {len(X)}")
    print(f"  Number of features: {X.shape[1]}")
    print(f"  Label distribution: {label_counts} (0s and 1s)")
    print(f"  Positive rate: {positive_rate:.3f}")


# Summarise every split under a single banner
overview_rule = "=" * 50
print(overview_rule)
print("DATASET OVERVIEW")
print(overview_rule)

for split_name, split_X, split_y in (
    ("Client 0 - Mock", X0_mock, y0_mock),
    ("Client 0 - Private", X0_private, y0_private),
    ("Client 1 - Mock", X1_mock, y1_mock),
    ("Client 1 - Private", X1_private, y1_private),
    ("Server Test", X_test, y_test),
):
    print_data_info(split_name, split_X, split_y)

## Verify Mock Data Matches the First 10 Rows of Private Data

In [None]:
def verify_mock_is_subset(client_id, X_mock, y_mock, X_private, y_private, n_rows=10):
    """Check that the mock data equals the leading rows of the private data.

    Args:
        client_id: Identifier used only in the printed heading.
        X_mock: Mock feature array.
        y_mock: Mock label array.
        X_private: Private feature array.
        y_private: Private label array.
        n_rows: Number of leading private rows the mock data must equal.
            Defaults to 10, the mock sample size used by this notebook.

    Returns:
        True if both features and labels match exactly, otherwise False.
    """
    print(f"\nClient {client_id} - Mock vs Private Verification:")

    # np.array_equal is False when shapes differ, so a mock sample with a
    # different number of rows is reported as a mismatch rather than an error.
    X_match = np.array_equal(X_mock, X_private[:n_rows])
    y_match = np.array_equal(y_mock, y_private[:n_rows])

    def _marker(ok):
        # The marker reflects the actual outcome of each individual check.
        return "✅" if ok else "❌"

    print(f"  {_marker(X_match)} Features match first {n_rows} rows: {X_match}")
    print(f"  {_marker(y_match)} Labels match first {n_rows} rows: {y_match}")

    is_valid = X_match and y_match
    if is_valid:
        print(f"  🎉 Mock data is correctly the first {n_rows} rows of private data!")
    else:
        print(f"  ❌ Mock data does NOT match first {n_rows} rows of private data!")

    return is_valid


# Run the check for each client; the results are reused in the final report
client0_valid = verify_mock_is_subset(0, X0_mock, y0_mock, X0_private, y0_private)
client1_valid = verify_mock_is_subset(1, X1_mock, y1_mock, X1_private, y1_private)

# Combined outcome, framed by a 50-character rule
overall_label = "✅ PASSED" if client0_valid and client1_valid else "❌ FAILED"
verification_rule = "=" * 50
print(f"\n{verification_rule}")
print(f"OVERALL VALIDATION: {overall_label}")
print(f"{verification_rule}")

## Feature Analysis - Compare Client Feature Sets


In [None]:
# Compare the column lists configured in prepare_data.py with the number of
# feature columns actually present in each client's loaded array.

from prepare_data import BANK_COLS, MARKETING_COLS

print("Feature Set Analysis:")

for owner_id, owner_label, owner_cols, owner_X in (
    (0, "Bank", BANK_COLS, X0_private),
    (1, "Marketing", MARKETING_COLS, X1_private),
):
    print(f"\nClient {owner_id} ({owner_label} features): {len(owner_cols)} features")
    print(f"  Features: {owner_cols}")
    print(f"  Actual feature count in data: {owner_X.shape[1]}")

configured_total = len(BANK_COLS) + len(MARKETING_COLS)
print(f"\nServer Test (All features): {X_test.shape[1]} features")
print(f"  Expected: {configured_total} features (after encoding)")

# Note: the actual feature counts will be higher than the configured column
# counts due to one-hot encoding of categorical variables.
# NOTE(review): the "Expected" figure above is the sum of the configured
# column-list lengths, so it presumably reflects the pre-encoding count -
# confirm against prepare_data.py.

## Data Distribution Analysis


In [None]:
# Bar charts comparing the label distribution of every split
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle("Label Distribution Across Datasets", fontsize=16)

datasets = [
    ("Client 0 Mock", y0_mock),
    ("Client 0 Private", y0_private),
    ("Client 1 Mock", y1_mock),
    ("Client 1 Private", y1_private),
    ("Server Test", y_test),
]
class_labels = ["No (0)", "Yes (1)"]

# axes.flat walks the 2x3 grid row by row, matching the order of `datasets`
for ax, (name, y_data) in zip(axes.flat, datasets):
    # minlength=2 guarantees one count per class label. Without it a split
    # containing only class 0 (plausible for a 10-row mock sample) yields a
    # single count and bar() fails because it receives two labels.
    counts = np.bincount(np.asarray(y_data).astype(int).ravel(), minlength=2)

    ax.bar(class_labels, counts, color=["lightcoral", "lightblue"])
    ax.set_title(f"{name}\n(n={len(y_data)})")
    ax.set_ylabel("Count")

    # Annotate each bar with its share of the split
    total = len(y_data)
    for i, count in enumerate(counts):
        # Guard against an empty split so the annotation never divides by zero
        percentage = count / total * 100 if total else 0.0
        ax.text(
            i, count + total * 0.01, f"{percentage:.1f}%", ha="center", va="bottom"
        )

# Hide any grid cell that has no dataset to show
for unused_ax in axes.flat[len(datasets):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## Summary and Validation Report


In [None]:
report_rule = "=" * 60
print(report_rule)
print("FINAL VALIDATION REPORT")
print(report_rule)

# Evaluate each criterion; every result keeps its own name for later reuse.

# 1. Both mock samples contain exactly 10 rows
mock_size_valid = (len(X0_mock) == 10) and (len(X1_mock) == 10)

# 2. Both mock samples equal the first 10 rows of the private data
mock_subset_valid = client0_valid and client1_valid

# 3. The two clients hold a different number of feature columns
different_features = X0_private.shape[1] != X1_private.shape[1]

# 4. The server test set is non-empty
test_data_valid = len(X_test) > 0 and len(y_test) > 0

# 5. Every label array contains only the values 0 and 1
label_format_valid = all(
    set(np.unique(label_array)) <= {0, 1}
    for label_array in (y0_mock, y0_private, y1_mock, y1_private, y_test)
)

# (description, outcome) pairs in report order
validations = [
    ("Mock data has 10 samples", mock_size_valid),
    ("Mock data is subset of private data", mock_subset_valid),
    ("Clients have different feature sets", different_features),
    ("Test data exists", test_data_valid),
    ("Labels are binary (0/1)", label_format_valid),
]

# One dot-padded line per check
for check, is_valid in validations:
    print(f"{check:.<50} {'✅ PASS' if is_valid else '❌ FAIL'}")

all_valid = all(outcome for _description, outcome in validations)

print(report_rule)
if all_valid:
    final_status = "✅ ALL VALIDATIONS PASSED"
else:
    final_status = "❌ SOME VALIDATIONS FAILED"
print(f"OVERALL STATUS: {final_status}")
print(report_rule)

if all_valid:
    print("\n🎉 Dataset structure is correct and ready for federated learning!")
else:
    print(
        "\n⚠️ Please review the failed validations and fix the data preparation script."
    )