Importing modules and visualization tools

In [7]:
# Core imports
import numpy as np
import pandas as pd
import json
import uuid
import time
from collections import Counter

# For ML tasks
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Optional: Plotting
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [8]:
folder_path = '/content/drive/My Drive/Capsule'


Class defining data capsule structure

In [9]:
# DataCapsule encapsulates sample-level features and metadata
class DataCapsule:
    def __init__(self, capsule_id, features, label,
                 confidence=None, rare=False, source_id=None, timestamp=None):
        self.id = capsule_id
        self.features = features  # numpy array
        self.label = label
        self.confidence = confidence
        self.rare = rare
        self.source_id = source_id
        self.timestamp = timestamp or time.time()

    def to_dict(self):
        return {
            "id": self.id,
            "type": "data",
            "features": self.features.tolist(),
            "label": self.label,
            "confidence": self.confidence,
            "rare": self.rare,
            "source_id": self.source_id,
            "timestamp": self.timestamp
        }

    @staticmethod
    def from_dict(data):
        return DataCapsule(
            capsule_id=data["id"],
            features=np.array(data["features"]),
            label=data["label"],
            confidence=data["confidence"],
            rare=data["rare"],
            source_id=data["source_id"],
            timestamp=data["timestamp"]
        )


Class defining model capsule structure

In [10]:
# ModelCapsule encapsulates model-level metadata, summaries, and relevant references
class ModelCapsule:
    def __init__(self, capsule_id, model_summary,
                 high_conf_data_ids=None, rare_data_ids=None,
                 source_id=None, timestamp=None):
        self.id = capsule_id
        self.model_summary = model_summary  # e.g. weights, accuracy, label dist
        self.high_conf_data_ids = high_conf_data_ids or []
        self.rare_data_ids = rare_data_ids or []
        self.source_id = source_id
        self.timestamp = timestamp or time.time()

    def to_dict(self):
        return {
            "id": self.id,
            "type": "model",
            "model_summary": self.model_summary,
            "high_conf_data_ids": self.high_conf_data_ids,
            "rare_data_ids": self.rare_data_ids,
            "source_id": self.source_id,
            "timestamp": self.timestamp
        }

    @staticmethod
    def from_dict(data):
        return ModelCapsule(
            capsule_id=data["id"],
            model_summary=data["model_summary"],
            high_conf_data_ids=data.get("high_conf_data_ids", []),
            rare_data_ids=data.get("rare_data_ids", []),
            source_id=data.get("source_id"),
            timestamp=data["timestamp"]
        )


Save & Load Utility

In [11]:
def save_capsule(capsule, folder_path):
    os.makedirs(folder_path, exist_ok=True)
    with open(os.path.join(folder_path, f"{capsule.id}.json"), "w") as f:
        json.dump(capsule.to_dict(), f)

def load_capsules(folder_path, capsule_type="data"):
    capsules = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            with open(os.path.join(folder_path, file_name), "r") as f:
                data = json.load(f)
                if capsule_type == "data" and data["type"] == "data":
                    capsules.append(DataCapsule.from_dict(data))
                elif capsule_type == "model" and data["type"] == "model":
                    capsules.append(ModelCapsule.from_dict(data))
    return capsules


Load dataset → Extract features (via PCA) → Create & store Data Capsules

In [12]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import numpy as np
import uuid
import time
import os
import json

# Step 1: Load dataset
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

# Step 2: Apply PCA for compressed representation
pca = PCA(n_components=10)
X_reduced = pca.fit_transform(X)

# Step 3: Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)


Generate & Save Data Capsules

In [13]:
data_capsules = []

for i in range(len(X_train)):
    capsule_id = str(uuid.uuid4())
    features = X_train[i]
    label = int(y_train[i])
    confidence = 1.0  # You can replace this later with predicted confidence scores

    capsule = DataCapsule(
        capsule_id=capsule_id,
        features=features,
        label=label,
        confidence=confidence,
        source_id="client_1"
    )

    data_capsules.append(capsule)
    save_capsule(capsule, folder_path)


Load saved Data Capsules → Reconstruct training dataset → Train a simple classifier

In [14]:
# Load capsules from disk
loaded_capsules = load_capsules(folder_path, capsule_type="data")

# Reconstruct features and labels
X_capsules = np.array([cap.features for cap in loaded_capsules])
y_capsules = np.array([cap.label for cap in loaded_capsules])

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train
clf = LogisticRegression(max_iter=1000)
clf.fit(X_capsules, y_capsules)

# Predict on test set (which we already PCA-compressed earlier)
y_pred = clf.predict(X_test)

# Evaluate
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy on held-out test set: {acc:.4f}")
print(classification_report(y_test, y_pred))


Accuracy on held-out test set: 0.9561
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        43
           1       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



Build Model Capsules

Define a ModelCapsule class

In [15]:
class ModelCapsule:
    def __init__(self,
                 model_id=None,
                 model_type="LogisticRegression",
                 accuracy=None,
                 capsule_ids=None,
                 high_conf_capsules=None,
                 rare_capsules=None,
                 weights=None,
                 intercept=None,
                 created_at=None):
        self.model_id = model_id or str(uuid.uuid4())
        self.model_type = model_type
        self.accuracy = accuracy
        self.capsule_ids = capsule_ids or []
        self.high_conf_capsules = high_conf_capsules or []
        self.rare_capsules = rare_capsules or []
        self.weights = weights  # list of floats
        self.intercept = intercept  # float
        self.created_at = created_at or datetime.datetime.utcnow().isoformat()

    def to_dict(self):
        return {
            "model_id": self.model_id,
            "model_type": self.model_type,
            "accuracy": self.accuracy,
            "capsule_ids": self.capsule_ids,
            "high_conf_capsules": self.high_conf_capsules,
            "rare_capsules": self.rare_capsules,
            "weights": self.weights,
            "intercept": self.intercept,
            "created_at": self.created_at
        }

    def save(self, folder_path):
        path = os.path.join(folder_path, f"{self.model_id}_model_capsule.json")
        with open(path, 'w') as f:
            json.dump(self.to_dict(), f, indent=4)
        print(f"Saved model capsule with weights to {path}")


Capture weights from your model

In [46]:
# Extract model weights and intercept
weights_list = clf.coef_.flatten().tolist()
intercept_val = float(clf.intercept_[0])



Instantiate a model capsule from current training

In [47]:
# Step 1: Determine high-confidence capsules
# We'll mark all correctly predicted samples as high-confidence for now

high_conf_ids = []
for i, (x_feat, y_true, capsule) in enumerate(zip(X_capsules, y_capsules, loaded_capsules)):
    y_pred = clf.predict([x_feat])[0]
    if y_pred == y_true:
        high_conf_ids.append(capsule.id)




Assign “rare = True” to selected capsules

Hybrid-Based Rarity (Simple and Practical)
Let's define rare classes as those that occur with low frequency (e.g., < 10% of all samples).

In [48]:
# Feature outlier score
from sklearn.covariance import MinCovDet
robust_cov = MinCovDet().fit(X_capsules)
mahalanobis_dist = robust_cov.mahalanobis(X_capsules)

# Confidence-based uncertainty
probs = clf.predict_proba(X_capsules)
max_conf = np.max(probs, axis=1)

# Frequency score
from collections import Counter
label_freq = Counter(cap.label for cap in loaded_capsules)
label_ratio = [label_freq[cap.label]/len(loaded_capsules) for cap in loaded_capsules]

# Mark rare
for cap, dist, conf, freq in zip(loaded_capsules, mahalanobis_dist, max_conf, label_ratio):
    cap.rare = (dist > np.percentile(mahalanobis_dist, 90)) or \
               (conf < 0.6) or \
               (freq < 0.1)



Extract rare capsule IDs

In [49]:
# Step 7.1: Extract rare capsule IDs from previously loaded capsules

rare_ids = [cap.id for cap in loaded_capsules if cap.rare]

# Print the number of rare capsules and a sample of their IDs
print(f"Number of rare capsules: {len(rare_ids)}")
print("Sample rare capsule IDs:", rare_ids[:5])


Number of rare capsules: 59
Sample rare capsule IDs: ['57c63db2-d90a-40ab-bd9d-b07396eeb7ea', '8b094014-ab80-44f0-a5e5-f86ac1b0be21', 'a37d4a5e-4364-4729-af6f-1963cd33d4b7', 'bd2690d0-df02-4f38-bc9e-ff9ecffe320b', 'a6cc3c6a-6a23-45dd-a982-211dd600fa73']


Create and save the Model Capsule with metadata(rarity term updated)

In [51]:
# Step 7.2: Create the model capsule and save it

model_capsule = ModelCapsule(
    model_type="LogisticRegression",
    accuracy=acc,
    capsule_ids=[cap.id for cap in loaded_capsules],
    high_conf_capsules=high_conf_ids,
    weights=weights_list,  # Changed from weights_list
    intercept=intercept_val, # Changed from intercept_val
    rare_capsules=rare_ids
)

# Save model capsule
model_capsule.save(folder_path)

Saved model capsule with weights to /content/drive/My Drive/Capsule/21a4f89e-d25d-4a54-9985-10346f735ce1_model_capsule.json


Client Capsule State Table (Context Tracker)

This component will be a dictionary or a small class per client storing:

client_id

capsule_ids_received (set of capsule UUIDs)

rare_labels_missing (labels the client has rarely seen — optional)

last_updated

maybe: history of model capsule scores

In [52]:
class ClientContextTable:
    def __init__(self, client_id=None):
        self.client_id = client_id or str(uuid.uuid4())
        self.received_capsule_ids = set()
        self.rare_labels = set()
        self.class_distribution = {}  # New
        self.low_confidence_ids = set()  # New
        self.last_updated = datetime.datetime.utcnow().isoformat()

    def update_class_distribution(self, label_counts):
        self.class_distribution = label_counts
        self.last_updated = datetime.datetime.utcnow().isoformat()

    def update_low_confidence_ids(self, low_conf_ids):
        self.low_confidence_ids.update(low_conf_ids)
        self.last_updated = datetime.datetime.utcnow().isoformat()

    def to_dict(self):
        return {
            "client_id": self.client_id,
            "received_capsule_ids": list(self.received_capsule_ids),
            "rare_labels": list(self.rare_labels),
            "class_distribution": self.class_distribution,
            "low_confidence_predictions": list(self.low_confidence_ids),
            "last_updated": self.last_updated
        }


Compute class distribution

In [53]:
from collections import Counter

# Compute class distribution from local capsules
label_counts = Counter([cap.label for cap in loaded_capsules])

# Convert to standard dictionary (if needed)
label_counts = dict(label_counts)

# Update in client context
client_ctx.update_class_distribution(label_counts)


Compute Low confidence and rare labels

In [54]:
low_conf_ids = []

for cap, x_feat, y_true in zip(loaded_capsules, X_capsules, y_capsules):
    y_pred = clf.predict([x_feat])[0]
    if y_pred != y_true:
        low_conf_ids.append(cap.id)

client_ctx.update_low_confidence_ids(low_conf_ids)

rare_labels = list(set([cap.label for cap in loaded_capsules if cap.rare]))
client_ctx.set_rare_labels(rare_labels)

folder_path = "/content/drive/My Drive/Capsule"
client_ctx.save(folder_path)



Saved context table to /content/drive/My Drive/Capsule/client_1_context.json


Optional Next Steps (Later Stages)
Once you have multiple clients or simulated sharing:

Use rare_labels to prioritize what the client needs

Use received_capsule_ids to avoid duplicate reception

Use class_distribution to choose capsules to send (diversity or class balance)

Use low_confidence_ids to refine local training