In [168]:
import time
import numpy as np
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_openml
from pyope.ope import OPE, ValueRange
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.backends import default_backend
import os
from tqdm import tqdm
import matplotlib.pyplot as plt



Loading Dataset

In [169]:
start_total_time = time.perf_counter()  # Start total execution time

#Load Dataset Used for Testing and Training
start_dataset_load_time = time.perf_counter()

mnist = fetch_openml("mnist_784", version=1, as_frame=False)
X, y = mnist.data.astype("float32"), mnist.target.astype("int")



end_dataset_load_time = time.perf_counter()

dataset_load_time = end_dataset_load_time - start_dataset_load_time
print(f"Dataset Loading Time: {dataset_load_time:.4f} seconds")


Dataset Loading Time: 4.2906 seconds


Scaling Dataset

In [170]:
# Normalize pixel values to [0, 1]
X = X / 255.0

if X.max() <= 1:
    # Rescale dataset from original range to [0, 255]
    X = (X - X.min()) / (X.max() - X.min()) * 255
    X = (X * 10).astype(int)  # Scale to 0–2550
else:
    X = X.astype(int)

Dataset Splitting

In [171]:
# ✅ Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Change the number of samples to be encrypted for testing purposes (can be removed )
num_samples_training = len(X_train)
num_samples_testing = 100

X_test = X_test[:num_samples_testing]
X_train = X_train[:num_samples_training]

print(f"Number of training samples: {num_samples_training}")
print(f"Number of testing samples: {num_samples_testing}")

Number of training samples: 56000
Number of testing samples: 100


OPE Initialization and Setup

In [172]:
ope_key = b'some_secure_key'
scale_factor = 10
max_pixel_value = 255 * scale_factor  

ope = OPE(ope_key, ValueRange(0, max_pixel_value), ValueRange(0, 2**32))

#Function to encrypt the dataset using OPE
def encrypt_dataset_with_ope(X):
    return np.array([[ope.encrypt(int(val)) for val in row] for row in tqdm(X, desc="Encrypting Dataset")])




Testing Data Encryption using OPE

In [173]:
def load_or_encrypt_dataset(X, ope_key):


    # Convert key to a safe, readable string
    key_name = ope_key.decode("utf-8")
    encrypted_file_path = f"X_test_encrypted_{key_name}.npy"

    if os.path.exists(encrypted_file_path):
        print(f"🔁 Loaded cached encrypted data for key '{key_name}'")
        return np.load(encrypted_file_path), 0

    print(f"Encrypting data using key '{key_name}' and saving to disk...")
    start_time = time.perf_counter()
    encrypted_X = np.array([
        [ope.encrypt(int(val)) for val in row]
        for row in X
    ])
    end_time = time.perf_counter()

    np.save(encrypted_file_path, encrypted_X)
    print(f"✅ Encrypted data saved as '{encrypted_file_path}'")
    return encrypted_X, end_time - start_time


In [174]:
# Encrypt Test Data Using OPE

start_test_data_encryption_time = time.perf_counter()

X_test_encrypted, test_data_encryption_time = load_or_encrypt_dataset(X_test, ope_key)

end_test_data_encryption_time = time.perf_counter()

if test_data_encryption_time == 0:
    test_data_encryption_time = end_test_data_encryption_time - start_test_data_encryption_time

print(f"Dataset Encryption Time: {test_data_encryption_time:.4f} seconds")

🔁 Loaded cached encrypted data for key 'some_secure_key'
Dataset Encryption Time: 0.0034 seconds


Random Forest Initialization & Training

In [185]:
#Number of Decision Trees in Random Forest
num_estimators = 3

start_training_time = time.perf_counter()

clf_ope = RandomForestClassifier(n_estimators=num_estimators, max_depth=20, random_state=42, min_samples_split=2)
clf_ope.fit(X_train, y_train)

end_training_time = time.perf_counter()
training_time = end_training_time - start_training_time

print(f"Random Forest Training Time: {training_time:.4f} seconds")

Random Forest Training Time: 1.1765 seconds


AES Function Definition and Label Encryption (Using ECB as the mode of operation)

In [186]:

aes_key = os.urandom(32)  # Use this key for encryption/decryption

# ✅ AES Encrypt Function
def aes_encrypt(data, key):
    cipher = Cipher(algorithms.AES(key), modes.ECB(), backend=default_backend())
    encryptor = cipher.encryptor()
    padded_data = data.ljust(16)
    ciphertext = encryptor.update(padded_data.encode()) + encryptor.finalize()
    return ciphertext

# ✅ AES Decrypt Function
def aes_decrypt(ciphertext, key):
    cipher = Cipher(algorithms.AES(key), modes.ECB(), backend=default_backend())
    decryptor = cipher.decryptor()
    return decryptor.update(ciphertext).decode().strip()

start_label_encryption_time = time.perf_counter()

encrypted_labels = {
    label: aes_encrypt(str(label), aes_key)
    for label in tqdm(np.unique(y_train), desc="Encrypting Labels")
}

end_label_encryption_time = time.perf_counter()
label_encryption_time = end_label_encryption_time - start_label_encryption_time

print(f"Number of Encrypted Labels: {len(encrypted_labels)}")
print(f"AES Label Encryption Time: {label_encryption_time:.4f} seconds")

Encrypting Labels: 100%|██████████| 10/10 [00:00<00:00, 7710.12it/s]

Number of Encrypted Labels: 10
AES Label Encryption Time: 0.0054 seconds





Thresholds Encryption

In [187]:
start_threshold_encryption_time = time.perf_counter()

encrypted_thresholds = []

for tree in tqdm(clf_ope.estimators_, desc="Encrypting Thresholds Per Tree"):
    tree_thresholds = []
    for th in tree.tree_.threshold:
        if th != -2:
            tree_thresholds.append(ope.encrypt(int(th)))
        else:
            tree_thresholds.append(None)
    encrypted_thresholds.append(tree_thresholds)

end_threshold_encryption_time = time.perf_counter()
threshold_encryption_time = end_threshold_encryption_time - start_threshold_encryption_time

print(f"Threshold Encryption Time: {threshold_encryption_time:.4f} seconds")

Encrypting Thresholds Per Tree: 100%|██████████| 3/3 [00:18<00:00,  6.01s/it]

Threshold Encryption Time: 18.0365 seconds





Leaf Node Encryption

In [188]:
# Encrypt Leaf Node Labels Per Tree Using AES

start_leaf_encryption_time = time.perf_counter()

encrypted_leaf_values = []

for i, tree in enumerate(clf_ope.estimators_):
    tree_leaf_map = {}
    for node in range(tree.tree_.node_count):
        if tree.tree_.feature[node] == -2:  # It's a leaf
            label = str(tree.tree_.value[node].argmax())
            tree_leaf_map[node] = aes_encrypt(label, aes_key)
    encrypted_leaf_values.append(tree_leaf_map)

end_leaf_encryption_time = time.perf_counter()
leaf_encryption_time = end_leaf_encryption_time - start_leaf_encryption_time

print(f"Leaf Node Encryption Time: {leaf_encryption_time:.4f} seconds")


Leaf Node Encryption Time: 0.2945 seconds


Dataset Encryption Functions

In [189]:
# Function to Encrypt an Image with OPE
def encrypt_image(image, ope_key):
    encrypted_image = [ope_key.encrypt(int(pixel)) for pixel in image]
    return encrypted_image

# Cleaned Function to Encrypt the Entire Dataset
def encrypt_dataset(X, ope_key):
    encrypted_X = []
    for idx, image in enumerate(X, start=1):
        start_time = time.time()
        encrypted_image = encrypt_image(image, ope_key)  # No scaling needed
        encryption_time = time.time() - start_time
        print(f"{idx}: Image Encryption Time: {encryption_time:.4f} sec")
        encrypted_X.append(encrypted_image)

    print(f"Number of encrypted images: {len(encrypted_X)}")
    return np.array(encrypted_X)

In [190]:
def secure_classify(model, encrypted_X, encrypted_thresholds, encrypted_leaf_values, aes_key):
    votes = []

    for tree_idx, tree in enumerate(model.estimators_):
        node = 0
        tree_thresholds = encrypted_thresholds[tree_idx]

        while tree.tree_.feature[node] != -2:
            feature_idx = tree.tree_.feature[node]
            encrypted_threshold = tree_thresholds[node]

            if encrypted_X[feature_idx] < encrypted_threshold:
                node = tree.tree_.children_left[node]
            else:
                node = tree.tree_.children_right[node]
                
        encrypted_value = encrypted_leaf_values[tree_idx][node]
        decrypted_value = aes_decrypt(encrypted_value, aes_key)
        votes.append(int(decrypted_value))

    return Counter(votes).most_common(1)[0][0]


In [191]:
# ✅ Function to Perform Secure Classification on All Test Images
def secure_classify_dataset(model, X_encrypted, encrypted_thresholds, encrypted_leaf_values, aes_key):
    return np.array([secure_classify(model, encrypted_X, encrypted_thresholds, encrypted_leaf_values, aes_key) for encrypted_X in X_encrypted])


In [192]:
# ✅ Measure time taken for classification
start_time = time.time()
print("Performing Secure Classification...")
y_pred_encrypted = secure_classify_dataset(clf_ope, X_test_encrypted, encrypted_thresholds, encrypted_leaf_values, aes_key)
classification_time = time.time() - start_time
print(f"Secure Classification Time: {classification_time:.4f} seconds")


Performing Secure Classification...
Secure Classification Time: 0.0229 seconds


In [193]:
# ✅ Compute Accuracy of Secure Inference
secure_accuracy = accuracy_score(y_test[:num_samples_testing], y_pred_encrypted)

# ✅ Print Secure Classification Results
print(f"Secure Random Forest Accuracy on Encrypted Dataset: {secure_accuracy:.4f}")

Secure Random Forest Accuracy on Encrypted Dataset: 0.8800


In [194]:
# Ensure all timer variables are defined to avoid NameError
if 'start_threshold_encryption_time' not in globals():
    start_threshold_encryption_time = end_threshold_encryption_time = time.perf_counter()

if 'start_label_encryption_time' not in globals():
    start_label_encryption_time = end_label_encryption_time = time.perf_counter()

if 'start_dataset_encryption_time' not in globals():
    start_dataset_encryption_time = end_dataset_encryption_time = time.perf_counter()

if 'start_rf_training_time' not in globals():
    start_rf_training_time = end_rf_training_time = time.perf_counter()

end_total_time = time.perf_counter()  # End total execution time

total_time = end_total_time - start_total_time
dataset_encryption_time = end_dataset_encryption_time - start_dataset_encryption_time
rf_training_time = end_rf_training_time - start_rf_training_time
threshold_encryption_time = end_threshold_encryption_time - start_threshold_encryption_time


# Recompute actual total from all components to ensure percentages are meaningful
effective_total_time = (
    dataset_load_time +
    test_data_encryption_time +
    training_time +
    threshold_encryption_time +
    classification_time
)

dataset_load_percentage = (dataset_load_time / effective_total_time) * 100
test_data_encryption_percentage = (test_data_encryption_time / effective_total_time) * 100
rf_training_percentage = (training_time / effective_total_time) * 100
threshold_encryption_percentage = (threshold_encryption_time / effective_total_time) * 100
classification_percentage = (classification_time / effective_total_time) * 100




print("\n===== Execution Time Summary =====")
print(f"Total Execution Time: {effective_total_time:.4f} seconds")
print(f"Dataset Load Time: {dataset_load_time:.4f} seconds ({dataset_load_percentage:.2f}%)")
print(f"Test Data Encryption Time: {test_data_encryption_time:.4f} seconds ({test_data_encryption_percentage:.2f}%)")
print(f"Random Forest Training Time: {training_time:.4f} seconds ({rf_training_percentage:.2f}%)")
print(f"Threshold Encryption Time: {threshold_encryption_time:.4f} seconds ({threshold_encryption_percentage:.2f}%)")
print(f"Secure Classification Time: {classification_time:.4f} seconds ({classification_percentage:.2f}%)")

print("\n===== Secure Classification Results =====")
print(f"Secure Random Forest Accuracy on Encrypted MNIST: {secure_accuracy:.4f}")
print(f"Number of Decision Trees (num_estimators): {num_estimators}")
print(f"Number of Images Used for Training: {len(X_train)}")
print(f"Number of Images Used for Testing: {len(X_test)}")

# Combine encryption and classification times
total_throughput_time = test_data_encryption_time + classification_time
encryption_percentage_throughput = (test_data_encryption_time / total_throughput_time) * 100
classification_percentage_througput = (classification_time / total_throughput_time) * 100

print("\n===== Throughput =====")
throughput = len(X_test) / total_throughput_time
print(f"Total Throughput Time: {total_throughput_time:.4f} seconds")
print(f"Throughput: {throughput:.2f} samples/second")
print(f"Percentage of Test Data Encryption Time vs Throughput: {encryption_percentage_throughput:.2f}%")
print(f"Percentage of Classification Time vs Throughput: {classification_percentage_througput:.2f}%")



===== Execution Time Summary =====
Total Execution Time: 23.5298 seconds
Dataset Load Time: 4.2906 seconds (18.23%)
Test Data Encryption Time: 0.0034 seconds (0.01%)
Random Forest Training Time: 1.1765 seconds (5.00%)
Threshold Encryption Time: 18.0365 seconds (76.65%)
Secure Classification Time: 0.0229 seconds (0.10%)

===== Secure Classification Results =====
Secure Random Forest Accuracy on Encrypted MNIST: 0.8800
Number of Decision Trees (num_estimators): 3
Number of Images Used for Training: 56000
Number of Images Used for Testing: 100

===== Throughput =====
Total Throughput Time: 0.0263 seconds
Throughput: 3808.62 samples/second
Percentage of Test Data Encryption Time vs Throughput: 12.93%
Percentage of Classification Time vs Throughput: 87.07%
