<a href="https://colab.research.google.com/github/Rhicarde/CECS-574---ZKML/blob/main/ZKML_for_Privacy_Preservation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
pip install skl2onnx onnx onnxruntime



# Dataset
View dataset from kaggle [here](https://www.kaggle.com/datasets/ankushpanday2/heart-attack-risk-and-prediction-dataset-in-india/data).

The dataset used is representing medical and lifestyle risk factors that may lead to heart diseases. We will be using it to train a model that can ideally predict early signs of heart diseases based on the given factors.

In [55]:
# Downloading Dataset
import kagglehub

path = kagglehub.dataset_download("ankushpanday2/heart-attack-risk-and-prediction-dataset-in-india")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/ankushpanday2/heart-attack-risk-and-prediction-dataset-in-india/versions/1


In [56]:
import pandas as pd

df = pd.read_csv(path + '/heart_attack_prediction_india.csv')
df.head()

Unnamed: 0,Patient_ID,State_Name,Age,Gender,Diabetes,Hypertension,Obesity,Smoking,Alcohol_Consumption,Physical_Activity,...,Diastolic_BP,Air_Pollution_Exposure,Family_History,Stress_Level,Healthcare_Access,Heart_Attack_History,Emergency_Response_Time,Annual_Income,Health_Insurance,Heart_Attack_Risk
0,1,Rajasthan,42,Female,0,0,1,1,0,0,...,119,1,0,4,0,0,157,611025,0,0
1,2,Himachal Pradesh,26,Male,0,0,0,0,1,1,...,115,0,0,7,0,0,331,174527,0,0
2,3,Assam,78,Male,0,0,1,0,0,1,...,117,0,1,10,1,0,186,1760112,1,0
3,4,Odisha,58,Male,1,0,1,0,0,1,...,65,0,0,1,1,1,324,1398213,0,0
4,5,Karnataka,22,Male,0,0,0,0,0,1,...,109,0,0,9,0,0,209,97987,0,1


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Patient_ID               10000 non-null  int64 
 1   State_Name               10000 non-null  object
 2   Age                      10000 non-null  int64 
 3   Gender                   10000 non-null  object
 4   Diabetes                 10000 non-null  int64 
 5   Hypertension             10000 non-null  int64 
 6   Obesity                  10000 non-null  int64 
 7   Smoking                  10000 non-null  int64 
 8   Alcohol_Consumption      10000 non-null  int64 
 9   Physical_Activity        10000 non-null  int64 
 10  Diet_Score               10000 non-null  int64 
 11  Cholesterol_Level        10000 non-null  int64 
 12  Triglyceride_Level       10000 non-null  int64 
 13  LDL_Level                10000 non-null  int64 
 14  HDL_Level                10000 non-null

# Training Model

The model that is used is a simple logistic regression model from Sklearn and then converted to a Tensor Model. A logistic regression model estimates the probability of an event, in this case heart disease, occuring using the given data.

The model trained gets a prediction accuracy rate of 71.75%.

In [58]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Split Dataset
x = df.drop('Heart_Attack_Risk', axis=1)
y = df['Heart_Attack_Risk']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

ct = ColumnTransformer(
    transformers= [('onehot', OneHotEncoder(drop='first'), ['State_Name', 'Gender']), # Expands number of features thrugh one hot encoding (0, 1)
                   ('normal', StandardScaler(),
                    ['Diastolic_BP', 'Annual_Income', 'Emergency_Response_Time', 'Systolic_BP', 'Cholesterol_Level', 'Triglyceride_Level', 'LDL_Level', 'HDL_Level']) # Keeps feature count the same
                  ], remainder='passthrough') # Remaining columns kept as they are (no transformation)

x_train = ct.fit_transform(x_train)
x_test = ct.transform(x_test)

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Logistic Regression Model
model = LogisticRegression(max_iter=2000)

# Train Model
model.fit(x_train, y_train)

# Test Model Accuracy
y_pred = model.predict(x_test)
print(f'Accuracy Score: {accuracy_score(y_test, y_pred) * 100}%')

Accuracy Score: 71.75%


In [60]:
import torch
import torch.nn as nn


class LogisticRegression(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return torch.sigmoid(self.linear(x))

In [61]:
# Convert to Torch Model for EZKL

# Get sklearn model weights and bias
weights = model.coef_  # Shape: (1, n_features)
bias = model.intercept_  # Shape: (1,)

# Create Torch model
input_dim = x_train.shape[1]
torch_model = LogisticRegression(input_dim)

# Convert sklearn weights to PyTorch tensors and assign them
torch_model.linear.weight = nn.Parameter(torch.tensor(weights, dtype=torch.float32))
torch_model.linear.bias = nn.Parameter(torch.tensor(bias, dtype=torch.float32))

In [62]:
# Testing torch model to ensure same result

# Convert test data to tensor
x_test_torch = torch.tensor(x_test, dtype=torch.float32)
y__test_torch = torch.tensor(y_test.tolist(), dtype=torch.float32)

torch_model.eval()

with torch.no_grad():
  # Get model predictions and convert them to int (0 or 1 tells us no or yes to heart disease risk)
  y_pred_probs = torch_model(x_test_torch).squeeze()
  y_pred_labels = (y_pred_probs >= 0.5).int()  # Convert to binary labels

# Check accuracy
accuracy = accuracy_score(y_torch, y_pred_labels.numpy())  # Compare with true labels

print(f"PyTorch Accuracy: {accuracy * 100:.2f}%")

PyTorch Accuracy: 71.75%


# Implementing EZKL circuit

In [63]:
# check if notebook is in colab
try:
    import google.colab
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "ezkl"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "onnx"])

# rely on local installation of ezkl if the notebook is not in colab
except:
    pass

import os
import json
import ezkl

In [64]:
# Required Files for EZKL
model_path = os.path.join('network.onnx')
data_path = os.path.join('input.json')
cal_data_path = os.path.join('calibration.json')

In [80]:
# Create a random input
x = torch.tensor(x_test[0], dtype=torch.float32).unsqueeze(0)

In [82]:
# Convert model to ONNX

# Export the model
torch.onnx.export(torch_model,                                   # model being run
                  x,                                             # model input
                  model_path,                                    # where to save the model
                  export_params=True,                            # store the trained parameter weights inside the model file
                  opset_version=10,                              # the ONNX version to export the model to
                  do_constant_folding=True,                      # whether to execute constant folding for optimization
                  input_names = ['input'],                       # the model's input names
                  output_names = ['output'],                     # the model's output names
                  dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
                                'output' : {0 : 'batch_size'}})

data_array = ((x).detach().numpy()).reshape([-1]).tolist()

data = dict(input_data = [data_array])

# Serialize data into file:
json.dump(data, open(data_path, 'w'))

In [83]:
!RUST_LOG=trace
# TODO: Dictionary outputs
res = ezkl.gen_settings()
assert res == True

In [84]:
# use the test set to calibrate the circuit
cal_data = dict(input_data = x_test_torch.flatten().tolist())

# Serialize calibration data into file:
json.dump(data, open(cal_data_path, 'w'))

# Optimize for resources, we cap logrows at 12 to reduce setup and proving time, at the expense of accuracy
# You may want to increase the max logrows if accuracy is a concern
res = await ezkl.calibrate_settings(target = "resources", max_logrows = 50, scales = [14])


 <------------- Numerical Fidelity Report (input_scale: 14, param_scale: 14, scale_input_multiplier: 1) ------------->

+-------------+--------------+-------------+-------------+----------------+------------------+---------------+---------------+--------------------+--------------------+------------------------+
| mean_error  | median_error | max_error   | min_error   | mean_abs_error | median_abs_error | max_abs_error | min_abs_error | mean_squared_error | mean_percent_error | mean_abs_percent_error |
+-------------+--------------+-------------+-------------+----------------+------------------+---------------+---------------+--------------------+--------------------+------------------------+
| 0.025770724 | 0.025770724  | 0.025770724 | 0.025770724 | 0.025770724    | 0.025770724      | 0.025770724   | 0.025770724   | 0.0006641302       | 0.07327505         | 0.07327505             |
+-------------+--------------+-------------+-------------+----------------+------------------+---------

In [85]:
res = ezkl.compile_circuit()
assert res == True

In [86]:
res = await ezkl.get_srs()

In [87]:
res = ezkl.setup()
assert res == True

# Create Witness

In [88]:
# Generate the Witness for the proof

# Generate the witness file
witness_path = os.path.join('witness.json')

res = await ezkl.gen_witness()
assert os.path.isfile(witness_path)

In [101]:
# Generate the proof
proof_path = os.path.join('proof.json')

proof = ezkl.prove(proof_type="single", proof_path=proof_path)

print(proof)
assert os.path.isfile(proof_path)

{'instances': [['dc14000000000000000000000000000000000000000000000000000000000000']], 'proof': '0x018e6e0ff39eed01f064a5ba7ebe6d0d4c0bd99707266cd8ee02fffbeac9d4bb201916384a3087ed7dda5351cc71faf30933ec9f40b652741a48f4dce75bf0df2d3558e65a8fe97dde8c963719486055dcd21bfa0bfc5fe5d1ed026cac1549c31b81d91e07f313064390f25299e801daf13ea79bb14c2dafb82a625242957aa7193f3d76aae77c2990b61cba17e09802a393f52b224bda6811b72206508292e7224027e730a6e025ffab16e8380a28a1f5b4001f7e2c28596620bdac74f9d8a5066b88469d9b745c585400d1f6085caff21283665f70f44f64d2e0e559b2cee011b4b8a51106df795cf21c19ae43b5f3d9ce9ea49edc08f5bfcc1d224aba34f7235547a6d8368a6e3995894e676583d0cb5a1d72eacf17098885ae63a388c72225b47565f485ad5dbf43c9b23674d344634eb3bc7fd979650312e8b4b4369dba03ff72eee94a77f5c8bd273a02dbf729eb1f57a32174f0cf03ce7af6ac31d2772d5392311301b1994ded376a60cccfbc4e92a52a80ba82ce4ca33eadd58e6a6a00577c5dc7510b687fd03b40c8718ca1b8a1330daa170d56349c5a79de6d10f90593096b71407e34ff579ee7b86792ea873f58f24933247328a34c3e0d1f91180a0a3b

In [112]:
# Get model prediction on same data
i = 4

y_pred = torch_model(x_test_torch[i]).squeeze()

print(y_pred)
print(y_torch[i])

tensor(0.3102, grad_fn=<SqueezeBackward0>)
tensor(0.)


In [97]:
# Verify proof
res = ezkl.verify()

assert res == True
print("verified")

verified


# Create Verifier

In [91]:
# Check if notebook is in colab
try:
    import google.colab
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "solc-select"])
    !solc-select install 0.8.20
    !solc-select use 0.8.20
    !solc --version

# Rely on local installation if the notebook is not in colab
except:
    pass

Installing solc '0.8.20'...
Version '0.8.20' installed.
Switched global version to 0.8.20
solc, the solidity compiler commandline interface
Version: 0.8.20+commit.a1b79de6.Linux.g++


In [92]:
sol_code_path = os.path.join('Verifier.sol')
abi_path = os.path.join('Verifier.abi')

res = await ezkl.create_evm_verifier(
        sol_code_path=sol_code_path,
        abi_path=abi_path,
    )

assert res == True
assert os.path.isfile(sol_code_path)

In [93]:
onchain_input_array = []

formatted_output = "["
for i, value in enumerate(proof["instances"]):
    for j, field_element in enumerate(value):
        onchain_input_array.append(ezkl.felt_to_big_endian(field_element))
        formatted_output += '"' + str(onchain_input_array[-1]) + '"'
        if j != len(value) - 1:
            formatted_output += ", "
    if i != len(proof["instances"]) - 1:
        formatted_output += ", "
formatted_output += "]"

# This will be the values you use onchain
# Copy them over to remix and see if they verify
# What happens when you change a value?
print("pubInputs: ", formatted_output)
print("proof: ", proof["proof"])

pubInputs:  ["0x00000000000000000000000000000000000000000000000000000000000014dc"]
proof:  0x01897f2002d4145b401ca35c92e9c3c29fdf5db58748b6f0ec71186878289d6411073ea43dbb8ec3f21690870b3b41f32d4bca9aa50e0ebec785533d82f58f340a1a8852e0d10f8bbce5ea3425dd9955a0368862c350134fb471a147d0357d8328b0dbbee12cbf982c72301dd5d0afbb458e374f05784706e565de30a49fedf62dc2e4bab20f2a2f0dc1da08e1a38b4cfd9a2a27e213e548e48e535e5716521d191ed284a4757c8d43e97c94b3bf28c9a46abdbe07c089f832a6f95fadb070e1058caa451197a4f8e9a9e50755929d69a3b41eb3abdf9ad3ac60116b6369ead02ecee4e610ac95b9c0fc47e07de48a1d9f23a13461c98705930e1a2c880f44e3235c317b72acf1bbc287943874902a69e57792f5881273643a7360a8ba47884d27af32dce34637c7d202e68016385bc7d2f2a5a1885cf170dd7680a8c1a1d2822e7d18c14e37062cd61628b83d284f60143572247ab43c20a4a4ab52427338d02e71fec6fab39bc0f3cdbd488056c47fe8f906a9c6fd4bb3c77c067f9be48f7c00577c5dc7510b687fd03b40c8718ca1b8a1330daa170d56349c5a79de6d10f90593096b71407e34ff579ee7b86792ea873f58f24933247328a34c3e0d1f91180a0a3b13d17