In [131]:
import pandas as pd
import numpy as np
import random
from math import gcd
from sympy import lcm, mod_inverse

In [132]:
# Load the dataset
df = pd.read_csv("heart_disease_uci.csv")

In [133]:
# Data Pre-processing
# Step 1: Inspecting the dataset
print("Dataset Head:\n", df.head())

Dataset Head:
    id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal 

In [134]:
# Calculate the percentage of missing values for each column
missing_percentage = df.isnull().mean() * 100
print("Summary of Missing Values (in percentage):\n", missing_percentage)

Summary of Missing Values (in percentage):
 id           0.000000
age          0.000000
sex          0.000000
dataset      0.000000
cp           0.000000
trestbps     6.413043
chol         3.260870
fbs          9.782609
restecg      0.217391
thalch       5.978261
exang        5.978261
oldpeak      6.739130
slope       33.586957
ca          66.413043
thal        52.826087
num          0.000000
dtype: float64


In [135]:
# Print the data types of each column
print("Data Types of Each Column:\n", df.dtypes)

Data Types of Each Column:
 id            int64
age           int64
sex          object
dataset      object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object


In [136]:
# Handling Missing Values
# Fill missing numeric values with the median and categorical with the mode.
for column in df.columns:
    # Check if the column is numeric
    if df[column].dtype != 'object':
        df[column].fillna(df[column].median(), inplace=True)
    # Categorical data
    else:
        df[column].fillna(df[column].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)
  df[column].fillna(df[column].mode()[0], inplace=True)


In [137]:
# Confirm that there are no more missing values
print("Summary of Missing Values After Imputation:\n", df.isnull().sum())

Summary of Missing Values After Imputation:
 id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


In [138]:
# Implement Label Encoding manually
def label_encode_column(df, col):
    # Create a dictionary mapping each unique category to an integer
    unique_values = df[col].unique()
    mapping = {category: index for index, category in enumerate(unique_values)}

    # Replace each category in the column with its corresponding integer
    df[col] = df[col].map(mapping)
    return df

In [139]:
# Encode Categorical Variables using a simple mapping for label encoding on ordinal data and one-hot encoding on nominal data.
# Identify object-type columns (categorical columns)
categorical_columns = df.select_dtypes(include=[object, bool]).columns

# Check the number of unique values in each categorical column
for col in categorical_columns:
    unique_values = df[col].nunique()
    print(f"Column: {col}, Number of Unique Values: {unique_values}")

# Apply encoding to each categorical column
for col in categorical_columns:
    df = label_encode_column(df, col)

# Check the result
print("Encoded Dataset Head:\n", df.head())
print("Encoded Dataset Data Types:\n", df.dtypes)


Column: sex, Number of Unique Values: 2
Column: dataset, Number of Unique Values: 4
Column: cp, Number of Unique Values: 4
Column: fbs, Number of Unique Values: 2
Column: restecg, Number of Unique Values: 3
Column: exang, Number of Unique Values: 2
Column: slope, Number of Unique Values: 3
Column: thal, Number of Unique Values: 3
Encoded Dataset Head:
    id  age  sex  dataset  cp  trestbps   chol  fbs  restecg  thalch  exang  \
0   1   63    0        0   0     145.0  233.0    0        0   150.0      0   
1   2   67    0        0   1     160.0  286.0    1        0   108.0      1   
2   3   67    0        0   1     120.0  229.0    1        0   129.0      1   
3   4   37    0        0   2     130.0  250.0    1        1   187.0      0   
4   5   41    1        0   3     130.0  204.0    1        0   172.0      0   

   oldpeak  slope   ca  thal  num  
0      2.3      0  0.0     0    0  
1      1.5      1  3.0     1    2  
2      2.6      1  2.0     2    1  
3      3.5      0  0.0     1    

In [140]:
# Homomorphic Encryption
# Key generation for Paillier Cryptosystem
def generate_keys(bit_length=16):
    # Choose two large prime numbers, prime1 and prime2
    prime1 = random_prime(bit_length)
    prime2 = random_prime(bit_length)

    modulus = prime1 * prime2
    private_exponent = lcm(prime1 - 1, prime2 - 1)

    # Set base for encryption
    base_g = modulus + 1

    # Compute decryption_coefficient
    modulus_squared = modulus * modulus
    base_g_exponent_modulus_squared = pow(base_g, int(private_exponent), modulus_squared)
    L_value = (base_g_exponent_modulus_squared - 1) // modulus
    decryption_coefficient = mod_inverse(L_value, modulus)

    # Public and private keys
    public_key = (modulus, base_g)
    private_key = (private_exponent, decryption_coefficient)
    return public_key, private_key

# Helper function to generate a random prime number
def random_prime(bit_length):
    while True:
        candidate_prime = random.getrandbits(bit_length)
        if candidate_prime % 2 == 0:
            candidate_prime += 1
        if all(candidate_prime % i != 0 for i in range(3, int(candidate_prime**0.5) + 1, 2)):
            return candidate_prime

# Encryption function
def encrypt(plaintext, public_key):
    modulus, base_g = public_key
    modulus_squared = modulus * modulus

    # Choose random value for blinding_factor (r) where 1 <= r < modulus and gcd(r, modulus) = 1
    while True:
        blinding_factor = random.randint(1, modulus - 1)
        if gcd(blinding_factor, modulus) == 1:
            break

    # Encrypt plaintext
    ciphertext = (pow(base_g, plaintext, modulus_squared) * pow(blinding_factor, modulus, modulus_squared)) % modulus_squared
    return ciphertext

# Decryption function
def decrypt(ciphertext, public_key, private_key):
    modulus, base_g = public_key
    private_exponent, decryption_coefficient = private_key
    modulus_squared = modulus * modulus

    # Decrypt ciphertext
    ciphertext_exponent_private_modulus_squared = pow(ciphertext, int(private_exponent), modulus_squared)
    L_value = (ciphertext_exponent_private_modulus_squared - 1) // modulus
    plaintext = (L_value * decryption_coefficient) % modulus
    return plaintext

# Homomorphic addition
def homomorphic_addition(ciphertext1, ciphertext2, public_key):
    modulus, _ = public_key
    modulus_squared = modulus * modulus

    # Add ciphertexts by multiplication mod modulus_squared
    return (ciphertext1 * ciphertext2) % modulus_squared


In [141]:
# Example
public_key, private_key = generate_keys()
print(f"Public Key: {public_key}, Private Key: {private_key}")

# Convert age values to integers by rounding
df['age'] = df['age'].astype(int)

# Encrypt the age values and store them in a list
encrypted_ages = []
for age in df['age']:
    encrypted_age = encrypt(age, public_key)
    encrypted_ages.append(encrypted_age)

# Calculate the mean without encryption
mean_without_encryption = df['age'].mean()
print(f"Mean without encryption: {mean_without_encryption}")

# Homomorphic addition to compute the mean of encrypted ages
# Initialize to 1 for multiplicative identity
encrypted_sum = 1
for enc_age in encrypted_ages:
    encrypted_sum = homomorphic_addition(encrypted_sum, enc_age, public_key)

# Decrypt the sum of encrypted ages
decrypted_sum = decrypt(encrypted_sum, public_key, private_key)

# Calculate the mean of encrypted ages (decrypted)
num_ages = len(encrypted_ages)
decrypted_mean = decrypted_sum / num_ages
print(f"Mean of encrypted ages (decrypted): {decrypted_mean}")

Public Key: (3720883, 3720884), Private Key: (611640, 3546732)
Mean without encryption: 53.51086956521739
Mean of encrypted ages (decrypted): 53.51086956521739
