In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

In [2]:
# Step 1: Load the Processed Dataset
# Reads the pre-processed CSV into `df`, which every later cell operates on.
try:
    df = pd.read_csv("../Data/The_Cancer_data_1500_V3_Processed.csv")  # Adjust the file path as needed
except FileNotFoundError:
    print("Error: The dataset file was not found. Please check the file path.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, which throws away all state. Re-raising stops
    # "Run All" at this cell and shows the missing path in the traceback.
    raise
else:
    # Only report success when read_csv actually returned.
    print("Dataset loaded successfully!")

Dataset loaded successfully!


In [3]:




def add_gaussian_noise(data, epsilon, sensitivity=1, rng=None):
    """
    Add zero-mean Gaussian noise to a dataset for Differential Privacy.

    The noise is drawn independently for every element of ``data`` with
    standard deviation ``sensitivity / epsilon``.

    Args:
        data (array-like): The data to add noise to. Objects that already
            expose ``.shape`` (NumPy arrays, pandas Series/DataFrames) are
            used as-is; anything else (lists, tuples, scalars) is converted
            with ``np.asarray`` first.
        epsilon (float): The privacy budget (smaller = more noise, stronger
            privacy). Must be strictly positive.
        sensitivity (float): The sensitivity of the data. Must be
            non-negative.
        rng (optional): Object with a ``normal(loc, scale, size)`` method,
            e.g. ``np.random.default_rng(...)``. When ``None`` (default) the
            global ``np.random`` state is used, exactly as before.
    Returns:
        Noisy data (array-like): ``data + noise``, with the same shape as
        ``data``. pandas inputs keep their type and index.
    Raises:
        ValueError: If ``epsilon`` is not > 0 or ``sensitivity`` is negative.

    NOTE(review): ``sensitivity / epsilon`` has no delta term, whereas the
    usual (epsilon, delta) Gaussian-mechanism calibration includes one.
    Confirm that this scale gives the privacy guarantee you intend.
    """
    # `not epsilon > 0` also rejects NaN, which `epsilon <= 0` would let through.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")
    if not sensitivity >= 0:
        raise ValueError(f"sensitivity must be >= 0, got {sensitivity!r}")

    # Plain Python sequences have no .shape; normalise them to an ndarray.
    if not hasattr(data, "shape"):
        data = np.asarray(data)

    scale = sensitivity / epsilon  # Standard deviation of the Gaussian noise
    sampler = np.random if rng is None else rng
    noise = sampler.normal(loc=0, scale=scale, size=data.shape)
    return data + noise


In [4]:
# Privacy budget shared by every protected column (smaller = more privacy).
epsilon = 1.0
# Columns that receive noise; names absent from `df` are skipped silently.
sensitive_columns = ['Age', 'BMI', 'PhysicalActivity', 'AlcoholIntake']

# Keep the declared order so the random draws happen in the same sequence.
columns_present = [name for name in sensitive_columns if name in df.columns]

# NOTE: each column of `df` is overwritten in place, so running this cell
# again adds a further round of noise on top of the already-noisy values.
for col in columns_present:
    df[col] = add_gaussian_noise(df[col], epsilon)


In [5]:
# Write the noise-protected frame to CSV, without the index column.
dp_output_path = "../Data/The_Cancer_data_DP_Protected.csv"
df.to_csv(path_or_buf=dp_output_path, index=False)
print(f"DP-Protected dataset saved at: {dp_output_path}")


DP-Protected dataset saved at: ../Data/The_Cancer_data_DP_Protected.csv


In [6]:


# Ensure the Data folder exists
data_folder = "../Data/"
# exist_ok=True creates the folder only when missing, replacing the separate
# os.path.exists() check (which could race with another process).
os.makedirs(data_folder, exist_ok=True)

# Save histograms for sensitive columns
for col in sensitive_columns:
    if col in df.columns:
        # Explicit figure/axes objects instead of the implicit pyplot "current
        # figure", so each iteration can only touch its own plot.
        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            ax.hist(df[col], bins=30, alpha=0.5, label="Noisy Data", color='orange')
            ax.set_xlabel(f"{col} (Noisy)")
            ax.set_ylabel("Frequency")
            ax.set_title(f"Distribution of {col} After Noise Addition")
            ax.legend()

            # Save the image
            output_path = os.path.join(data_folder, f"{col}_noise_distribution.png")
            fig.savefig(output_path)
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

        print(f"Saved histogram for {col} at: {output_path}")


Saved histogram for Age at: ../Data/Age_noise_distribution.png
Saved histogram for BMI at: ../Data/BMI_noise_distribution.png
Saved histogram for PhysicalActivity at: ../Data/PhysicalActivity_noise_distribution.png
Saved histogram for AlcoholIntake at: ../Data/AlcoholIntake_noise_distribution.png
