# Cluster the data using scikit-learn KMeans

In [1]:
import numpy as np
from scipy import stats
from sklearn.cluster import KMeans

# Example dataset: six 2-D points
data = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]])

# Baseline (non-private) clustering: partition the data set into 2 clusters.
# random_state pins the centroid initialisation so labels and centroid order are
# the same on every "Restart & Run All"; n_init is given explicitly so the result
# does not depend on the default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(data)
clusters = kmeans.labels_
original_centroids = kmeans.cluster_centers_

# Print the original data points, clusters and centroids
print("Original Data Points:\n", data)
print("Clusters:\n", clusters)
print("Original Centroids:\n", original_centroids)


Original Data Points:
 [[ 1.   2. ]
 [ 1.5  1.8]
 [ 5.   8. ]
 [ 8.   8. ]
 [ 1.   0.6]
 [ 9.  11. ]]
Clusters:
 [1 1 0 0 1 0]
Original Centroids:
 [[7.33333333 9.        ]
 [1.16666667 1.46666667]]




# Add noise to the data based on the privacy budget and the provided sensitivity, then cluster the noisy data

In [2]:
def add_noise(data, epsilon, sensitivity, rng=None):
    """Return a copy of ``data`` perturbed with element-wise Laplace noise.

    Each element receives an independent draw from Laplace(0, sensitivity / epsilon).

    Parameters
    ----------
    data : array-like
        Numeric values to perturb; converted to a float ndarray.
    epsilon : float
        Privacy budget. Must be strictly positive; smaller values give a
        larger noise scale.
    sensitivity : float
        Numerator of the Laplace scale. Must be non-negative.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's global generator is used
        (so ``np.random.seed`` still controls the result).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``data`` containing ``data + noise``.

    Raises
    ------
    ValueError
        If ``epsilon`` is not positive or ``sensitivity`` is negative.
    """
    if epsilon <= 0:
        raise ValueError("epsilon must be strictly positive")
    if sensitivity < 0:
        raise ValueError("sensitivity must be non-negative")

    # Accept lists/tuples as well as ndarrays; the input itself is never mutated.
    values = np.asarray(data, dtype=float)

    # Laplace scale parameter: more sensitivity or less budget -> more noise.
    beta = sensitivity / epsilon

    sampler = np.random if rng is None else rng
    laplace_noise = sampler.laplace(0, beta, values.shape)
    return values + laplace_noise


# Sensitivity is meant to bound how much a single record can change the released values.
# NOTE: the value below is the largest absolute deviation of any coordinate from its
# column mean. It is computed from the data itself, so it is a demonstration-only
# proxy and does not by itself give a formal differential-privacy guarantee.
sensitivity = np.max(np.abs(data - np.mean(data, axis=0)))

# Privacy parameter epsilon determines the amount of noise to be added
# (the Laplace scale is sensitivity / epsilon, so a smaller epsilon means more noise)
epsilon = 0.1

# Seed NumPy's global generator so the noisy data, and everything derived from it,
# is identical on every "Restart & Run All"
np.random.seed(42)

# Add noise to the data points
noisy_data = add_noise(data, epsilon, sensitivity)

# Perform clustering on the noisy data; random_state/n_init are fixed for reproducibility
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(noisy_data)
noisy_clusters = kmeans.labels_
noise_centroids = kmeans.cluster_centers_


print("Noisy Data Points:\n", noisy_data)
print("Noisy Clusters:\n", noisy_clusters)
print("Noisy Centroids :\n", noise_centroids)

Noisy Data Points:
 [[ -8.22894996 -25.09225801]
 [ 48.29852161 -93.63432789]
 [  2.61671234  86.87531981]
 [ 10.03114688   7.72529685]
 [-27.57009962  59.88763296]
 [ 16.99705384 -94.28428515]]
Noisy Clusters:
 [1 1 0 0 0 1]
Noisy Centroids :
 [[ -4.97408014  51.49608321]
 [ 19.0222085  -71.00362369]]




# Cluster the data using the diffprivlib framework, which supports differentially private clustering

In [3]:
!pip3 install diffprivlib

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://arrao:****@artifactory.trusted.visa.com/api/pypi/pypi-remote/simple


In [4]:
# Import under an alias so the scikit-learn KMeans used by the earlier cells is not
# shadowed; with the bare name, re-running those cells after this one would silently
# switch them to the diffprivlib implementation.
from diffprivlib.models import KMeans as DPKMeans

# Privacy budget for the differentially private fit
epsilon = 1.0

# (min, max) bounds applied to every feature. They must come from prior knowledge of
# the domain, not from the data: when bounds are omitted, diffprivlib computes them
# from the data and emits a PrivacyLeakWarning.
# ASSUMPTION: the example features lie in [0, 12] -- replace with the real domain bounds.
data_bounds = (0.0, 12.0)

# Perform clustering with differential privacy
dp_kmeans = DPKMeans(n_clusters=2, epsilon=epsilon, bounds=data_bounds)
dp_kmeans.fit(data)

# Get the differentially private cluster centroids
dp_centroids = dp_kmeans.cluster_centers_

# Print the differentially private cluster centroids
print("Differentially Private Cluster Centroids:\n", dp_centroids)


Differentially Private Cluster Centroids:
 [[8.71915573 9.51643083]
 [5.96366996 3.84980361]]


