### Assignment: Clustering Analysis for 6D Data using Spark

###### Goal: To discover the structure of higher dimensional data in Spark using k-Means clustering and PCA

###### The dataset contains a a list of 6 dimensional coordinates within a cube of side length 100.

In [101]:
# Create a Spark session
from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.appName("ClusteringAnalysis").getOrCreate()
sc = SparkContext.getOrCreate()

In [102]:
#importing necessary libraries
from pyspark.ml.clustering import KMeans 
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator 
from pyspark.ml.feature import StandardScaler 
from pyspark.ml.feature import PCA 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [103]:
# Loading and reading the dataset
rdd = sc.textFile("space.dat")

# Transforming the RDD to a structured format
rdd2 = rdd.map(lambda x: x.split(",")).map(lambda x: [float(x[0]), float(x[1]), float(x[2]), float(x[3]), float(x[4]), float(x[5])])

# Creating a DataFrame with named columns
df = spark.createDataFrame(rdd2, ["D1", "D2", "D3", "D4", "D5", "D6"])
df.show(10)

+------------------+------------------+------------------+------------------+------------------+------------------+
|                D1|                D2|                D3|                D4|                D5|                D6|
+------------------+------------------+------------------+------------------+------------------+------------------+
|16.175290018360865| 79.26158903022598|15.279677957430495| 84.44449050934236|18.389312435117514| 79.87125481957949|
|12.441585418179132| 81.38351233369806|17.745524605746372| 77.67266089686214|11.879433413248021| 82.51749363148706|
|12.612232031217749| 85.35689559849533|17.217141384399465| 81.99797307493071|12.917574288336507| 80.80276566517777|
|11.213021770627718|11.213021770627718|11.213021770627718|11.213021770627718|11.213021770627718|11.213021770627718|
| 76.15301332156801|  75.0499428915535| 76.49786909309103| 75.10331927879925|    75.79480494739| 75.49118728415654|
|25.030992305661588| 23.32588783792623| 25.62437382971083|  74.947433228

In [104]:
# Transforming the data using the VectorAssembler
feature_cols = df.columns 
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features") 
assembled_data = assembler.transform(df)

In [105]:
#Scaling the data using Standard Scaler (i.e. normalizing data)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features") 
scaler_model = scaler.fit(assembled_data) 
scaled_data = scaler_model.transform(assembled_data)

In [106]:
# Defining a range of values for k (number of clusters)
k_values = range(2, 10)

# Initializing an empty list to store the results for each k
results = []

# Looping through each k value to find the optimum k
for k in k_values:
    kmeans = KMeans(k=k, seed=1)
    
    model = kmeans.fit(scaled_data)
    
    predictions = model.transform(scaled_data)
    
    # Evaluating the clustering using the Silhouette Score
    evaluator = ClusteringEvaluator()
    silhouette_score = evaluator.evaluate(predictions)
    result_dict = {"k": k, "Silhouette Score": silhouette_score}
    results.append(result_dict)

In [107]:
results_df = spark.createDataFrame(results) 
results_df.show(10)

+------------------+---+
|  Silhouette Score|  k|
+------------------+---+
|0.6838411073627806|  2|
|0.6005114952335537|  3|
| 0.784628629090211|  4|
|0.9762436358648875|  5|
|0.9857394825123544|  6|
|0.9058153510037809|  7|
|0.8925534169047732|  8|
|0.7897798321436718|  9|
+------------------+---+



From the table above, we can see that the optimal number of clusters for this dataset is 6, which has the highest silhouette score. 

In [108]:
#Perform k-means clustering with k=6
kmeans = KMeans(k=6, seed=1)
model = kmeans.fit(scaled_data)
predictions = model.transform(scaled_data)

In [109]:
# Get cluster centers
centers = model.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

Cluster Centers:
[70.00305127 60.02333638 50.02662094 39.99197196 29.99459599 20.01922986]
[14.99656549 80.01182547 14.99467628 80.05577159 14.97239517 80.02087051]
[25.00130508 25.00506354 24.97928291 75.01356403 75.01196977 75.00804088]
[74.97159526 75.00501487 74.95916543 75.00238108 74.98414552 74.99371641]
[0.08449107 0.08178346 0.08159672 0.07987057 0.0867673  0.08799652]
[14.87519246 14.87569221 14.87619196 14.87669171 14.87719146 14.87769121]


In [110]:
# Get cluster points
cluster_sizes = predictions.groupBy("prediction").count().orderBy("prediction")
print("Cluster Sizes:")
cluster_sizes.show()

Cluster Sizes:
+----------+-----+
|prediction|count|
+----------+-----+
|         0| 2501|
|         1| 4000|
|         2| 3501|
|         3| 3001|
|         4| 2400|
|         5| 2001|
+----------+-----+



In [111]:
#filtering cluster points
cluster1 = predictions.filter(predictions["prediction"] == 0) 
cluster2 = predictions.filter(predictions["prediction"] == 1) 
cluster3 = predictions.filter(predictions["prediction"] == 2) 
cluster4 = predictions.filter(predictions["prediction"] == 3) 
cluster5 = predictions.filter(predictions["prediction"] == 4) 
cluster6 = predictions.filter(predictions["prediction"] == 5) 

In [112]:
df1_6D = cluster1.select("D1", "D2", "D3", "D4", "D5", "D6").toPandas()
df2_6D = cluster2.select("D1", "D2", "D3", "D4", "D5", "D6").toPandas()
df3_6D = cluster3.select("D1", "D2", "D3", "D4", "D5", "D6").toPandas()
df4_6D = cluster4.select("D1", "D2", "D3", "D4", "D5", "D6").toPandas()
df5_6D = cluster5.select("D1", "D2", "D3", "D4", "D5", "D6").toPandas()
df6_6D = cluster6.select("D1", "D2", "D3", "D4", "D5", "D6").toPandas()

In [113]:
#performing PCA on the 6D Data for each cluster
pca = PCA(k=6, inputCol="scaled_features", outputCol="pca_features") 
 
pca_model1 = pca.fit(cluster1) 
pca_result1 = pca_model1.transform(cluster1)
pca_matrix1 = np.array(pca_result1.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect())

pca_model2 = pca.fit(cluster2) 
pca_result2 = pca_model2.transform(cluster2)
pca_matrix2 = np.array(pca_result2.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

pca_model3 = pca.fit(cluster3) 
pca_result3 = pca_model3.transform(cluster3)
pca_matrix3= np.array(pca_result3.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

pca_model4 = pca.fit(cluster4) 
pca_result4 = pca_model4.transform(cluster4)
pca_matrix4= np.array(pca_result4.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

pca_model5 = pca.fit(cluster5) 
pca_result5 = pca_model5.transform(cluster5)
pca_matrix5 = np.array(pca_result5.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

pca_model6 = pca.fit(cluster6) 
pca_result6 = pca_model6.transform(cluster6)
pca_matrix6 = np.array(pca_result6.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

In [114]:
#checking dimensions of the shape each cluster produce through cumulative variance
explained_variances1 = pca_model1.explainedVariance.toArray()
cumulative_variance1 = explained_variances1.cumsum()
print(cumulative_variance1)

explained_variances2 = pca_model2.explainedVariance.toArray()
cumulative_variance2 = explained_variances2.cumsum()
print(cumulative_variance2)

explained_variances3 = pca_model3.explainedVariance.toArray()
cumulative_variance3 = explained_variances3.cumsum()
print(cumulative_variance3)

explained_variances4 = pca_model4.explainedVariance.toArray()
cumulative_variance4 = explained_variances4.cumsum()
print(cumulative_variance4)

explained_variances5 = pca_model5.explainedVariance.toArray()
cumulative_variance5 = explained_variances5.cumsum()
print(cumulative_variance5)

explained_variances6 = pca_model6.explainedVariance.toArray()
cumulative_variance6 = explained_variances6.cumsum()
print(cumulative_variance6)

[0.47920818 0.79350254 1.         1.         1.         1.        ]
[0.23610388 0.41357649 0.5704253  0.72211295 0.86981003 1.        ]
[0.65361961 0.82866565 0.99133456 1.         1.         1.        ]
[0.81558969 0.98493942 1.         1.         1.         1.        ]
[0.2456057  0.44725014 0.62943154 0.79390144 0.95498558 1.        ]
[0.99983508 1.         1.         1.         1.         1.        ]


In [115]:
#doing PCA on each cluster to reduce the dimensionality to 3D
pca = PCA(k=3, inputCol="scaled_features", outputCol="pca_features") 
 
pca_3dmodel1 = pca.fit(cluster1) 
pca_3dresult1 = pca_3dmodel1.transform(cluster1)
pca_3dmatrix1 = np.array(pca_3dresult1.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 
 
pca_3dmodel2 = pca.fit(cluster2) 
pca_3dresult2 = pca_3dmodel2.transform(cluster2)
pca_3dmatrix2 = np.array(pca_3dresult2.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 
 
pca_3dmodel3 = pca.fit(cluster3) 
pca_3dresult3 = pca_3dmodel3.transform(cluster3)
pca_3dmatrix3 = np.array(pca_3dresult3.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 
 
pca_3dmodel4 = pca.fit(cluster4) 
pca_3dresult4 = pca_3dmodel4.transform(cluster4)
pca_3dmatrix4 = np.array(pca_3dresult4.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 
 
pca_3dmodel5 = pca.fit(cluster5) 
pca_3dresult5 = pca_3dmodel5.transform(cluster5)
pca_3dmatrix5 = np.array(pca_3dresult5.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 
 
pca_3dmodel6 = pca.fit(cluster6) 
pca_3dresult6 = pca_3dmodel6.transform(cluster6)
pca_3dmatrix6 = np.array(pca_3dresult6.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

In [116]:
df1 = pd.DataFrame(pca_3dmatrix1, columns=['x', 'y', 'z'])
df2 = pd.DataFrame(pca_3dmatrix2, columns=['x', 'y', 'z'])
df3 = pd.DataFrame(pca_3dmatrix3, columns=['x', 'y', 'z'])
df4 = pd.DataFrame(pca_3dmatrix4, columns=['x', 'y', 'z'])
df5 = pd.DataFrame(pca_3dmatrix5, columns=['x', 'y', 'z'])
df6 = pd.DataFrame(pca_3dmatrix6, columns=['x', 'y', 'z'])

In [117]:
#doing PCA on each cluster to reduce the dimensionality to 2D
pca = PCA(k=2, inputCol="scaled_features", outputCol="pca_features") 

pca_2dmodel1 = pca.fit(cluster1) 
pca_2dresult1 = pca_2dmodel1.transform(cluster1)
pca_2dmatrix1 = np.array(pca_2dresult1.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

pca_2dmodel2 = pca.fit(cluster2) 
pca_2dresult2 = pca_2dmodel2.transform(cluster2)
pca_2dmatrix2 = np.array(pca_2dresult2.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

pca_2dmodel3 = pca.fit(cluster3) 
pca_2dresult3 = pca_2dmodel3.transform(cluster3)
pca_2dmatrix3 = np.array(pca_2dresult3.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

pca_2dmodel4 = pca.fit(cluster4) 
pca_2dresult4 = pca_2dmodel4.transform(cluster4)
pca_2dmatrix4 = np.array(pca_2dresult4.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

pca_2dmodel5 = pca.fit(cluster5) 
pca_2dresult5 = pca_2dmodel5.transform(cluster5)
pca_2dmatrix5 = np.array(pca_2dresult5.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect()) 

pca_2dmodel6 = pca.fit(cluster6) 
pca_2dresult6 = pca_2dmodel6.transform(cluster6)
pca_2dmatrix6 = np.array(pca_2dresult6.select("pca_features").rdd.map(lambda x: x.pca_features.toArray()).collect())

In [118]:
df1A = pd.DataFrame(pca_2dmatrix1, columns=['x', 'y'])
df2A = pd.DataFrame(pca_2dmatrix2, columns=['x', 'y'])
df3A = pd.DataFrame(pca_2dmatrix3, columns=['x', 'y'])
df4A = pd.DataFrame(pca_2dmatrix4, columns=['x', 'y'])
df5A = pd.DataFrame(pca_2dmatrix5, columns=['x', 'y'])
df6A = pd.DataFrame(pca_2dmatrix6, columns=['x', 'y'])

##### Cluster 1 - Visualization in 2D and 3D Space

In [119]:
import plotly.express as px

fig = px.scatter_3d(df1, x='x', y='y', z='z', title='Cluster 1 - 3D')
fig.update_layout(scene_aspectmode='manual',
                  scene_aspectratio=dict(x=1, y=1, z=1))
fig.show()

In [120]:
fig = px.scatter(df1A, x='x', y='y', title='Cluster 1 - 2D')
fig.show()

In [121]:
#finding outlier present in cluster 1
mean_values = df1_6D.mean()
std_values = df1_6D.std()

# Set a threshold for outliers (e.g., 3 standard deviations from the mean)
threshold = 3

# Detect outliers using a boolean mask
outliers_mask = (df1_6D - mean_values).abs() > threshold * std_values

# Filter the DataFrame to get rows with at least one outlier
outliers = df1_6D[outliers_mask.any(axis=1)]
print(outliers)

       D1    D2    D3    D4    D5    D6
317  91.0  92.0  93.0  14.0  15.0  16.0


In [122]:
#removing the outlier
df1_6D1 = df1_6D.drop(317)
df1 = df1.drop(317)
df1A = df1A.drop(317)

In [123]:
#3D Visualization after removing outlier
fig = px.scatter_3d(df1, x='x', y='y', z='z', title='Cluster 1 - 3D (after removing outlier)')
fig.update_layout(scene_aspectmode='manual',
                  scene_aspectratio=dict(x=1, y=1, z=1))
fig.show()

In [124]:
#2D Visualisation after removing outlier
fig = px.scatter(df1A, x='x', y='y', title='Cluster 1 - 2D (after removing outlier)')
fig.show()

In [125]:
#calculating the size of the cluster across 3 axes
length1 = np.ptp(df1, axis=0)
print(length1)

#calculating the size of the cluster across 2 axes
length1A = np.ptp(df1A, axis=0)
print(length1A)

#calculating the size of the cluster across 6 axes
length1B = np.ptp(df1_6D1, axis=0)
print(length1B)

[0.12908166 0.17567145 0.11069277]
[0.12908166 0.17567145]
[1.31399459 2.8271627  1.95569421 2.28919737 3.97900732 4.56487748]


##### Cluster 2 - Visualization in 2D and 3D Space

In [126]:
fig = px.scatter_3d(df2, x='x', y='y', z='z', title='Cluster 2 - 3D')
fig.update_layout(scene_aspectmode='manual',
                  scene_aspectratio=dict(x=1, y=1, z=1))
fig.show()

In [151]:
fig = px.scatter(df2A, x='x', y='y', title='Cluster 2 - 2D')
fig.update_layout(scene_aspectmode='manual',
                  scene_aspectratio=dict(x=1, y=1))
fig.show()

In [128]:
#calculating the size of the cluster across 3 axes
length2 = np.ptp(df2, axis=0)
print(length2)

#calculating the size of the cluster across 2 axes
length2A = np.ptp(df2A, axis=0)
print(length2A)

#calculating the size of the cluster across 6 axes
length2B = np.ptp(df2_6D, axis=0)
print(length2B)

[0.63237087 0.4547086  0.43586687]
[0.63237087 0.4547086 ]
[12.32194775 14.98664006 15.24697556 16.07740304 15.83218229 14.49105923]


##### Cluster 3 - Visualization in 2D and 3D Space

In [129]:
fig = px.scatter_3d(df3, x='x', y='y', z='z', title='Cluster 3 - 3D')
fig.update_layout(scene_aspectmode='manual',
                  scene_aspectratio=dict(x=1, y=1, z=1))
fig.show()

In [130]:
fig = px.scatter(df3A, x='x', y='y', title='Cluster 3 - 2D')
fig.update_layout(scene_aspectmode='manual',
                  scene_aspectratio=dict(x=1, y=1))
fig.show()

In [131]:
#finding outlier present in cluster 3
mean_values = df3_6D.mean()
std_values = df3_6D.std()

# Set a threshold for outliers (e.g., 3 standard deviations from the mean)
threshold = 3

# Detect outliers using a boolean mask
outliers_mask = (df3_6D - mean_values).abs() > threshold * std_values

# Filter the DataFrame to get rows with at least one outlier
outliers = df3_6D[outliers_mask.any(axis=1)]
print(outliers)

        D1    D2    D3    D4    D5    D6
1798  11.0  12.0  13.0  94.0  95.0  96.0


In [132]:
#removing the outlier
df3_6D1 = df3_6D.drop(1798)
df3 = df3.drop(1798)
df3A = df3A.drop(1798)

In [152]:
#3D Visualisation after removing outlier
fig = px.scatter_3d(df3, x='x', y='y', z='z', title='Cluster 3 - 3D')
fig.update_layout(scene_aspectmode='manual',
                  scene_aspectratio=dict(x=1, y=1, z=1))
fig.show()

In [153]:
#2D Visualisation after removing outlier
fig = px.scatter(df3A, x='x', y='y', title='Cluster 3 - 2D')
fig.update_layout(scene_aspectmode='manual',
                  scene_aspectratio=dict(x=1, y=1))
fig.show()

In [133]:
#calculating the size of the cluster across 3 axes
length3 = np.ptp(df3, axis=0)
print(length3)

#calculating the size of the cluster across 2 axes
length3A = np.ptp(df3A, axis=0)
print(length3A)

#calculating the size of the cluster across 6 axes
length3B = np.ptp(df3_6D, axis=0)
print(length3B)

[0.38532126 0.25362956 0.22688209]
[0.38532126 0.25362956]
[16.04682784 16.46413654 16.34263155 21.37566149 23.69087203 24.8011092 ]


##### Cluster 4 - Visualization in 2D and 3D Space

In [134]:
fig = px.scatter_3d(df4, x='x', y='y', z='z', title='Cluster 4 - 3D')
fig.show()

In [135]:
fig = px.scatter(df4A, x='x', y='y', title='Cluster 4 - 2D')
fig.show()

In [136]:
#finding outlier present in cluster 4
mean_values = df4_6D.mean()
std_values = df4_6D.std()

# Set a threshold for outliers (e.g., 3 standard deviations from the mean)
threshold = 3

# Detect outliers using a boolean mask
outliers_mask = (df4_6D - mean_values).abs() > threshold * std_values

# Filter the DataFrame to get rows with at least one outlier
outliers = df4_6D[outliers_mask.any(axis=1)]
print(outliers)

        D1    D2    D3    D4    D5    D6
1077  91.0  92.0  93.0  94.0  95.0  96.0


In [137]:
#removing the outlier
df4_6D1 = df4_6D.drop(1077)
df4 = df4.drop(1077)
df4A = df4A.drop(1077)

In [138]:
#3D Visualization after removing outlier
fig = px.scatter_3d(df4, x='x', y='y', z='z', title='Cluster 4 - 3D (after removing outlier)')
fig.update_layout(scene_aspectmode='manual',
                  scene_aspectratio=dict(x=1, y=1, z=1))
fig.show()

In [139]:
#2D Visualisation after removing outlier
fig = px.scatter(df4A, x='x', y='y', title='Cluster 4 - 2D (after removing outlier)')
fig.show()

In [140]:
#calculating the size of the cluster across 3 axes
length4 = np.ptp(df4, axis=0)
print(length4)

#calculating the size of the cluster across 2 axes
length4A = np.ptp(df4A, axis=0)
print(length4A)

#calculating the size of the cluster across 6 axes
length4B = np.ptp(df4_6D1, axis=0)
print(length4B)

[0.39889306 0.17808598 0.01829411]
[0.39889306 0.17808598]
[6.78484428 1.61215833 9.75762464 1.76449433 5.85540837 4.65188637]


##### Cluster 5 - Visualization in 2D and 3D Space

In [141]:
fig = px.scatter_3d(df5, x='x', y='y', z='z', title='Cluster 5 - 3D')
fig.show()

In [142]:
fig = px.scatter(df5A, x='x', y='y', title='Cluster 5 - 2D')
fig.show()

In [143]:
#calculating the size of the cluster across 3 axes
length5 = np.ptp(df5, axis=0)
print(length5)

#calculating the size of the cluster across 2 axes
length5A = np.ptp(df5A, axis=0)
print(length5A)

#calculating the size of the cluster across 6 axes
length5B = np.ptp(df5_6D, axis=0)
print(length5B)

[0.04655451 0.04576661 0.03770191]
[0.04655451 0.04576661]
[0.9908578  0.99361846 0.99795141 0.99317739 0.99966001 0.99925865]


##### Cluster 6 - Visualization in 2D and 3D Space

In [144]:
fig = px.scatter_3d(df6, x='x', y='y', z='z', title='Cluster 6')
fig.show()

In [145]:
fig = px.scatter(df6A, x='x', y='y', title='Cluster 6 - 2D')
fig.show()

In [146]:
#finding outlier present in cluster 6
mean_values = df6_6D.mean()
std_values = df6_6D.std()

# Set a threshold for outliers (e.g., 3 standard deviations from the mean)
threshold = 3

# Detect outliers using a boolean mask
outliers_mask = (df6_6D - mean_values).abs() > threshold * std_values

# Filter the DataFrame to get rows with at least one outlier
outliers = df6_6D[outliers_mask.any(axis=1)]
print(outliers)

Empty DataFrame
Columns: [D1, D2, D3, D4, D5, D6]
Index: []


In [147]:
#calculating the size of the cluster across 3 axes
length6 = np.ptp(df6, axis=0)
print(length6)

#calculating the size of the cluster across 2 axes
length6A = np.ptp(df6A, axis=0)
print(length6A)

#calculating the size of the cluster across 6 axes
length6B = np.ptp(df6_6D, axis=0)
print(length6B)

[8.41814765e-01 1.39144225e-01 1.05349757e-13]
[0.84181476 0.13914422]
[9.98720549 9.98720549 9.98720549 9.98720549 9.98720549 9.98720549]
