In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (an existing session is reused
# if one is already running), registered under the app name 'clustering'.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

In [0]:
#Importing Libraries
from pyspark.ml.clustering import KMeans

In [0]:
# Read the sample k-means data (stored in libsvm format) into a DataFrame.
# The result has two columns: 'label' and a sparse 'features' vector.
dataset = spark.read.load(
    '/FileStore/tables/sample_kmeans_data.txt',
    format='libsvm',
)
# Print the loaded rows as a quick sanity check.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [0]:
# Clustering is unsupervised, so the 'label' column is not needed:
# keep only the 'features' vector column for training.
final_data = dataset.select(dataset['features'])
# Preview the first three rows of the training frame.
final_data.show(n=3)

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
+--------------------+
only showing top 3 rows



In [0]:
# Model: k-means with two clusters, reading vectors from the 'features' column.
# A fixed seed keeps the randomly chosen initial centroids the same on every run.
kmeans = KMeans(featuresCol='features', k=2, seed=1)
# Fit the estimator on the feature-only frame to obtain the trained model.
model = kmeans.fit(final_data)

In [0]:
# Within Set Sum of Squared Errors (WSSSE): read the training cost reported
# by the fitted model's summary and print it.
training_summary = model.summary
wssse = training_summary.trainingCost
print(wssse)

0.11999999999994547


In [0]:
# Retrieve the fitted cluster centers (a list with one array per cluster;
# the list position is the cluster index) and print them.
centers = model.clusterCenters()
print(centers)

[array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]


In [0]:
# Check the assigned labels: transform() returns the input rows with an added
# 'prediction' column holding each row's cluster index.
results = model.transform(final_data)
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+



In [0]:
# Repeat the experiment with a different number of clusters (k = 3), seed 1.
# NOTE: this cell rebinds kmeans, model, centers and results, so from here on
# those names refer to the k = 3 fit rather than the earlier k = 2 fit.
kmeans = KMeans(featuresCol='features', k=3, seed=1)
model = kmeans.fit(final_data)

# Print the three fitted cluster centers.
centers = model.clusterCenters()
print(centers)

# Show the cluster index assigned to every row.
results = model.transform(final_data)
results.show()

[array([9.1, 9.1, 9.1]), array([0.05, 0.05, 0.05]), array([0.2, 0.2, 0.2])]
+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

