In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [2]:
# Obtain the SparkSession for this notebook under the app name 'kmeans';
# getOrCreate() returns an existing session if there is one, otherwise builds it.
spark = (
    SparkSession.builder
    .appName('kmeans')
    .getOrCreate()
)
# Bare last expression so the notebook displays the session object.
spark

In [3]:
# Load the LIBSVM-formatted sample file (path is relative to the working
# directory) and print the resulting schema.
data = (
    spark.read
    .format('libsvm')
    .load('sample_kmeans_data.txt')
)
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [4]:
# Number of rows in the loaded DataFrame.
data.count()

6

In [5]:
# Preview the first five rows (label + sparse feature vector).
data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
+-----+--------------------+
only showing top 5 rows



In [6]:
# Summary statistics; in the output below only the numeric `label` column is
# summarised, the vector-typed `features` column does not appear.
data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|                 6|
|   mean|               2.5|
| stddev|1.8708286933869707|
|    min|               0.0|
|    max|               5.0|
+-------+------------------+



In [7]:
# Keep only the `features` column; this features-only frame is what the
# K-means estimators below are fitted on (the `label` column is not used).
final_df = data.select('features')
final_df.show(5)

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
+--------------------+
only showing top 5 rows



In [9]:
# K-means estimator for two clusters with a fixed seed (36), configured via
# constructor keywords, then fitted on the features-only frame.
kmeans = KMeans(k=2, seed=36)
kmeans_model = kmeans.fit(final_df)
# Bare last expression so the notebook displays the fitted model's repr.
kmeans_model

KMeansModel: uid=KMeans_254283fcf151, k=2, distanceMeasure=euclidean, numFeatures=3

In [10]:
# Centers of the k=2 model, one array per cluster.
cluster_centers = kmeans_model.clusterCenters()
cluster_centers

[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]

In [11]:
# Training cost of the k=2 fit as reported by the model summary
# (Spark's K-means cost: sum of squared distances to the nearest center).
sse = kmeans_model.summary.trainingCost
sse

0.11999999999994547

In [12]:
# Per-row cluster assignment from the training summary (`prediction` column).
kmeans_model.summary.cluster.show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         1|
|         1|
|         1|
+----------+



In [13]:
# Size of each cluster in the k=2 fit, as reported by the training summary.
kmeans_model.summary.clusterSizes

[3, 3]

In [14]:
# Training rows together with their assigned cluster (`features`, `prediction`).
kmeans_model.summary.predictions.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+



In [15]:
# Apply the fitted k=2 model to the features frame; the result carries a
# `prediction` column next to `features` and is what the evaluator scores below.
results = kmeans_model.transform(final_df)
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+



In [16]:
# Evaluator built with its default settings; it is reused later for the k=3 model.
clusteval = ClusteringEvaluator()
# Score the k=2 assignments held in `results`.
silhouette_score = clusteval.evaluate(results)
print("Silhouette Coefficient of KMeans:", silhouette_score)

Silhouette Coefficient of KMeans: 0.9997530305375207


In [17]:
# Refit with three clusters (same seed, 36) to compare against the k=2 model.
# NOTE: this rebinds `kmeans`; the k=2 fitted model stays in `kmeans_model`.
kmeans = KMeans(
    k=3,
    seed=36,
)
model = kmeans.fit(final_df)

In [18]:
# Training cost of the k=3 fit. NOTE: rebinds `sse`, overwriting the k=2 value.
sse = model.summary.trainingCost
sse

0.07499999999994544

In [19]:
# Centers of the k=3 model. NOTE: rebinds `cluster_centers` from the k=2 cell.
cluster_centers = model.clusterCenters()
cluster_centers

[array([0.05, 0.05, 0.05]), array([9.1, 9.1, 9.1]), array([0.2, 0.2, 0.2])]

In [20]:
# Cluster assignments produced by the k=3 model for every row.
model.transform(final_df).show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+



In [21]:
# Score the k=3 assignments with the same evaluator (`clusteval`) used for k=2,
# so the two values are directly comparable.
clusteval.evaluate(model.transform(final_df))

0.6248737134600261

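So, the optimal number of KMeans clusters for this dataset is 2: the silhouette coefficient is ≈1.00 for k=2 but drops to ≈0.62 for k=3. (The training cost is lower for k=3, but it generally decreases as k grows, so it cannot be used on its own to choose k.)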