In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Obtain the Spark session for this notebook: getOrCreate() reuses an already
# running session when one exists, otherwise it starts one named 'cluster'.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [0]:
# Load the sample K-Means data stored in LIBSVM format.
# NOTE(review): the path looks like a Databricks FileStore location — adjust
# it when running this notebook in another environment.
dataset = (
    spark.read
    .format('libsvm')
    .load('/FileStore/tables/sample_kmeans_data.txt')
)

In [0]:
# Preview the loaded rows: the frame exposes a `label` column and a sparse
# `features` vector column (long vectors are truncated in the printout).
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [0]:
# Clustering is unsupervised, so keep only the feature vectors and leave the
# `label` column behind.
final_data = dataset.select(dataset['features'])

In [0]:
# Configure the K-Means estimator: two clusters, and a fixed seed so the
# centroid initialisation is repeatable between runs.
kmeans = KMeans(k=2, seed=1)

In [0]:
# Fit the estimator on the feature-only frame to obtain the trained model.
model = kmeans.fit(dataset=final_data)

In [0]:
# Apply the trained model to the same rows; the result carries an extra
# `prediction` column holding each row's assigned cluster index.
predictions = model.transform(dataset=final_data)

In [0]:
# Score the clustering with the silhouette coefficient. The value lies in
# [-1, 1]; results close to 1 indicate compact, well-separated clusters.
# Per the Spark ML docs, the silhouette is computed with squared Euclidean
# distance unless the evaluator's distance measure is overridden.
evaluator = ClusteringEvaluator(
    predictionCol='prediction',
    featuresCol='features',
    metricName='silhouette',
)
silhouette = evaluator.evaluate(predictions)

# Backward-compatible alias: the earlier name implied this was a distance,
# but the value is the silhouette score. Prefer `silhouette` in new code.
squaredEuclideanDistance = silhouette

silhouette

Out[20]: 0.9997530305375207

In [0]:
# Fetch the learned centroids: one array per cluster, where the position in
# the list matches the cluster index used in the `prediction` column.
centers = model.clusterCenters()
centers

Out[22]: [array([9.1, 9.1, 9.1]), array([0.1, 0.1, 0.1])]

In [0]:
# Show every feature vector next to the cluster index it was assigned to.
predictions.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         1|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

