In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

spark = SparkSession.builder.appName('cluster').getOrCreate()

In [0]:
dataset = spark.read.csv('/FileStore/tables/seeds_dataset.csv', inferSchema=True, header=True)

In [0]:
dataset.head(1)

Out[18]: [Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [0]:
assembler = VectorAssembler(inputCols=dataset.columns, outputCol='features')

In [0]:
final_data = assembler.transform(dataset)

In [0]:
scaler = StandardScaler(inputCol='features', outputCol='ScaledFeatures')

In [0]:
scaler_model = scaler.fit(final_data)

In [0]:
final_data = scaler_model.transform(final_data)

In [0]:
final_data.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      ScaledFeatures|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[5.24452795332028...|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[5.11393027165175...|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[4.91116018695588...|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|     

In [0]:
kmeans = KMeans(featuresCol='ScaledFeatures', k=3)

In [0]:
model = kmeans.fit(final_data)

In [0]:
predictions = model.transform(final_data)

In [0]:
evaluator = ClusteringEvaluator(predictionCol='prediction',featuresCol='features',metricName='silhouette')
squaredEuclideanDistance = evaluator.evaluate(predictions)
squaredEuclideanDistance

Out[15]: 0.616267393520126

In [0]:
model.clusterCenters()

Out[16]: [array([ 4.93382436, 10.94691274, 37.30542404, 12.41332714,  8.60366812,
         1.82917353, 10.40106154]),
 array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
         2.41585013, 12.29286107]),
 array([ 4.06660859, 10.14191893, 35.84098009, 11.81592066,  7.52397236,
         3.1823335 , 10.39801233])]

In [0]:
predictions.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+----------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|            features|      ScaledFeatures|prediction|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+--------------------+--------------------+----------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|[15.26,14.84,0.87...|[5.24452795332028...|         0|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|[14.88,14.57,0.88...|[5.11393027165175...|         0|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|[14.29,14.09,0.90...|[4.91116018695588...|         0|
|13.84|    13.94|     0.8955

In [0]:
predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   67|
|         2|   69|
|         0|   74|
+----------+-----+

