In [15]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Create the Spark session (or reuse one that already exists) under the
# application name "BDA"; the cells below use it to read the input data.
spark = (
    SparkSession.builder
    .appName("BDA")
    .getOrCreate()
)

In [13]:
# Read data.csv, treating the first row as column names and letting Spark
# infer each column's type from the values.
df_pyspark = spark.read.csv(
    "data.csv",
    header=True,
    inferSchema=True,
)

# Show the inferred column names and types.
df_pyspark.printSchema()

root
 |-- x: integer (nullable = true)
 |-- y: integer (nullable = true)



In [19]:
# Pack the 'x' and 'y' columns into a single vector column named 'features',
# which is the input the scaler in the next step reads.
assemble = VectorAssembler(inputCols=['x', 'y'], outputCol='features')
assembled_data = assemble.transform(df_pyspark)

# Preview the first two rows, including the new 'features' column.
assembled_data.show(2)

+------+------+-------------------+
|     x|     y|           features|
+------+------+-------------------+
|151700|351102|[151700.0,351102.0]|
|155799|354358|[155799.0,354358.0]|
+------+------+-------------------+
only showing top 2 rows



In [20]:
from pyspark.ml.feature import StandardScaler
# Fit a StandardScaler (default settings) on the assembled 'features' vectors,
# then write the scaled vectors to a new 'standardized' column.
scale = StandardScaler(
    inputCol='features',
    outputCol='standardized',
)
data_scale = scale.fit(assembled_data)
data_scale_output = data_scale.transform(assembled_data)

# Preview the first two rows, including the new 'standardized' column.
data_scale_output.show(2)

+------+------+-------------------+--------------------+
|     x|     y|           features|        standardized|
+------+------+-------------------+--------------------+
|151700|351102|[151700.0,351102.0]|[1.74619409479561...|
|155799|354358|[155799.0,354358.0]|[1.79337701895228...|
+------+------+-------------------+--------------------+
only showing top 2 rows



In [22]:
# Fixed seed for KMeans centroid initialisation. Without an explicit seed the
# initialisation is not pinned, so the scores below could change from one
# kernel session to the next. The value itself is arbitrary.
KMEANS_SEED = 42

# Silhouette score for each candidate k. Scores are appended in loop order,
# so silhouette_score[j] belongs to k = j + 2.
silhouette_score = []

# Scores the 'prediction' column against the 'standardized' feature vectors
# using the silhouette metric with squared Euclidean distance.
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='standardized',
                                metricName='silhouette', distanceMeasure='squaredEuclidean')

# Sweep k = 2..9: fit KMeans on the scaled features, assign clusters, and
# record the silhouette score for each k.
for i in range(2, 10):
    KMeans_algo = KMeans(featuresCol='standardized', k=i, seed=KMEANS_SEED)

    KMeans_fit = KMeans_algo.fit(data_scale_output)

    # transform() adds the cluster assignment column that the evaluator reads.
    output = KMeans_fit.transform(data_scale_output)

    score = evaluator.evaluate(output)

    silhouette_score.append(score)

    print("Silhouette Score:", score)

Silhouette Score: 0.6222246389682424
Silhouette Score: 0.8591373156043869
Silhouette Score: 0.9037290460344806
Silhouette Score: 0.9129758594222266
Silhouette Score: 0.8046552590739979
Silhouette Score: 0.8013188211395024
Silhouette Score: 0.8877472990645328
Silhouette Score: 0.910296976495051
