In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark entry point: getOrCreate() hands back the already-running
# session when there is one, otherwise it starts a new one under the
# application name "cluster_exam".
spark = (
    SparkSession.builder
    .appName("cluster_exam")
    .getOrCreate()
)

In [3]:
from pyspark.ml.clustering import KMeans

import os

# Loads data
# The CSV location can be overridden with the SEEDS_DATASET_PATH environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original location is used unchanged.
dataset_path = os.environ.get(
    "SEEDS_DATASET_PATH",
    "/home/shoaibalauddin/spark-2.4.4-bin-hadoop2.7/python/pyspark_notebooks/dataset/kmeans/seeds_dataset.csv",
)
# header=True: the first row supplies the column names.
# inferSchema=True: Spark infers each column's type from the data.
dataset = spark.read.csv(dataset_path, header=True, inferSchema=True)

In [4]:
# Print the column names and inferred types of the freshly loaded frame.
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [6]:
# Show per-column summary statistics (count, mean, stddev, min, ...) to get a
# feel for how differently the raw columns are scaled.
dataset.describe().show()

+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|summary|              area|         perimeter|         compactness|   length_of_kernel|   width_of_kernel|asymmetry_coefficient|   length_of_groove|
+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+
|  count|               210|               210|                 210|                210|               210|                  210|                210|
|   mean|14.847523809523816|14.559285714285718|  0.8709985714285714|  5.628533333333335| 3.258604761904762|   3.7001999999999997|  5.408071428571429|
| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867|   1.5035589702547392|0.49148049910240543|
|    min|             10.59|             12.41|              0.8081|              4.899|            

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
# Pack every column of the loaded frame into one vector column, "features";
# the original columns are kept alongside it.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=dataset.columns,
)
final_dataset = assembler.transform(dataset)

In [10]:
# Check that the assembler appended the "features" vector column.
final_dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [11]:
from pyspark.ml.feature import StandardScaler

In [12]:
# Standardize the assembled vectors into a new column, "scalerFeatures".
# withStd / withMean are spelled out at their default values: each feature is
# divided by its standard deviation but is NOT mean-centred.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scalerFeatures",
    withStd=True,
    withMean=False,
)

In [13]:
# Fit the scaler and append its output column to final_dataset.
# Any existing output column is dropped first so this cell can be re-run:
# transform() refuses to write into a column that already exists, while drop()
# is a no-op when the column is absent (i.e. on the first run).
unscaled_dataset = final_dataset.drop(scaler.getOutputCol())
final_dataset = scaler.fit(unscaled_dataset).transform(unscaled_dataset)

In [14]:
# Check that the scaler appended the "scalerFeatures" vector column.
final_dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scalerFeatures: vector (nullable = true)



In [16]:
# Model training
# Cluster the standardized vectors ("scalerFeatures") into k=3 groups.
# K-means starts from randomly initialised centroids, so the seed is pinned to
# keep cluster ids, centroids and cost repeatable across kernel restarts.
# (Without it PySpark falls back to a seed derived from Python's hash(), which
# can differ between interpreter sessions.)
kmeans = KMeans(featuresCol="scalerFeatures", k=3, seed=1)
model = kmeans.fit(final_dataset)

In [17]:
# Evaluate clustering by computing Within Set Sum of Squared Errors.
# The cost is evaluated on final_dataset, the same frame the model was fitted
# on, so this is the training-set cost.
# NOTE(review): the PySpark docs mark KMeansModel.computeCost as deprecated in
# favour of ClusteringEvaluator / model.summary.trainingCost -- confirm it is
# still available before moving this notebook to a newer Spark release.
wssse = model.computeCost(final_dataset)
print("Final cost (Within Set Sum of Squared Errors)", wssse)

Final cost (Within Set Sum of Squared Errors) 428.60820118716356


In [20]:
# cluster center point
# The model was fitted on "scalerFeatures", so each centroid printed here is
# expressed in that scaled feature space, not in the raw column units.
centers = model.clusterCenters()
print("Center Points")
for center in centers:
    # The extra "\n" argument leaves a blank line after every centroid.
    print(center,"\n")

Center Points
[ 6.35645488 12.40730852 37.41990178 13.93860446  9.7892399   2.41585013
 12.29286107] 

[ 4.07497225 10.14410142 35.89816849 11.80812742  7.54416916  3.15410901
 10.38031464] 

[ 4.96198582 10.97871333 37.30930808 12.44647267  8.62880781  1.80061978
 10.41913733] 



In [22]:
# Assign every row to a cluster and display the scaled vector next to its
# predicted cluster id (show() prints the first 20 rows).
(
    model.transform(final_dataset)
    .select("scalerFeatures", "prediction")
    .show()
)

+--------------------+----------+
|      scalerFeatures|prediction|
+--------------------+----------+
|[5.24452795332028...|         2|
|[5.11393027165175...|         2|
|[4.91116018695588...|         2|
|[4.75650503761158...|         2|
|[5.54696468981581...|         2|
|[4.94209121682475...|         2|
|[5.04863143081749...|         2|
|[4.84929812721816...|         2|
|[5.71536696354628...|         0|
|[5.65006812271202...|         2|
|[5.24452795332028...|         2|
|[4.82180387844584...|         2|
|[4.77368894309428...|         2|
|[4.73588435103234...|         2|
|[4.72213722664617...|         2|
|[5.01426361985209...|         2|
|[4.80805675405968...|         2|
|[5.39230954047151...|         2|
|[5.05206821191403...|         2|
|[4.37158555479908...|         1|
+--------------------+----------+
only showing top 20 rows

