# Spark - KMeans Exercise

**Imports**

In [16]:
import os

import findspark

# findspark.init() must run before the pyspark imports below so that the
# Spark installation is discoverable by Python.
# Prefer the SPARK_HOME environment variable so the notebook is portable
# across machines; fall back to the original local install path when the
# variable is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/sedat/spark-3.3.2-bin-hadoop3')

from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler

**Start Spark session and read the CSV file**

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('seed_kmeans')
    .getOrCreate()
)

In [41]:
# Load the seeds dataset; the first row holds column names and column
# types are inferred from the data.
df = spark.read.csv(
    'seeds_dataset.csv',
    header=True,
    inferSchema=True,
)

In [42]:
# Inspect the column names and inferred types of the loaded data.
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [6]:
# Preview the first rows of the raw data.
df.show()

+-----+---------+-----------+------------------+------------------+---------------------+------------------+
| area|perimeter|compactness|  length_of_kernel|   width_of_kernel|asymmetry_coefficient|  length_of_groove|
+-----+---------+-----------+------------------+------------------+---------------------+------------------+
|15.26|    14.84|      0.871|             5.763|             3.312|                2.221|              5.22|
|14.88|    14.57|     0.8811| 5.553999999999999|             3.333|                1.018|             4.956|
|14.29|    14.09|      0.905|             5.291|3.3369999999999997|                2.699|             4.825|
|13.84|    13.94|     0.8955|             5.324|3.3789999999999996|                2.259|             4.805|
|16.14|    14.99|     0.9034|5.6579999999999995|             3.562|                1.355|             5.175|
|14.38|    14.21|     0.8951|             5.386|             3.312|   2.4619999999999997|             4.956|
|14.69|    14.49|  

**Total number of observations**

In [7]:
# Number of rows (observations) in the dataset.
df.count()

210

**Vector Assembler**

In [24]:
# Combine every column of df into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=df.columns,
    outputCol='features',
)

In [25]:
# Append the assembled 'features' vector column to the original columns.
output = assembler.transform(df)

In [26]:
# Confirm that the 'features' vector column was added.
output.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



**Standard Scaler**

In [27]:
# Scaler that reads the 'features' vector and writes 'scaledFeatures',
# using StandardScaler's default settings.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
)

In [28]:
# Fit the scaler on the assembled data, then apply it to the same data.
scaler_model = scaler.fit(output)
scaled_df = scaler_model.transform(output)

In [29]:
# Confirm that the 'scaledFeatures' vector column was added.
scaled_df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



**Final data and model**

In [32]:
# Keep only the scaled feature vector; it is the sole input to the model.
final_data = scaled_df.select('scaledFeatures')

In [33]:
# K-means with 3 clusters on the scaled feature vector.
# The seed is pinned so that centroid initialisation, and therefore the
# cluster assignments and their numbering, are reproducible from run to run.
# NOTE(review): outputs saved before the seed was pinned may number the
# clusters differently after a re-run.
kmeans = KMeans(k=3, featuresCol='scaledFeatures', seed=42)

In [34]:
# Fit the k-means estimator on the scaled features.
kmeans_model = kmeans.fit(final_data)

In [36]:
# Retrieve the fitted cluster centers (one array per cluster) and display them.
centers = kmeans_model.clusterCenters()
centers

[array([ 6.31670546, 12.37109759, 37.39491396, 13.91155062,  9.748067  ,
         2.39849968, 12.2661748 ]),
 array([ 4.06105916, 10.13979506, 35.80536984, 11.82133095,  7.50395937,
         3.27184732, 10.42126018]),
 array([ 4.87257659, 10.88120146, 37.27692543, 12.3410157 ,  8.55443412,
         1.81649011, 10.32998598])]

In [38]:
# Assign each observation to a cluster and preview the result.
predictions = kmeans_model.transform(final_data)
predictions.show()

+--------------------+----------+
|      scaledFeatures|prediction|
+--------------------+----------+
|[5.24452795332028...|         2|
|[5.11393027165175...|         2|
|[4.91116018695588...|         2|
|[4.75650503761158...|         2|
|[5.54696468981581...|         2|
|[4.94209121682475...|         2|
|[5.04863143081749...|         2|
|[4.84929812721816...|         2|
|[5.71536696354628...|         0|
|[5.65006812271202...|         0|
|[5.24452795332028...|         2|
|[4.82180387844584...|         2|
|[4.77368894309428...|         2|
|[4.73588435103234...|         2|
|[4.72213722664617...|         2|
|[5.01426361985209...|         2|
|[4.80805675405968...|         2|
|[5.39230954047151...|         2|
|[5.05206821191403...|         2|
|[4.37158555479908...|         1|
+--------------------+----------+
only showing top 20 rows

