## Importing the libraries

In [1]:
# findspark.init() is called before the pyspark import below on purpose: per
# the findspark documentation it locates the local Spark installation and puts
# pyspark on sys.path, so the order of these lines matters.
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session used by the rest of the notebook, or reuse one
# that is already running.
spark = (
    SparkSession.builder
    .appName('Clustering_seed')
    .getOrCreate()
)

## Preparing the data

### Importing the seeds data

In [3]:
# Read the seeds CSV: the first row supplies the column names and the column
# types are inferred from the data. Print the resulting schema as a sanity check.
df = spark.read.csv(
    path='seeds_dataset.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [4]:
# Show every field of the first record, one value per line.
first_row = df.head(1)[0]
print(*first_row, sep='\n')

15.26
14.84
0.871
5.763
3.312
2.221
5.22


### Vectorizing the features

In [5]:
from pyspark.ml.feature import VectorAssembler
# Pack every column of `df` into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
data = assembler.transform(df)

In [6]:
# Keep only the assembled vector column; the separate input columns are
# dropped from `data` from here on.
data = data.select(data['features'])

### Scaling the features

In [7]:
from pyspark.ml.feature import StandardScaler
# Fit the scaler on the 'features' column and append 'scaled_features'.
#
# The projection to 'features' is what makes this cell safe to re-run: `data`
# is reassigned below, so on a second execution it already carries a
# 'scaled_features' column, and StandardScaler refuses to fit/transform a
# frame in which its output column already exists. Working from the projection
# always starts from a frame without that column.
#
# NOTE: with the library defaults (withStd=True, withMean=False) the values
# are divided by their standard deviation but not centred.
features_only = data.select('features')
scaler = StandardScaler(inputCol='features', outputCol='scaled_features').fit(features_only)
data = scaler.transform(features_only)

In [8]:
# Preview the raw and scaled vectors side by side (first 20 rows, long cells
# truncated -- the same settings `show()` uses by default).
data.show(n=20, truncate=True)

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[15.26,14.84,0.87...|[5.24452795332028...|
|[14.88,14.57,0.88...|[5.11393027165175...|
|[14.29,14.09,0.90...|[4.91116018695588...|
|[13.84,13.94,0.89...|[4.75650503761158...|
|[16.14,14.99,0.90...|[5.54696468981581...|
|[14.38,14.21,0.89...|[4.94209121682475...|
|[14.69,14.49,0.87...|[5.04863143081749...|
|[14.11,14.1,0.891...|[4.84929812721816...|
|[16.63,15.46,0.87...|[5.71536696354628...|
|[16.44,15.25,0.88...|[5.65006812271202...|
|[15.26,14.85,0.86...|[5.24452795332028...|
|[14.03,14.16,0.87...|[4.82180387844584...|
|[13.89,14.02,0.88...|[4.77368894309428...|
|[13.78,14.06,0.87...|[4.73588435103234...|
|[13.74,14.05,0.87...|[4.72213722664617...|
|[14.59,14.28,0.89...|[5.01426361985209...|
|[13.99,13.83,0.91...|[4.80805675405968...|
|[15.69,14.75,0.90...|[5.39230954047151...|
|[14.7,14.21,0.915...|[5.05206821191403...|
|[12.72,13.57,0.86...|[4.3715855

## Fitting and predicting with K-Means clustering

### Fitting the model

In [9]:
from pyspark.ml.clustering import KMeans
# Cluster the scaled vectors into three groups. The fixed seed keeps the
# centroid initialisation repeatable between runs.
kmeans = KMeans(featuresCol='scaled_features', k=3, seed=42)
model = kmeans.fit(data)

# Report the training cost from the model summary, labelled WSSE.
wsse = model.summary.trainingCost
print('WSSE: {:.3f}'.format(wsse))

WSSE: 428.608


### Predictions with the model

In [10]:
# Assign each row to a cluster, then list how many rows landed in each
# cluster, ordered by cluster id.
prediction = model.transform(data)
cluster_sizes = prediction.groupBy('prediction').count()
cluster_sizes.orderBy('prediction').show()

+----------+-----+
|prediction|count|
+----------+-----+
|         0|   71|
|         1|   67|
|         2|   72|
+----------+-----+

