In [1]:
# Boiler Plate
import findspark
import numpy as np
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('cluster2').getOrCreate()

In [2]:
dataset = spark.read.csv('seeds_dataset.csv', header=True, inferSchema=True)

In [3]:
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [4]:
dataset.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

Note that there are no labels for the type of seed. What we do know is that there were three different varities of wheat. We'll use K = 3.

In [5]:
from pyspark.ml.clustering import KMeans

In [6]:
from pyspark.ml.feature import VectorAssembler

In [8]:
dataset.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [10]:
assembler = VectorAssembler(inputCols=dataset.columns, outputCol='features')

In [11]:
final_data = assembler.transform(dataset)

In [12]:
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



Scaling the data...

In [13]:
from pyspark.ml.feature import StandardScaler

In [14]:
scaler = StandardScaler(inputCol='features',outputCol='scaledFeatures')

In [15]:
scaler_model = scaler.fit(final_data)

In [16]:
final_data = scaler_model.transform(final_data)

In [17]:
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

Now we have a features and a scaled features column. This becomes more important when there are larger changes of magnitude in the data.

In [25]:
kmeans = KMeans(featuresCol='scaledFeatures',k=2)

In [26]:
model = kmeans.fit(final_data)

In [27]:
print("WSSE")
print(model.computeCost(final_data))

WSSE
656.032839539751


In [28]:
centers = model.clusterCenters()

In [29]:
print(centers)

[array([  4.44396468,  10.48536862,  36.54671035,  12.05177027,
         8.0111241 ,   2.54558929,  10.33965102]), array([  6.2407035 ,  12.29350122,  37.40324608,  13.82968554,
         9.69123508,   2.31478489,  12.15051313])]


In [30]:
model.transform(final_data).select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         1|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 20 rows



And these are our clusters!