# Kmeans Clustering using Spark Mllib

Import the necessary Pyspark functions. Since we have already defined the Spark Context,sc is the keyword which will populate it.

In [1]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName('vinodh_kmeans')
sc

<pyspark.context.SparkContext at 0x7f3261bc9550>

Load the dataset from Cloudera. It is important to load a dataset with values that can be clustered and since we are using numpy, sqrt be sure to use a numeric dataset.  

In [2]:
#Load and Parse the data
data = sc.textFile('iris_data.csv')
parsedData1 = data.map(lambda line: array([x for x in line.split(',')]))

In [3]:
parsedData1

PythonRDD[2] at RDD at PythonRDD.scala:43

Check the first five rows to make sure the data is loaded properly

In [4]:
parsedData1.take(5)

[array([u'5.1', u'3.5', u'1.4', u'0.2', u'Iris-setosa'], 
       dtype='<U11'), array([u'4.9', u'3.0', u'1.4', u'0.2', u'Iris-setosa'], 
       dtype='<U11'), array([u'4.7', u'3.2', u'1.3', u'0.2', u'Iris-setosa'], 
       dtype='<U11'), array([u'7.6', u'3.1', u'1.5', u'0.2', u'Iris-setosa'], 
       dtype='<U11'), array([u'5.0', u'3.6', u'1.4', u'0.2', u'Iris-setosa'], 
       dtype='<U11')]

In [5]:
#Take only the parameters we use to predict the class
params_only =  parsedData1.map(lambda x : array([float(x[0]),float(x[1]),float(x[2]),float(x[3])]))


In [6]:
#Check the contents
params_only

PythonRDD[4] at RDD at PythonRDD.scala:43

In [7]:
params_only.take(5)

[array([ 5.1,  3.5,  1.4,  0.2]),
 array([ 4.9,  3. ,  1.4,  0.2]),
 array([ 4.7,  3.2,  1.3,  0.2]),
 array([ 7.6,  3.1,  1.5,  0.2]),
 array([ 5. ,  3.6,  1.4,  0.2])]

Evaluate clustering by computing Within Set Sum of Squared Errors(WSSSE)

In [2]:
from pyspark.mllib.clustering import KMeans, KMeansModel

In [3]:

def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

In [12]:
WSSSE = (params_only.map(lambda point: error(point)).reduce(lambda x, y: x + y))
print ("Within Set sum of square error = " + str(WSSSE))

Within Set sum of square error = 140.891195804


Repeat with a number of clusters specified within a range

In [14]:
for i in range(1,10):
    clusters = KMeans.train(params_only,i,maxIterations=100,runs=100,initializationMode="random")
    WSSSE = (params_only.map(lambda point: error(point)).reduce(lambda x, y: x + y))
    print ("With" + str(i)+ "cluster : Within set sum of squared error = " + str(WSSSE))
    

With1cluster : Within set sum of squared error = 298.709116551
With2cluster : Within set sum of squared error = 140.891195804
With3cluster : Within set sum of squared error = 110.27617199
With4cluster : Within set sum of squared error = 105.358601825
With5cluster : Within set sum of squared error = 93.432614554
With6cluster : Within set sum of squared error = 89.2326694169
With7cluster : Within set sum of squared error = 79.3055707021
With8cluster : Within set sum of squared error = 74.9570367531
With9cluster : Within set sum of squared error = 71.0354012011
