In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

## Create dataframe
https://spark.apache.org/docs/latest/api/python/pyspark.ml.html?highlight=kmeans#pyspark.ml.clustering.KMeans

In [2]:
#Load the data and create an RDD (16 pixels and label)
pen_raw = sc.textFile("../Data/penbased.dat", 4).map(lambda x:  x.split(", ")).map(lambda row: [float(x) for x in row])

In [3]:
#Create a DataFrame
from pyspark.sql.types import *
from pyspark.sql import Row
penschema = StructType([
    StructField("pix1",DoubleType(),True),
    StructField("pix2",DoubleType(),True),
    StructField("pix3",DoubleType(),True),
    StructField("pix4",DoubleType(),True),
    StructField("pix5",DoubleType(),True),
    StructField("pix6",DoubleType(),True),
    StructField("pix7",DoubleType(),True),
    StructField("pix8",DoubleType(),True),
    StructField("pix9",DoubleType(),True),
    StructField("pix10",DoubleType(),True),
    StructField("pix11",DoubleType(),True),
    StructField("pix12",DoubleType(),True),
    StructField("pix13",DoubleType(),True),
    StructField("pix14",DoubleType(),True),
    StructField("pix15",DoubleType(),True),
    StructField("pix16",DoubleType(),True),
    StructField("label",DoubleType(),True)
])

dfpen = ss.createDataFrame(pen_raw.map(lambda x : Row(x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],x[8],x[9],x[10],x[11],x[12],x[13],x[14],x[15],x[16])), penschema)

In [4]:
dfpen.show()

+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| pix1| pix2|pix3| pix4| pix5| pix6| pix7| pix8| pix9|pix10|pix11|pix12|pix13|pix14|pix15|pix16|label|
+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 47.0|100.0|27.0| 81.0| 57.0| 37.0| 26.0|  0.0|  0.0| 23.0| 56.0| 53.0|100.0| 90.0| 40.0| 98.0|  8.0|
|  0.0| 89.0|27.0|100.0| 42.0| 75.0| 29.0| 45.0| 15.0| 15.0| 37.0|  0.0| 69.0|  2.0|100.0|  6.0|  2.0|
|  0.0| 57.0|31.0| 68.0| 72.0| 90.0|100.0|100.0| 76.0| 75.0| 50.0| 51.0| 28.0| 25.0| 16.0|  0.0|  1.0|
|  0.0|100.0| 7.0| 92.0|  5.0| 68.0| 19.0| 45.0| 86.0| 34.0|100.0| 45.0| 74.0| 23.0| 67.0|  0.0|  4.0|
|  0.0| 67.0|49.0| 83.0|100.0|100.0| 81.0| 80.0| 60.0| 60.0| 40.0| 40.0| 33.0| 20.0| 47.0|  0.0|  1.0|
|100.0|100.0|88.0| 99.0| 49.0| 74.0| 17.0| 47.0|  0.0| 16.0| 37.0|  0.0| 73.0| 16.0| 20.0| 20.0|  6.0|
|  0.0|100.0| 3.0| 72.0| 26.0| 35.0| 85.0| 35.0|100.0| 71.0| 73.0| 97.0| 

## Create dataframe with a feature vector (Exclude the label)

In [5]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", inputCols=dfpen.columns[0:-1]) #except the last col.
penlpoints = va.transform(dfpen)

## Apply KMeans algorithm to the data frame

In [6]:
from pyspark.ml.clustering import KMeans
kmeans =  KMeans(k = 10, maxIter = 200, tol = 0.1) # k = 10 as there are 10 different handwritten numbers.
model = kmeans.fit(penlpoints)

## Evaluation

In [7]:
# Evaluate clustering by computing Within Set Sum of Squared Errors
wssse = model.computeCost(penlpoints) 
print("Within Set Sum of Squared Errors = " + str(wssse))

Within Set Sum of Squared Errors = 45280957.45483561


In [8]:
# Average distance from the center (max = 100)
import math
print("Average distance from the center = " + str(math.sqrt(wssse/pen_raw.count())))

Average distance from the center = 67.58917632982867


In [9]:
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[ 4.03370787 62.0247191  30.71123596 73.0741573  72.35280899 89.53595506
 89.69101124 93.71685393 77.94494382 73.7988764  70.23483146 50.19550562
 56.28314607 24.08539326 50.03820225  1.04269663]
[48.06390328 96.48618307 23.25734024 79.91450777  3.66407599 54.61830743
 43.97668394 46.32210708 86.64075993 55.19948187 87.21675302 59.74438687
 71.87737478 30.44214162 60.22711572  0.10535406]
[88.25108601 98.0807993  53.39443962 87.79148566 22.04344049 61.84361425
  7.56733275 30.90443093 34.33709818  9.51520417 79.12945265 13.89574283
 60.08774978 28.24847958 12.392702   21.27106864]
[26.65292096 89.98969072 18.11168385 60.59621993 61.08419244 37.43298969
 43.56872852  2.84536082  6.46735395 18.25945017  8.54295533 63.95876289
 40.33161512 93.51890034 97.84364261 98.104811  ]
[45.66170501 90.57239513 27.49526387 70.35994587 35.14749662 30.68200271
 30.8849797   2.11096076 43.87550744 19.1826793  82.17320704 54.35723951
 88.74289581 90.20162382 30.40595399 90.73612991]
[2

In [10]:
# prediction is a group, not an actual label.
model.transform(penlpoints).select('label', 'prediction').groupBy('label', 'prediction').count().show(100)

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  4.0|         8|    4|
|  0.0|         4|  352|
|  4.0|         9|    3|
|  1.0|         0|  584|
|  0.0|         5|  636|
|  5.0|         3|  555|
|  1.0|         8|  263|
|  0.0|         6|   22|
|  1.0|         2|   27|
|  7.0|         7|  812|
|  7.0|         4|    2|
|  6.0|         8|    1|
|  8.0|         4|  385|
|  6.0|         2|  963|
|  7.0|         6|    1|
|  8.0|         6|  398|
|  5.0|         6|    3|
|  1.0|         7|   41|
|  2.0|         9|    1|
|  7.0|         2|    3|
|  3.0|         8|    1|
|  5.0|         9|  304|
|  2.0|         7|   50|
|  9.0|         9|  656|
|  2.0|         0|   14|
|  9.0|         6|    1|
|  9.0|         8|   13|
|  6.0|         1|    3|
|  0.0|         0|    2|
|  7.0|         1|    1|
|  4.0|         0|   14|
|  4.0|         1|  961|
|  3.0|         9|  899|
|  0.0|         1|    9|
|  7.0|         0|  142|
|  9.0|         2|    3|
|  8.0|         9|   61|
