# Clustering in Spark

In [1]:
#import modules

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.regression import LinearRegression

In [2]:
#Create Spark Session

appName = "Clusterig in Spark"
spark = SparkSession.builder.appName("appName").config("spark.some.config.option","some-value").getOrCreate()

In [3]:
#Load the data

customers = spark.read.csv("dataset/customers.csv",inferSchema=True,header=True)
customers.show(3)

+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|   CustomerName|Age|MaritalStatus|IncomeRange|Gender|TotalChildren|ChildrenAtHome|Education|Occupation|HomeOwner|Cars|
+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
|    Aaron Adams| 42|            0|      50000|     0|            0|             0|        3|         2|        1|   1|
|Aaron Alexander| 40|            1|      50000|     0|            0|             0|        2|         2|        1|   2|
|    Aaron Allen| 63|            0|      25000|     0|            2|             1|        2|         1|        1|   2|
+---------------+---+-------------+-----------+------+-------------+--------------+---------+----------+---------+----+
only showing top 3 rows



# Prepare the data

In [4]:
#Define Assembler

assembler = VectorAssembler(inputCols = ["Age","MaritalStatus","IncomeRange","Gender","TotalChildren","ChildrenAtHome","Education","HomeOwner","Cars"],outputCol ="features")

data = assembler.transform(customers).select("CustomerName","features")

data.show(truncate=False,n=5)


+---------------+------------------------------------------+
|CustomerName   |features                                  |
+---------------+------------------------------------------+
|Aaron Adams    |[42.0,0.0,50000.0,0.0,0.0,0.0,3.0,1.0,1.0]|
|Aaron Alexander|[40.0,1.0,50000.0,0.0,0.0,0.0,2.0,1.0,2.0]|
|Aaron Allen    |[63.0,0.0,25000.0,0.0,2.0,1.0,2.0,1.0,2.0]|
|Aaron Baker    |[56.0,1.0,50000.0,0.0,4.0,2.0,2.0,1.0,2.0]|
|Aaron Bryant   |[72.0,0.0,75000.0,0.0,4.0,0.0,4.0,1.0,2.0]|
+---------------+------------------------------------------+
only showing top 5 rows



In [5]:
#define kmeans clustering model

kmeans = KMeans(featuresCol = assembler.getOutputCol(),predictionCol = "cluster" , k=5)
model = kmeans.fit(data)
print ("Model is successfully trained")


Model is successfully trained



# Print centroid for each cluster

In [6]:
#find centriod
centers = model.clusterCenters()
print("Cluster centers: ")
for center in centers:
        print(center)

Cluster centers: 
[5.60711289e+01 5.83804487e-01 7.50000000e+04 5.03921211e-01
 2.17308043e+00 8.16706183e-01 3.73244574e+00 7.23326646e-01
 1.38063104e+00]
[5.31013005e+01 4.17180014e-01 2.50000000e+04 4.80492813e-01
 1.41512663e+00 6.08487337e-01 2.31622177e+00 5.93086927e-01
 1.11464750e+00]
[5.53417813e+01 5.72411296e-01 1.00000000e+05 4.97103548e-01
 2.54380883e+00 1.54272266e+00 3.46198407e+00 7.16509776e-01
 1.94532947e+00]
[5.82794840e+01 6.22850123e-01 1.50000000e+05 4.79729730e-01
 2.07248157e+00 3.20638821e+00 3.41461916e+00 6.48648649e-01
 3.10995086e+00]
[5.19737441e+01 5.26868545e-01 5.00000000e+04 4.93961141e-01
 1.34552774e+00 4.98337126e-01 3.23035183e+00 6.62699107e-01
 1.14615789e+00]


# Cluster the Data

In [7]:
#Clustering

prediction = model.transform(data) #cluster given data
prediction.groupBy("cluster").count().orderBy("cluster").show() #count menbers in each cluster
prediction.select("CustomerName", "cluster").show(4) #show several clustered data

+-------+-----+
|cluster|count|
+-------+-----+
|      0| 5483|
|      1| 2922|
|      2| 2762|
|      3| 1628|
|      4| 5713|
+-------+-----+

+---------------+-------+
|   CustomerName|cluster|
+---------------+-------+
|    Aaron Adams|      4|
|Aaron Alexander|      4|
|    Aaron Allen|      1|
|    Aaron Baker|      4|
+---------------+-------+
only showing top 4 rows

