## Import modules and create Spark session

In [1]:
#import modules
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

#create session
appName = "Clustering in Spark"
spark = SparkSession \
    .builder \
    .appName(appName) \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

NameError: name 'SparkSession' is not defined

## Read file into dataFrame
Description for each column data:
- CustomerName: name of customer
- Age: age of customer (in year)
- MaritalStatus: (1=married, 0=not married)
- IncomeRange: income per year (in USD)
- Gender: (1=female, 2=male)
- TotalChildren: number of children customer has
- ChildrenAtHome: number of children living with customer (in the same home)
- Education: (1=high school, 2=bachelor, 3=master, 4=PhD, 5=Post-doc)
- Occupation: (0=unskilled manual work until 5=professional)
- HomeOwner: (1=owning a home, 0=not owning a home)
- Cars: number of car customer has

In [None]:
#read csv file using automatically inferred schema
customers = spark.read.csv(
    'dataset/customers.csv', inferSchema=True, header=True)
customers.show(3)

## Prepare the data

In [None]:
#define assembler
assembler = VectorAssembler(inputCols = [
    "Age", "MaritalStatus", "IncomeRange", "Gender", "TotalChildren", 
    "ChildrenAtHome", "Education", "Occupation", "HomeOwner", "Cars"], 
                            outputCol="features")
data = assembler.transform(customers).select('CustomerName', 'features')
data.show(truncate = False, n=3)

## Create k-Means clustering model

In [None]:
#define kMeans clustering algorithm
kmeans = KMeans(
    featuresCol=assembler.getOutputCol(), 
    predictionCol="cluster", k=5)
model = kmeans.fit(data)
print ("Model is successfully trained!")

## Print centroid for each cluster

In [None]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

## Cluster the data

In [None]:
prediction = model.transform(data)#cluster given data
prediction.groupBy("cluster").count().orderBy("cluster").show()#count members in each cluster
prediction.select('CustomerName', 'cluster').show(5)#show several clustered data
prediction.show(5)