# Importing machine learning libraries and packages

In [1]:
# findspark.init() locates the local Spark installation and makes the
# pyspark package importable, so it has to run before `import pyspark`.
import findspark
findspark.init()
import pyspark

In [2]:
# Import the modules needed for the clustering workflow
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
# NOTE(review): SQLContext is not referenced in any cell visible here -- confirm it is still needed
from pyspark.sql import SQLContext

# Create the Spark session (getOrCreate() reuses an already-running one)
appName = "Clustering di Apache Spark 2"
spark = (
    SparkSession.builder
    .appName(appName)
    # NOTE(review): this key/value looks like a placeholder -- confirm it is needed
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Load the dataset from a CSV file

In [10]:
# Load the CSV file into a DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
latihan = spark.read.csv(path='latihan.csv', header=True, inferSchema=True)

# Preview the first 10 rows
latihan.show(n=10)

+----+-------+-------+--------+
|Data|Fitur_X|Fitur_Y|Kelompok|
+----+-------+-------+--------+
|   1|      5|      8|       2|
|   2|      5|      6|       2|
|   3|      9|      3|       2|
|   4|      1|      4|       1|
|   5|      7|      8|       3|
|   6|      1|      2|       1|
|   7|      2|      2|       1|
|   8|      9|      4|       3|
|   9|      5|     10|       2|
|  10|      6|      6|       2|
+----+-------+-------+--------+



# Prepare training data

In [11]:
# Build an assembler that packs the individual feature columns into a
# single vector column named "features".
feature_columns = ["Fitur_X", "Fitur_Y"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep only the row identifier and the assembled vector for training
assembled = assembler.transform(latihan)
train = assembled.select('Data', 'features')
train.show(n=10, truncate=False)

+----+----------+
|Data|features  |
+----+----------+
|1   |[5.0,8.0] |
|2   |[5.0,6.0] |
|3   |[9.0,3.0] |
|4   |[1.0,4.0] |
|5   |[7.0,8.0] |
|6   |[1.0,2.0] |
|7   |[2.0,2.0] |
|8   |[9.0,4.0] |
|9   |[5.0,10.0]|
|10  |[6.0,6.0] |
+----+----------+



# Creating a k-Means Clustering model

In [25]:
# Define the clustering algorithm. The fixed seed makes the centroid
# initialisation repeatable between runs.
# NOTE(review): maxIter=3 is far below Spark's default of 20 -- confirm the
# cap is intentional for this small dataset.
kmeans_params = {
    "featuresCol": assembler.getOutputCol(),
    "predictionCol": "cluster",
    "k": 3,
    "maxIter": 3,
    "seed": 0,
}
kmeans = KMeans(**kmeans_params)

# Train the model by calling .fit() on the assembled training data
model = kmeans.fit(train)
print("Model selesai dibuat!")

Model selesai dibuat!


# Find the centroid (cluster center) of each cluster

In [26]:
# Fetch the fitted cluster centers from the trained model
centers = model.clusterCenters()

# Print a heading, then one center per line
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[5.6 7.6]
[1.33333333 2.66666667]
[9.  3.5]


# Predicting clusters

In [27]:
# Run the fitted model on the training rows; the assigned cluster index
# is written to the "cluster" column.
prediction = model.transform(train)

# Number of rows assigned to each cluster, ordered by cluster index
cluster_sizes = prediction.groupBy("cluster").count().orderBy("cluster")
cluster_sizes.show()

# Cluster assignment for the first 10 rows
prediction.select('Data', 'cluster').show(n=10)

+-------+-----+
|cluster|count|
+-------+-----+
|      0|    5|
|      1|    3|
|      2|    2|
+-------+-----+

+----+-------+
|Data|cluster|
+----+-------+
|   1|      0|
|   2|      0|
|   3|      2|
|   4|      1|
|   5|      0|
|   6|      1|
|   7|      1|
|   8|      2|
|   9|      0|
|  10|      0|
+----+-------+



In [33]:
import pyspark.sql.functions as F
# NOTE(review): this wildcard import shadows Python built-ins such as
# sum, min, max, abs and round for every later cell. It is kept only in case
# later cells rely on it; prefer the `F.` prefix and remove it once they don't.
from pyspark.sql.functions import *

# Display the predictions with the cluster index shifted from 0-based to
# 1-based. `prediction` itself is not modified; only the shown copy is.
# F.col is used so this line does not depend on the wildcard import above.
prediction.withColumn("cluster", F.col("cluster") + 1).show(10)

+----+----------+-------+
|Data|  features|cluster|
+----+----------+-------+
|   1| [5.0,8.0]|      1|
|   2| [5.0,6.0]|      1|
|   3| [9.0,3.0]|      3|
|   4| [1.0,4.0]|      2|
|   5| [7.0,8.0]|      1|
|   6| [1.0,2.0]|      2|
|   7| [2.0,2.0]|      2|
|   8| [9.0,4.0]|      3|
|   9|[5.0,10.0]|      1|
|  10| [6.0,6.0]|      1|
+----+----------+-------+

