In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (an existing session is reused
# by getOrCreate, otherwise a new one named "cluster_project" is started).
spark = (
    SparkSession.builder
    .appName("cluster_project")
    .getOrCreate()
)

In [2]:
# Load the raw CSV; the first row supplies column names and Spark infers
# each column's type from the values.
data = spark.read.csv(
    "dataset/kmeans/hack_data.csv",
    header=True,
    inferSchema=True,
)

In [8]:
# Show the column names and the types Spark inferred while reading the CSV.
data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [10]:
# Columns packed into the feature vector. The string-typed `Location`
# column is not part of this list.
feat_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

In [4]:
from pyspark.ml.feature import VectorAssembler

In [11]:
# Combine the selected columns into a single vector column named "features";
# the original columns are kept alongside it.
assembler = VectorAssembler(
    inputCols=feat_cols,
    outputCol="features",
)
final_dataset = assembler.transform(data)

In [12]:
# Check the schema after assembling: a `features` vector column should now
# appear after the original columns.
final_dataset.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)



In [13]:
from pyspark.ml.feature import StandardScaler

In [15]:
# Scale the assembled feature vector into a new "scalerFeatures" column.
#
# The scaled frame is derived from `assembler.transform(data)` rather than
# from `final_dataset` itself, so this cell can be re-run safely: feeding the
# already-scaled `final_dataset` back into the scaler would try to add a
# "scalerFeatures" column that already exists.
scaler = StandardScaler(inputCol="features", outputCol="scalerFeatures")
assembled_dataset = assembler.transform(data)
scaler_model = scaler.fit(assembled_dataset)
final_dataset = scaler_model.transform(assembled_dataset)

In [17]:
# Within Set Sum of Squared Errors (WSSSE) comparison: build one K-Means
# estimator per candidate cluster count, both reading the scaled features.
from pyspark.ml.clustering import KMeans

# Explicit seed so centroid initialisation (and therefore the cluster
# assignments and WSSSE values) repeats across kernel sessions.
KMEANS_SEED = 42
kmean2 = KMeans(featuresCol="scalerFeatures", k=2, seed=KMEANS_SEED)
kmean3 = KMeans(featuresCol="scalerFeatures", k=3, seed=KMEANS_SEED)

In [18]:
# Fit the K=2 estimator first, then the K=3 estimator, on the same dataset.
model2, model3 = (
    estimator.fit(final_dataset) for estimator in (kmean2, kmean3)
)

In [21]:
# Report the WSSSE of each fitted model on its training data, read from the
# model's training summary. `KMeansModel.computeCost` was deprecated in
# Spark 2.4 and removed in Spark 3.0, so it is no longer called here
# (`summary.trainingCost` needs Spark 2.4 or newer).
print("Model 2 wssse :",model2.summary.trainingCost)
print("Model 3 wssse :",model3.summary.trainingCost)

Model 2 wssse : 601.7707512676716
Model 3 wssse : 434.75507308487647


In [25]:
# Elbow sweep: fit K-Means for K = 2..20 and print the WSSSE of every fit.
# After the loop, `model` stays bound to the last fit (K=20).
KMEANS_SEED = 42  # explicit seed keeps the sweep repeatable across sessions
for k in range(2,21):
    kmeans = KMeans(featuresCol='scalerFeatures',k=k, seed=KMEANS_SEED)
    model = kmeans.fit(final_dataset)
    # Training-set cost from the fit summary; replaces
    # `model.computeCost(...)`, which was removed in Spark 3.0.
    wssse = model.summary.trainingCost
    print("With K={}".format(k))
    print("Within Set Sum of Squared Errors = " + str(wssse))
    print('--'*30)

With K=2
Within Set Sum of Squared Errors = 601.7707512676716
------------------------------------------------------------
With K=3
Within Set Sum of Squared Errors = 434.75507308487647
------------------------------------------------------------
With K=4
Within Set Sum of Squared Errors = 267.1336116887891
------------------------------------------------------------
With K=5
Within Set Sum of Squared Errors = 248.83046300020916
------------------------------------------------------------
With K=6
Within Set Sum of Squared Errors = 229.22882667544616
------------------------------------------------------------
With K=7
Within Set Sum of Squared Errors = 221.0332248481683
------------------------------------------------------------
With K=8
Within Set Sum of Squared Errors = 202.31081630497334
------------------------------------------------------------
With K=9
Within Set Sum of Squared Errors = 196.1172957916953
------------------------------------------------------------
With K=10
Wi

In [26]:
# Rows per predicted cluster. `model` is whatever was fitted most recently;
# straight after the K sweep that is the K=20 model.
(
    model.transform(final_dataset)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|        12|   29|
|         1|   23|
|        13|    9|
|        16|   11|
|         6|   15|
|         3|   47|
|         5|   14|
|        19|   12|
|        15|   20|
|         9|   20|
|        17|   18|
|         4|   22|
|         8|    8|
|         7|   12|
|        10|   14|
|        11|   11|
|        14|   10|
|         2|   16|
|         0|   14|
|        18|    9|
+----------+-----+



In [28]:
# Rows per predicted cluster for the K=2 model.
(
    model2.transform(final_dataset)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [29]:
# Rows per predicted cluster for the K=3 model.
(
    model3.transform(final_dataset)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   88|
|         0|   79|
+----------+-----+

