In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the notebook's Spark session (or reuse an existing one) under the app name "cluster".
spark = (
    SparkSession.builder
    .appName("cluster")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/01 11:07:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the CSV, treating the first line as a header and letting Spark infer column types.
dataset = spark.read.csv(
    "hack_data.csv",
    header=True,
    inferSchema=True,
)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [4]:
# Peek at the first row to sanity-check the parsed column names and values.
dataset.head()

Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)

In [5]:
from pyspark.ml.clustering import KMeans

In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
# List the column names so the numeric feature columns can be picked out below.
dataset.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [8]:
# Columns used as clustering features: every column of the dataset except
# the string-typed "Location".
feat_cols = [
    "Session_Connection_Time",
    "Bytes Transferred",
    "Kali_Trace_Used",
    "Servers_Corrupted",
    "Pages_Corrupted",
    "WPM_Typing_Speed",
]

In [9]:
# Assembler that packs the selected feature columns into one vector column named "features".
assembler = VectorAssembler(
    inputCols=feat_cols,
    outputCol="features",
)

In [10]:
# Append the assembled "features" vector column to the dataset.
final_data = assembler.transform(dataset)

In [11]:
# Confirm that the schema now contains the "features" vector column.
final_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)



In [12]:
from pyspark.ml.feature import StandardScaler

In [13]:
# Scaler that reads the "features" vector column and writes its result to "scaledFeatures".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

In [14]:
# Fit the scaler on the assembled feature vectors to obtain a fitted scaler model.
scaler_model = scaler.fit(final_data)

                                                                                

In [15]:
# Apply the fitted scaler; the result gains a "scaledFeatures" vector column.
cluster_final_data = scaler_model.transform(final_data)

In [16]:
# Verify that "scaledFeatures" was added alongside the raw "features" column.
cluster_final_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [17]:
# K-Means estimator with two clusters over the scaled feature vectors.
# K-Means initialisation is random; an explicit seed makes repeated runs
# (e.g. Restart Kernel -> Run All) start from the same initialisation
# instead of a session-dependent default.
kmeans2 = KMeans(featuresCol="scaledFeatures", k=2, seed=42)

In [18]:
# K-Means estimator with three clusters over the scaled feature vectors.
# K-Means initialisation is random; an explicit seed makes repeated runs
# (e.g. Restart Kernel -> Run All) start from the same initialisation
# instead of a session-dependent default.
kmeans3 = KMeans(featuresCol="scaledFeatures", k=3, seed=42)

In [19]:
# Fit the two-cluster K-Means estimator on the scaled data.
model_k2 = kmeans2.fit(cluster_final_data)

                                                                                

In [20]:
# Fit the three-cluster K-Means estimator on the scaled data.
model_k3 = kmeans3.fit(cluster_final_data)

In [21]:
# Apply the three-cluster model; the resulting DataFrame gains an integer "prediction" column.
model_k3.transform(cluster_final_data)

DataFrame[Session_Connection_Time: double, Bytes Transferred: double, Kali_Trace_Used: int, Servers_Corrupted: double, Pages_Corrupted: double, Location: string, WPM_Typing_Speed: double, features: vector, scaledFeatures: vector, prediction: int]

In [22]:
# Preview the cluster label assigned to the first rows by the three-cluster model.
(
    model_k3
    .transform(cluster_final_data)
    .select("prediction")
    .show()
)

+----------+
|prediction|
+----------+
|         0|
|         2|
|         0|
|         0|
|         2|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
|         2|
|         2|
|         0|
|         0|
|         0|
|         2|
|         0|
|         2|
+----------+
only showing top 20 rows



In [24]:
# Count how many rows the three-cluster model assigns to each cluster.
(
    model_k3
    .transform(cluster_final_data)
    .groupBy("prediction")
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   84|
|         0|   83|
+----------+-----+



In [26]:
# Count how many rows the two-cluster model assigns to each cluster.
(
    model_k2
    .transform(cluster_final_data)
    .groupBy("prediction")
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

