In [1]:
#A tech company was attacked by hackers. The engineer collected data from the hacks
#and wants to determine whether there were two hackers or three.
#In this project, I will try to identify the hacker count using clustering.

import os

import findspark

# Point findspark at the Spark installation before pyspark is imported.
# SPARK_HOME is honoured when it is set, so the notebook is not tied to a
# single machine; the original install path remains the fallback.
findspark.init(os.environ.get('SPARK_HOME', '/home/shoby/spark-2.4.0-bin-hadoop2.7'))
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session for this notebook; 'Hack' is the application name.
spark = (
    SparkSession.builder
    .appName('Hack')
    .getOrCreate()
)

In [4]:
# Load the attack log CSV: the first row is treated as the header and
# column types are inferred from the data.
dataset = spark.read.csv(
    'hack_data.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Peek at the first row to sanity-check the loaded columns and their inferred types.
dataset.head()

Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)

In [6]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StandardScaler


In [7]:
# List the column names so the clustering features can be picked from them.
dataset.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [8]:
# Location is deliberately left out of the feature list: the hackers may be
# using VPNs, so the reported location is not a trustworthy signal.
feature_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

In [9]:
# Echo the selected feature columns for a quick visual check.
feature_cols

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'WPM_Typing_Speed']

In [10]:
# Configure the VectorAssembler that packs the selected feature columns
# into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

In [11]:
# Run the assembler over the loaded dataset; final_data is the same data plus
# the 'features' vector column built from feature_cols.
final_data = assembler.transform(dataset)

In [12]:
# K-means is distance-based, so features on larger numeric scales would
# dominate the clustering. Standardize the assembled 'features' vector into
# a new 'scaledFeatures' column before clustering.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
)

In [13]:
# Fit the scaler on final_data (the DataFrame holding the assembled 'features'
# column) to obtain the fitted scaler model.

scaler_model = scaler.fit(final_data)

In [14]:
# Apply the fitted scaler to final_data, which adds the 'scaledFeatures' column.
# NOTE: despite its name, cluster_final_model is the DataFrame fed to the
# clustering step, not a model.

cluster_final_model = scaler_model.transform(final_data)

In [15]:
# Preview the first three rows, including the assembled and scaled feature vectors.

cluster_final_model.show(3)

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+--------------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|            features|      scaledFeatures|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+--------------------+--------------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|[8.0,391.09,1.0,2...|[0.56785108466505...|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|[20.0,720.99,0.0,...|[1.41962771166263...|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58

In [16]:
# Print the schema to confirm that 'features' and 'scaledFeatures' were added
# alongside the original columns.

cluster_final_model.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



In [17]:
# Create two KMeans estimators over the scaled features, one with k=2 and one
# with k=3, so the two hypotheses (two vs. three hackers) can be compared.
# K-means starts from randomly chosen centers, so the seed is pinned to keep
# the cluster assignments reproducible when the notebook is re-run.
KMEANS_SEED = 42

Kmeans2 = KMeans(featuresCol='scaledFeatures', k=2, seed=KMEANS_SEED)
Kmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=KMEANS_SEED)

In [18]:
# Train both estimators on the same scaled DataFrame:
# Km2 is the fitted k=2 model and Km3 the fitted k=3 model.
Km2, Km3 = (
    estimator.fit(cluster_final_model)
    for estimator in (Kmeans2, Kmeans3)
)

In [29]:
# Number of sessions assigned to each cluster when k=2.
(
    Km2.transform(cluster_final_model)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [30]:
# Number of sessions assigned to each cluster when k=3.
(
    Km3.transform(cluster_final_model)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|  167|
|         0|   84|
+----------+-----+



In [None]:
#According to the output above, it seems there are two hackers, not three: assuming the hackers
#split the attacks roughly equally, k=2 gives an even distribution of attacks (167/167),
#whereas k=3 gives uneven counts (84/83/167).