In [1]:
# findspark.init() is deliberately called before the pyspark import below:
# it makes the local Spark installation's pyspark package importable.
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Start the Spark session used by every later cell, or reuse the one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName('Hack_Project')
    .getOrCreate()
)

In [3]:
# Load the session log. The first row holds the column names and the column
# types are inferred from the data rather than declared.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('hack_data.csv')
)
df.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [4]:
# Number of sessions per location. The sessions are spread thinly over many
# countries (possibly because the hackers used a VPN), so the Location column
# is treated as useless for the analysis.
(
    df.groupBy('Location')
    .count()
    .show()
)

+--------------------+-----+
|            Location|count|
+--------------------+-----+
|            Anguilla|    1|
|            Paraguay|    2|
|               Macao|    2|
|Heard Island and ...|    2|
|               Yemen|    1|
|             Tokelau|    2|
|              Sweden|    3|
|French Southern T...|    3|
|            Kiribati|    1|
|              Guyana|    2|
|         Philippines|    3|
|            Malaysia|    2|
|           Singapore|    1|
|United States Vir...|    6|
|              Turkey|    1|
|      Western Sahara|    2|
|              Malawi|    2|
|                Iraq|    3|
|Northern Mariana ...|    3|
|             Germany|    1|
+--------------------+-----+
only showing top 20 rows



In [5]:
# Keep every column except Location, which the previous cell judged to be
# uninformative; the remaining columns are all numeric.
data = df.select([name for name in df.columns if name != 'Location'])
data.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'WPM_Typing_Speed']

In [6]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# The inputs are derived from the original `df`, not from `data`, so this cell
# never consumes its own output and can be re-run without raising
# "output column already exists".
feature_cols = [name for name in df.columns if name != 'Location']

# Pack the numeric columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
assembled = assembler.transform(df)

# StandardScaler defaults to withStd=True, withMean=False: each feature is
# divided by its standard deviation but not centred. K-Means is distance
# based, so the features must share a comparable scale.
scaler = StandardScaler(inputCol='features', outputCol='Scaled_feats').fit(assembled)

# Expose only the scaled vector, under the name 'features' that KMeans reads
# by default.
data = (
    scaler.transform(assembled)
    .select('Scaled_feats')
    .withColumnRenamed('Scaled_feats', 'features')
)
data.show(5)

+--------------------+
|            features|
+--------------------+
|[0.56785108466505...|
|[1.41962771166263...|
|[2.20042295307707...|
|[0.14196277116626...|
|[1.41962771166263...|
+--------------------+
only showing top 5 rows



In [7]:
from pyspark.ml.clustering import KMeans

# Fit K-Means once per candidate number of hackers and report, for each fit,
# the training cost and how many sessions land in each cluster.
for k in (2, 3):
    model = KMeans(k=k, seed=42).fit(data)
    cost = model.summary.trainingCost
    print('Model WSSE with k = {}: {:.3f}'.format(k, cost))

    predictions = model.transform(data)
    print('\nPredictions with k = {}'.format(k))
    cluster_sizes = predictions.groupBy('prediction').count().orderBy('prediction')
    cluster_sizes.show()

    print('-' * 50)

Model WSSE with k = 2: 601.771

Predictions with k = 2
+----------+-----+
|prediction|count|
+----------+-----+
|         0|  167|
|         1|  167|
+----------+-----+

--------------------------------------------------
Model WSSE with k = 3: 434.755

Predictions with k = 3
+----------+-----+
|prediction|count|
+----------+-----+
|         0|  167|
|         1|   88|
|         2|   79|
+----------+-----+

--------------------------------------------------


**Conclusion:** The forensic engineer expects the hackers to have shared the work equally. For example, 120 attacks would be split into 60 each for 2 hackers, or 40 each for 3 hackers. With $k = 2$ the clusters are perfectly balanced (167 and 167 sessions), whereas with $k = 3$ they are not (167, 88 and 79). The $k = 2$ result therefore matches that expectation, and the conclusion is that 2 hackers took part in the attack.