## Anomaly Detection in Network Trac with K-means Clustering

- KDD Cup 1999: Computer network intrusion detection
- http://www.kdd.org/kdd-cup/view/kdd-cup-1999/Data

---
- kddcup.names A list of features.
- kddcup.data.zip The full data set (18M; 743M Uncompressed)
- kddcup.testdata.unlabeled.zip (11.2M; 430M Uncompressed)
- kddcup.data_10_percent.zip A 10% subset. (2.1M; 75M Uncompressed)
- kddcup.newtestdata_10_percent_unlabeled.zip (1.4M; 45M Uncompressed)
- kddcup.testdata.unlabeled_10_percent.zip (1.4M;45M Uncompressed)
- corrected.zip Test data with corrected labels.
- training_attack_types A list of intrusion types.

In [1]:
from pyspark.conf import SparkConf
from pyspark import StorageLevel

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Spark ML") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

In [3]:
df = spark.read.csv("../dataset/kddcup.data_10_percent.txt", inferSchema=True, sep=",").toDF(
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label")

print("Number of training data: {}".format(df.count()))
df.printSchema()

Number of training data: 494021
root
 |-- duration: integer (nullable = true)
 |-- protocol_type: string (nullable = true)
 |-- service: string (nullable = true)
 |-- flag: string (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: string (nullable = true)
 |-- num_file_creations: string (nullable = true)
 |-- num_shells: string (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- co

In [4]:
df.select("label").groupBy("label").count().orderBy(desc("count")).show(25)

+----------------+------+
|           label| count|
+----------------+------+
|          smurf.|280790|
|        neptune.|107201|
|         normal.| 97277|
|           back.|  2203|
|          satan.|  1589|
|        ipsweep.|  1247|
|      portsweep.|  1040|
|    warezclient.|  1020|
|       teardrop.|   979|
|            pod.|   264|
|           nmap.|   231|
|   guess_passwd.|    53|
|buffer_overflow.|    30|
|           land.|    21|
|    warezmaster.|    20|
|           imap.|    12|
|        rootkit.|    10|
|     loadmodule.|     9|
|      ftp_write.|     8|
|       multihop.|     7|
|            phf.|     4|
|           perl.|     3|
|            spy.|     2|
|            0.00|     1|
+----------------+------+



In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml import Pipeline

import random

In [6]:
# Only numeric
df = df.withColumn("num_root", df.num_root.cast("int"))
df = df.withColumn("num_file_creations", df.num_root.cast("int"))
df = df.withColumn("num_shells", df.num_root.cast("int"))

# Drop null
train = df.drop("protocol_type", "service", "flag").dropna().cache()
columns = train.columns
columns.remove('label')

In [7]:
# VectorAssembler
assembler = VectorAssembler(
    inputCols=columns,
    outputCol='features')

# StandardScaler
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withStd=True,
    withMean=True)

# KMeans
kmeans = KMeans(
    featuresCol='scaled_features',
    predictionCol='cluster',
    maxIter=30,
    seed=random.randrange(1,10))
kmeans.setK(3)

KMeans_4ca6977062d576f928ad

### KMeans.explainParams()
- featuresCol: features column name (default: features)
- initMode: initialization algorithm (default: k-means||)
- initSteps: number of steps for k-means|| (default: 5)
- k: number of clusters to create (default: 2)
- maxIter: maximum number of iterations (>= 0) (default: 20)
- predictionCol: prediction column name (default: prediction)
- seed: random seed (default: -1689246527)
- tol: the convergence tolerance for iterative algorithms (default: 1.0E-4)

In [8]:
# Pipeline
pipeline = Pipeline().setStages([assembler, scaler, kmeans])
pipelineModel = pipeline.fit(train)
kmeansModel = pipelineModel.stages[-1]

In [9]:
print(*kmeansModel.clusterCenters(), sep='\n')

[-0.06768475 -0.00234773 -0.02619924 -0.00667342 -0.04772019 -0.00257147
 -0.04412916 -0.00961196 -0.41718843 -0.00567868 -0.01055195 -0.00467567
 -0.00564001 -0.00564001 -0.00564001 -0.02763182  0.          0.
 -0.03726266  0.41183858  0.35085558  0.13839435  0.13854274 -0.24789685
 -0.2486132  -0.01231898 -0.07526366 -0.20035199  0.3453196   0.08954367
  0.05754689 -0.14071868  0.2934586  -0.15847693  0.13831224  0.13898245
 -0.25188338 -0.24946264]
[ 0.22147364  0.00769472  0.08573617  0.0218723   0.15640414  0.00842805
  0.14463444  0.03150345  1.36729709  0.01066278  0.03294298  0.01088012
  0.01052669  0.01052669  0.01052669  0.08724121  0.          0.
  0.12212933 -1.3497775  -1.1499116  -0.45358059 -0.45406697  0.81248829
  0.81483617  0.04036626  0.2466817   0.65665886 -1.13173936 -0.29345095
 -0.18858729  0.46119809 -0.96179208  0.51941222 -0.45331195 -0.45551016
  0.82555426  0.81762023]
[  2.10018788e+01  -1.72190129e-03   7.62630765e+00  -6.67341765e-03
  -4.77201855e-02  

In [10]:
withCluster = pipelineModel.transform(train)
clusterLabel = withCluster.select("cluster", "label") \
    .groupBy("cluster", "label").count() \
    .orderBy("cluster", "count")
clusterLabel.show(100)

+-------+----------------+------+
|cluster|           label| count|
+-------+----------------+------+
|      0|   guess_passwd.|     1|
|      0|            spy.|     1|
|      0|           imap.|     4|
|      0|      portsweep.|    13|
|      0|          satan.|    71|
|      0|           nmap.|   105|
|      0|         normal.| 10822|
|      0|        neptune.| 86723|
|      0|          smurf.|280786|
|      1|            spy.|     1|
|      1|           perl.|     3|
|      1|            phf.|     4|
|      1|          smurf.|     4|
|      1|       multihop.|     7|
|      1|      ftp_write.|     8|
|      1|           imap.|     8|
|      1|     loadmodule.|     9|
|      1|        rootkit.|    10|
|      1|    warezmaster.|    20|
|      1|           land.|    21|
|      1|buffer_overflow.|    30|
|      1|   guess_passwd.|    52|
|      1|           nmap.|   126|
|      1|            pod.|   264|
|      1|       teardrop.|   979|
|      1|    warezclient.|  1020|
|      1|     

In [11]:
clusterLabel.groupBy('cluster').sum('count').orderBy('cluster').show()

+-------+----------+
|cluster|sum(count)|
+-------+----------+
|      0|    378526|
|      1|    115492|
|      2|         2|
+-------+----------+



In [12]:
kmeansModel.summary.clusterSizes

[378526, 115492, 2]

In [13]:
scaled_features = withCluster.select('cluster', 'scaled_features')
scaled_features.show(3)

+-------+--------------------+
|cluster|     scaled_features|
+-------+--------------------+
|      1|[-0.0677917208490...|
|      1|[-0.0677917208490...|
|      1|[-0.0677917208490...|
+-------+--------------------+
only showing top 3 rows



In [14]:
spark.stop()