## Anomaly Detection in Network Traffic with K-means Clustering

- KDD Cup 1999: Computer network intrusion detection
- http://www.kdd.org/kdd-cup/view/kdd-cup-1999/Data

---
- kddcup.names A list of features.
- kddcup.data.zip The full data set (18M; 743M Uncompressed)
- kddcup.testdata.unlabeled.zip (11.2M; 430M Uncompressed)
- kddcup.data_10_percent.zip A 10% subset. (2.1M; 75M Uncompressed)
- kddcup.newtestdata_10_percent_unlabeled.zip (1.4M; 45M Uncompressed)
- kddcup.testdata.unlabeled_10_percent.zip (1.4M; 45M Uncompressed)
- corrected.zip Test data with corrected labels.
- training_attack_types A list of intrusion types.

In [1]:
from pyspark.conf import SparkConf
from pyspark import StorageLevel

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Obtain (or reuse) a SparkSession running against the "local" master,
# registered under the application name "Spark ML".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Spark ML")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and set its log level to INFO.
sc = spark.sparkContext
sc.setLogLevel("INFO")

In [3]:
# Column names applied positionally to the header-less CSV:
# 41 feature columns followed by the trailing "label" column.
kdd_columns = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "label",
]

# Read the comma-separated file with schema inference, then rename the
# auto-generated columns to the names above.
df = spark.read.csv(
    "../dataset/kddcup.data_10_percent.txt",
    sep=",",
    inferSchema=True,
).toDF(*kdd_columns)

# Report the row count and the inferred column types.
print("Number of training data: {}".format(df.count()))
df.printSchema()

Number of training data: 494021
root
 |-- duration: integer (nullable = true)
 |-- protocol_type: string (nullable = true)
 |-- service: string (nullable = true)
 |-- flag: string (nullable = true)
 |-- src_bytes: integer (nullable = true)
 |-- dst_bytes: integer (nullable = true)
 |-- land: integer (nullable = true)
 |-- wrong_fragment: integer (nullable = true)
 |-- urgent: integer (nullable = true)
 |-- hot: integer (nullable = true)
 |-- num_failed_logins: integer (nullable = true)
 |-- logged_in: integer (nullable = true)
 |-- num_compromised: integer (nullable = true)
 |-- root_shell: integer (nullable = true)
 |-- su_attempted: integer (nullable = true)
 |-- num_root: string (nullable = true)
 |-- num_file_creations: string (nullable = true)
 |-- num_shells: string (nullable = true)
 |-- num_access_files: integer (nullable = true)
 |-- num_outbound_cmds: integer (nullable = true)
 |-- is_host_login: integer (nullable = true)
 |-- is_guest_login: integer (nullable = true)
 |-- co

In [4]:
# Count records per label and show up to 25 labels, most frequent first.
(
    df.groupBy("label")
    .count()
    .sort(desc("count"))
    .show(25)
)

+----------------+------+
|           label| count|
+----------------+------+
|          smurf.|280790|
|        neptune.|107201|
|         normal.| 97277|
|           back.|  2203|
|          satan.|  1589|
|        ipsweep.|  1247|
|      portsweep.|  1040|
|    warezclient.|  1020|
|       teardrop.|   979|
|            pod.|   264|
|           nmap.|   231|
|   guess_passwd.|    53|
|buffer_overflow.|    30|
|           land.|    21|
|    warezmaster.|    20|
|           imap.|    12|
|        rootkit.|    10|
|     loadmodule.|     9|
|      ftp_write.|     8|
|       multihop.|     7|
|            phf.|     4|
|           perl.|     3|
|            spy.|     2|
|            0.00|     1|
+----------------+------+



In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml import Pipeline

In [6]:
# Only numeric
df = df.withColumn("num_root", df.num_root.cast("int"))
df = df.withColumn("num_file_creations", df.num_root.cast("int"))
df = df.withColumn("num_shells", df.num_root.cast("int"))

# Drop null
train = df.drop("protocol_type", "service", "flag").dropna().cache()
columns = train.columns
columns.remove('label')

In [7]:
# VectorAssembler
assembler = VectorAssembler() \
    .setInputCols(columns) \
    .setOutputCol("featureVector")

# KMeans
kmeans = KMeans() \
    .setK(3) \
    .setSeed(7) \
    .setPredictionCol("cluster") \
    .setFeaturesCol("featureVector")

In [8]:
# Pipeline
pipeline = Pipeline().setStages([assembler, kmeans])
pipelineModel = pipeline.fit(train)
kmeansModel = pipelineModel.stages[-1]

In [9]:
# Print each cluster centre on its own line.
print('\n'.join(str(center) for center in kmeansModel.clusterCenters()))

[  4.73818366e+01   1.00902844e+03   8.68636272e+02   4.45380193e-05
   6.43371933e-03   1.41711880e-05   3.41971010e-02   1.51834157e-04
   1.48143574e-01   1.02133776e-02   1.11345048e-04   3.64401976e-05
   1.13531460e-02   1.13531460e-02   1.13531460e-02   1.00817880e-03
   0.00000000e+00   0.00000000e+00   1.38675196e-03   3.32326486e+02
   2.92942530e+02   1.76704855e-01   1.76626954e-01   5.74400761e-02
   5.77254029e-02   7.91524536e-01   2.09821645e-02   2.89928334e-02
   2.32495196e+02   1.88684776e+02   7.53766358e-01   3.09065714e-02
   6.01897178e-01   6.67185602e-03   1.76774132e-01   1.76460159e-01
   5.81244028e-02   5.74180500e-02]
[  2.00000000e+00   6.93375640e+08   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   5.70000000e+0

In [10]:
# Assign every training row to a cluster.
withCluster = pipelineModel.transform(train)

# Cross-tabulate cluster against label, ordered by cluster and then by
# ascending count, and show the first 25 rows.
(
    withCluster.groupBy("cluster", "label")
    .count()
    .sort("cluster", "count")
    .show(25)
)

+-------+----------------+------+
|cluster|           label| count|
+-------+----------------+------+
|      0|            spy.|     2|
|      0|           perl.|     3|
|      0|            phf.|     4|
|      0|       multihop.|     7|
|      0|      ftp_write.|     8|
|      0|     loadmodule.|     9|
|      0|        rootkit.|    10|
|      0|           imap.|    12|
|      0|    warezmaster.|    20|
|      0|           land.|    21|
|      0|buffer_overflow.|    30|
|      0|   guess_passwd.|    53|
|      0|           nmap.|   231|
|      0|            pod.|   264|
|      0|    warezclient.|   961|
|      0|       teardrop.|   979|
|      0|      portsweep.|  1039|
|      0|        ipsweep.|  1247|
|      0|          satan.|  1589|
|      0|           back.|  2203|
|      0|         normal.| 97277|
|      0|        neptune.|107201|
|      0|          smurf.|280790|
|      1|      portsweep.|     1|
|      2|    warezclient.|    59|
+-------+----------------+------+



In [11]:
# Shut down the SparkSession now that the analysis is finished.
spark.stop()