# KDD Cup 1999 Dataset
> From Kaggle

## Importing the dataset

In [19]:
import os
from pathlib import Path
import pandas as pd

In [20]:
# Read the CSV into a DataFrame; the path is relative to the current working directory.
df = pd.read_csv(Path('KDD_Train.csv'))

## Playing Around

In [21]:
# df.shape is (number of rows / instances, number of columns / features).
print(df.shape)
# A bare trailing expression makes the notebook render the frame itself.
df

(125973, 42)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,tcp,private,S0,0,0,0,0,0,0,...,25,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
125969,8,udp,private,SF,105,145,0,0,0,0,...,244,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal
125970,0,tcp,smtp,SF,2231,384,0,0,0,0,...,30,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal
125971,0,tcp,klogin,S0,0,0,0,0,0,0,...,8,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


In [22]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [23]:
# 'class' is the target: the outcome we want the model to learn to predict.
# value_counts() shows how many rows carry each label.
df['class'].value_counts()

normal     67343
anomaly    58630
Name: class, dtype: int64

In [24]:
# For every column, print a header followed by the frequency of each distinct value.
for column_name, column_values in df.items():
    print(f"---{column_name}---")
    print(column_values.value_counts())

---duration---
0        115955
1          1989
2           843
3           557
4           351
          ...  
4946          1
5284          1
20771         1
3294          1
679           1
Name: duration, Length: 2981, dtype: int64
---protocol_type---
tcp     102689
udp      14993
icmp      8291
Name: protocol_type, dtype: int64
---service---
http         40338
private      21853
domain_u      9043
smtp          7313
ftp_data      6860
             ...  
tftp_u           3
http_8001        2
aol              2
harvest          2
http_2784        1
Name: service, Length: 70, dtype: int64
---flag---
SF        74945
S0        34851
REJ       11233
RSTR       2421
RSTO       1562
S1          365
SH          271
S2          127
RSTOS0      103
S3           49
OTH          46
Name: flag, dtype: int64
---src_bytes---
0        49392
8         3691
1         2432
44        2334
45        2089
         ...  
1940         1
12973        1
1899         1
1661         1
2358         1
Name: src_b

## Preprocessing
> We will represent the data in a numerical format, starting with the class label (`normal` → 0, `anomaly` → 1).

In [27]:
# Encode the target label numerically: 'normal' -> 0, 'anomaly' -> 1.
# The replacement is scoped to the 'class' column so that the same strings in
# any other column can never be rewritten by accident. Values that are already
# 0/1 pass through untouched, so re-running this cell is harmless.
df['class'] = df['class'].replace({'normal': 0, 'anomaly': 1})
print(df['class'])
print(df['class'].value_counts())

0         0
1         0
2         1
3         0
4         0
         ..
125968    1
125969    0
125970    0
125971    1
125972    0
Name: class, Length: 125973, dtype: int64
0    67343
1    58630
Name: class, dtype: int64


### Splitting the dataset in half

In [28]:
# Cut the frame at its positional midpoint; with an odd number of rows the
# extra row ends up in the second half.
half = df.shape[0] // 2
df1 = df.iloc[:half]
df2 = df.iloc[half:]

# Shapes of the full frame and of each half, in that order.
for frame in (df, df1, df2):
    print(frame.shape)

(125973, 42)
(62986, 42)
(62987, 42)


> Identifying the 'accuracy': the fraction of row positions at which the two halves carry the same class label

In [54]:
# One-element Series holding the 'class' value of the row at position 3.
# NOTE(review): `dummy` is not used by any cell visible here — confirm whether this scratch cell can be removed.
dummy = df.iloc[[3]]['class']

In [68]:

def get_accuracy(df1: pd.DataFrame, df2: pd.DataFrame) -> float:
    """Return the fraction of rows in ``df1`` whose 'class' equals ``df2``'s.

    Rows are paired by position (first with first, second with second, ...),
    not by index label, so the two frames may carry different indexes. Only
    the first ``len(df1)`` rows of ``df2`` take part in the comparison.

    Args:
        df1: Frame with a 'class' column; its row count sets how many
            positions are compared and is the denominator of the result.
        df2: Frame with a 'class' column and at least as many rows as ``df1``.

    Returns:
        Number of matching positions divided by ``len(df1)``.

    Raises:
        IndexError: If ``df2`` has fewer rows than ``df1``.
        ZeroDivisionError: If ``df1`` has no rows.
    """
    size = df1.shape[0]
    if df2.shape[0] < size:
        raise IndexError(
            f"df2 has {df2.shape[0]} rows but {size} are needed to pair with df1"
        )

    # Drop the original index labels so the comparison is purely positional;
    # a single vectorised comparison replaces one pandas lookup per row.
    left = df1['class'].reset_index(drop=True)
    right = df2['class'].iloc[:size].reset_index(drop=True)
    matches = int((left == right).sum())
    return matches / size

In [69]:
# Percentage of positions at which df1 and df2 carry the same 'class' value, rounded to 3 decimals.
print(f"df1 and df2 are {round(get_accuracy(df1, df2) * 100, 3)}% accurate")

df1 and df2 are 50.287% accurate


### Splitting based on 'class'

In [38]:
# Partition the rows by label: 0 = normal, 1 = anomaly (as encoded in the preprocessing step).
g = df.groupby('class')
df_normal, df_anomaly = g.get_group(0), g.get_group(1)
# One shape per line: normal rows first, then anomaly rows.
print(df_normal.shape, df_anomaly.shape, sep='\n')

(67343, 42)
(58630, 42)
