# Testing anomaly detection algorithms on KDDCUP99
based on https://github.com/elena-sharova/IsolationForest/blob/master/IsolationForest_v0.1.ipynb
## KDDCUP 99

The 1998 DARPA Intrusion Detection Evaluation Program was prepared and managed by MIT Lincoln Labs. The objective was to survey and evaluate research in intrusion detection.  A standard set of data to be audited, which includes a wide variety of intrusions simulated in a military network environment, was provided.  The 1999 KDD intrusion detection contest uses a version of this dataset.

`SA` is obtained by simply selecting all the normal data, and a small proportion of abnormal data to give an anomaly ratio of 1%. SA has all 41 attributes.

`SF` is the data where attribute logged_in is positive, thus focusing on the intrusion attack, which gives an anomaly ratio 0.3%. SF has log-transformed 4 attributes.

In [86]:
from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn import svm
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import datetime

%matplotlib inline

In [87]:
print("numpy:",np.__version__)
print("pandas", pd.__version__)

numpy: 1.20.2
pandas 1.2.4


# KDDCUP SF dataset
This dataset contains 41 attributes, has 703,067 records and an anomaly rate of 0.5%

In [88]:
target = "target"
service = "service"
sf = datasets.fetch_kddcup99(subset='SF', percent10=True)
# dfSF=pd.DataFrame(sf.data, columns=["duration", "service", "src_bytes", "dst_bytes"])

In [89]:
dfsf = pd.DataFrame(sf.data, columns=["duration", "service", "src_bytes", "dst_bytes"])
dfsf[target]=sf.target

anomaly_rate_sf = 1.0 - len(dfsf.loc[dfsf[target]==b'normal.'])/len(dfsf)
f"SF anomaly rate is {anomaly_rate_sf:.1%}"

'SF anomaly rate is 4.5%'

In [90]:
def byte_decoder(val):
    # decodes byte literals to strings
    
    return val.decode('utf-8')

# 2. Data preprocessing for sf
non-numeric attributes are label encoded into integers. all targets are converted to 1 (b'normal.') or -1
For the sf subset, 2 out of 4 columns needs to be label encoded.

In [91]:
print(f"{dfsf.head(1)}")
print(f"list of sf service values : {set(dfsf[service])}")
print(f"list of sa service values : {set(dfsf[target])}")

   duration  service src_bytes dst_bytes      target
0 -2.302585  b'http'  5.199049  8.603389  b'normal.'
list of sf service values : {b'auth', b'pop_3', b'ftp_data', b'other', b'smtp', b'login', b'telnet', b'http', b'ftp', b'gopher', b'nntp', b'domain', b'private', b'discard', b'ssh', b'imap4', b'X11', b'IRC'}
list of sa service values : {b'imap.', b'perl.', b'multihop.', b'spy.', b'normal.', b'satan.', b'buffer_overflow.', b'guess_passwd.', b'rootkit.', b'ftp_write.', b'warezmaster.', b'warezclient.', b'back.', b'ipsweep.', b'loadmodule.', b'phf.'}


In [92]:
toDecodeSF = [service, target]

In [93]:
# apply hot encoding to fields of type string
# convert all abnormal target types to single anomaly class

dfsf['binary_target'] = [1 if x==b'normal.' else -1 for x in dfsf[target]]
    
leSF = preprocessing.LabelEncoder()

for f in toDecodeSF:
    dfsf[f] = list(map(byte_decoder, dfsf[f]))
    dfsf[f] = leSF.fit_transform(dfsf[f])

dfsf_normed = preprocessing.normalize(dfsf.drop([target, 'binary_target'], axis=1))

# Isolation forest on 10% SF
First we split the dataset into a train and a test set

In [94]:
X_train_sf, X_test_sf, y_train_sf, y_test_sf = train_test_split(dfsf.drop([target, 'binary_target'], axis=1), 
                                                                dfsf['binary_target'], test_size=0.33, random_state=11)
X_train_nd, X_test_nd, y_train_nd, y_test_nd = train_test_split(dfsf_normed, dfsf['binary_target'], 
                                                    test_size=0.33, random_state=11)

## Parameters
* num_estimators = 100
* max_samples = 25%
* contamination = 15%

In [95]:
ifsf = IsolationForest(max_samples=0.25, random_state=11, contamination = 0.15, n_estimators=100, n_jobs=-1)

## Trainning Isolation Forest on the SF dataset.10%

In [96]:
start = datetime.datetime.now()
ifsf.fit(X_train_sf,y_train_sf)
y_pred_train = ifsf.predict(X_train_sf)
end = datetime.datetime.now()
f"trainning finished in : {end-start}"

'trainning finished in : 0:00:04.252033'

In [97]:
lofsf = LocalOutlierFactor(n_neighbors=15, metric='euclidean', algorithm = 'auto', contamination=0.15, n_jobs=-1)

## Trainning Local Outlier Factor on the SF dataset.10%

In [98]:
start = datetime.datetime.now()
y_pred_train_lof = lofsf.fit_predict(X_train_nd,y_train_nd)
end = datetime.datetime.now()
f"trainning finished in : {end-start}"

'trainning finished in : 0:00:00.312001'

## IF results on SF.10% trainning set

In [99]:
print(classification_report(y_train_sf, y_pred_train, target_names=['anomaly', 'normal']))
print ("AUC: ", "{:.1%}".format(roc_auc_score(y_train_sf, y_pred_train)))

              precision    recall  f1-score   support

     anomaly       0.11      0.37      0.17      2225
      normal       0.97      0.86      0.91     46843

    accuracy                           0.84     49068
   macro avg       0.54      0.61      0.54     49068
weighted avg       0.93      0.84      0.88     49068

AUC:  61.4%


## LOF results on SF.10% trainning set

In [100]:
print(classification_report(y_train_nd, y_pred_train_lof, target_names=['anomaly', 'normal']))
print ("AUC: ", "{:.1%}".format(roc_auc_score(y_train_nd, y_pred_train_lof)))

              precision    recall  f1-score   support

     anomaly       0.03      0.09      0.04      2225
      normal       0.95      0.85      0.90     46843

    accuracy                           0.81     49068
   macro avg       0.49      0.47      0.47     49068
weighted avg       0.91      0.81      0.86     49068

AUC:  46.7%


# SA

In [101]:
sa = datasets.fetch_kddcup99(subset='SA', percent10=True)
dfsa=pd.DataFrame(sa.data, 
                  columns=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment",
                           "urgent","hot","num_failed_logins","logged_in","num_compromised","root_shell","su_attempted",
                           "num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds","is_host_login",
                           "is_guest_login","count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate",
                           "same_srv_rate","diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
                           "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
                           "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
                           "dst_host_rerror_rate","dst_host_srv_rerror_rate"])
assert len(dfsa)>0, "SA dataset not loaded."

dfsa[target]=sa.target

anomaly_rate_sa = 1.0 - len(dfsa.loc[dfsa[target]==b'normal.'])/len(dfsa)
f"SA anomaly rate is {anomaly_rate_sa:.1%}"

# dfsa["target"].head(5)
# set(dfsa[target])

'SA anomaly rate is 3.4%'

# HTTP
Using the 'service' attribute, the data is divided into {http, smtp, ftp, ftp_data, others} subsets. Here, only 'http' service data is used. Since the continuous attribute values are concentrated around '0', we transformed each value into a value far from '0', by y = log(x + 0.1). The original data set has 3,925,651 attacks (80.1%) out of 4,898,431 records. A smaller set is forged by having only 3,377 attacks (0.35%) of 976,157 records, where attribute 'logged_in' is positive. From this forged dataset 567,497 ‘http’ service data is used to construct the http (KDDCUP99) dataset. 

In [102]:
http = datasets.fetch_kddcup99(subset='http', percent10=True)