In [1]:
from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn import svm
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.svm import OneClassSVM
from sklearn.cluster import DBSCAN

from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import datetime

%matplotlib inline

def byte_decoder(val):
    """Decode a byte string into text using the UTF-8 codec.

    Parameters
    ----------
    val : object exposing ``decode`` (e.g. ``bytes``)
        The encoded value.

    Returns
    -------
    The result of ``val.decode('utf-8')``.
    """
    text = val.decode('utf-8')
    return text

def plot_confusion_matrix(cm, title, classes=('abnormal', 'normal'),
                          cmap=plt.cm.Reds):
    """Draw a row-normalised confusion matrix as an annotated heat map.

    The plot is drawn on the current pyplot figure; nothing is returned.

    Parameters
    ----------
    cm : array-like, 2-D
        Raw confusion counts. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    title : str
        Title placed above the plot.
    classes : sequence of str
        Tick labels used on both axes.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    # Normalise each row so that a cell holds the fraction of that row's total.
    # A row with no samples would otherwise divide 0 by 0 and yield NaN cells
    # (plus a RuntimeWarning); such rows are rendered as all zeros instead.
    counts = np.asarray(cm, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    cm = np.divide(counts, row_totals,
                   out=np.zeros_like(counts), where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.1%'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it accounts for the axis labels as well.
    plt.tight_layout()

# Column names for the connection records, in file order. A later cell assigns
# `sa_columns + ["target"]` to the loaded DataFrame's columns.
sa_columns = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
]

# Reduced column set; only referenced by the commented-out SF-subset loader.
sf_columns = [
    "duration",
    "service",
    "src_bytes",
    "dst_bytes",
]

# Model wrapper
def model(algorithm = "if"):
    """Fit one configured estimator, evaluate it on the test split and plot the result.

    Reads the module-level ``estimators`` mapping and the ``X_train``,
    ``X_test`` and ``y_test`` globals produced by the data-loading cell.

    Parameters
    ----------
    algorithm : str
        Key into ``estimators`` selecting which estimator to run.

    Returns
    -------
    tuple
        ``(elapsed, prf)`` where ``elapsed`` is the ``datetime.timedelta``
        spent fitting and predicting, and ``prf`` is the output of
        ``precision_recall_fscore_support`` restricted to label ``-1``.

    Raises
    ------
    KeyError
        If ``algorithm`` is not a key of ``estimators``.
    """
    estimator = estimators[algorithm]

    s = datetime.datetime.now()
    if hasattr(estimator, "predict"):
        # The split variables are named X_train / X_test (capital X) in the
        # data-loading cell; the lowercase names were never defined.
        y_pred = estimator.fit(X_train).predict(X_test)
    else:
        # Estimators without `predict` (LocalOutlierFactor with its default
        # novelty=False, DBSCAN) can only label the data they are fitted on,
        # so they are fitted directly on the test set.
        raw_labels = estimator.fit_predict(X_test)
        # DBSCAN returns cluster ids with -1 for noise; collapse everything
        # that is not -1 to 1 so the labels match the -1 / 1 target encoding.
        y_pred = np.where(raw_labels == -1, -1, 1)
    t = datetime.datetime.now() - s

    print(f"trainning finished in {t}")
    print(classification_report(y_test, y_pred, target_names=['anomaly', 'normal']))
    print ("AUC: ", "{:.1%}".format(roc_auc_score(y_test, y_pred)))
    plot_confusion_matrix(confusion_matrix(y_test, y_pred), f"Confusion Matrix for {algorithm} on SF test dataset")
    return (t, precision_recall_fscore_support(y_test, y_pred, labels=[-1]))

# Estimators
# Hyper-parameters shared by the detectors configured below.
num_extimators, max_samples = 100, 0.25
contamination, eps = 0.2, 0.2

ifsf = IsolationForest(
    max_samples=max_samples,
    random_state=0,
    contamination=contamination,
    n_estimators=num_extimators,
    n_jobs=-1,
)
lofsf = LocalOutlierFactor(
    n_neighbors=15,
    metric='euclidean',
    algorithm='auto',
    contamination=contamination,
    n_jobs=-1,
)
ocsvm = OneClassSVM(nu=contamination, kernel="rbf", gamma=0.1)
dbscan = DBSCAN(
    eps=eps,
    min_samples=10,
    metric='euclidean',
    algorithm='auto',
    n_jobs=-1,
)

# Lookup table used by `model(algorithm)`: short key -> estimator instance.
estimators = dict(
    zip(
        ("if", "lof", "dbs", "svm"),
        (ifsf, lofsf, dbscan, ocsvm),
    )
)

In [3]:
# Alternative loader for the SF subset via scikit-learn (kept disabled):
# sf = datasets.fetch_kddcup99(subset='SF', random_state=0, percent10=False)
# dfsf = pd.DataFrame(sf.data, columns=sf_columns)

# Column names are supplied explicitly, so the file is read as header-less.
# With the default header inference the first record would be consumed as
# column names and then silently lost when the names are overwritten.
# NOTE(review): assumes kddcup.data has no header row -- confirm.
df = pd.read_csv('kddcup.data', delimiter=',', header=None,
                 names=sa_columns + ["target"])

# Boolean mask of the rows labelled as normal traffic; computed before the
# label encoding below replaces the string labels with integers.
is_normal = df["target"] == 'normal.'

anomaly_rate_sf = 1.0 - is_normal.sum() / len(df)
print(f"Anomaly rate is {anomaly_rate_sf:.1%}")

toDecode = ["protocol_type", "service", "flag", "target"]
# Binary ground truth: 1 for normal records, -1 for everything else.
df["binary_target"] = np.where(is_normal, 1, -1)

# Replace each categorical column with integer codes. The encoder is refitted
# per column, so after the loop `le` holds the mapping of the last one only.
le = preprocessing.LabelEncoder()
for f in toDecode:
    df[f] = le.fit_transform(df[f])

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["target", 'binary_target'], axis=1),
    df['binary_target'],
    test_size=0.33,
    random_state=0,
)
results = []

EOFError: Ran out of input