In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from metrics import binary_evaluate
import time
import copy

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from metrics import binary_evaluate

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
def preprocessing(raw_data, is_onehot=True, is_normalize=False):
    """Split a raw frame into model-ready features and an integer-coded target.

    The ``'label'`` column is removed from a deep copy of ``raw_data`` and
    returned as the target. Every object/string column, and the target itself,
    is replaced by integer codes assigned in order of first appearance.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Input frame containing a ``'label'`` column. It is copied, never mutated.
    is_onehot : bool, default True
        If true, each label-encoded categorical column is replaced by one-hot
        columns named ``'<column>_<code>'``.
    is_normalize : bool, default False
        If true, min-max scale the numerical columns, plus the label-encoded
        categorical columns when they were not one-hot encoded.

    Returns
    -------
    tuple
        ``(features, target)``; both keep the row index of ``raw_data``.

    Raises
    ------
    KeyError
        If ``raw_data`` has no ``'label'`` column.
    """
    df = copy.deepcopy(raw_data)
    label = 'label'
    target = df.pop(label)

    def is_numeric(dtype):
        # Any integer/float width counts; booleans are left untouched.
        return (pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype))

    def is_text(dtype):
        # Covers the classic object dtype as well as pandas' string dtype.
        return (pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype))

    numerical_features = [x for x in df.columns if is_numeric(df[x].dtype)]
    categorical_features = [x for x in df.columns if is_text(df[x].dtype)]

    def discretization(x):
        # Codes follow first-appearance order; unseen values map to -1.
        uniques = x.unique().tolist()
        mapp = dict(zip(uniques, range(len(uniques))))

        def mapfunction(y):
            return mapp.get(y, -1)
        return mapfunction

    for i in categorical_features:
        df[i] = df[i].apply(discretization(df[i]))
    target = target.apply(discretization(target))

    if is_onehot and categorical_features:
        enc = OneHotEncoder()
        enc.fit(df[categorical_features])
        encoded = enc.transform(df[categorical_features]).toarray()
        # String column names: estimators reject frames whose column labels mix
        # str and int, which the default 0..k-1 labels would cause.
        onehot_columns = [
            '{}_{}'.format(column, category)
            for column, categories in zip(categorical_features, enc.categories_)
            for category in categories
        ]
        # Reuse df's index so concat aligns row-for-row even when raw_data does
        # not carry a default RangeIndex.
        onehot = pd.DataFrame(encoded, index=df.index, columns=onehot_columns)
        features = pd.concat([df.drop(columns=categorical_features), onehot], axis=1)
        # One-hot columns are already in {0, 1}; the source categorical columns
        # no longer exist, so only the numerical ones are scaled.
        scaled_columns = numerical_features
    else:
        features = df
        scaled_columns = numerical_features + categorical_features

    if is_normalize and scaled_columns:
        mms = MinMaxScaler()
        features[scaled_columns] = mms.fit_transform(features[scaled_columns])

    return features, target

In [5]:
# Location of the training CSV, relative to the notebook's working directory.
train_path = './data/nslkdd/KDDTrain_binary.csv'
# Load the whole file into memory; later cells read it through `raw_data`.
raw_data = pd.read_csv(train_path)

In [6]:
# Build the modelling inputs: categorical columns stay as integer codes
# (no one-hot expansion) and the selected columns are min-max scaled.
features, target = preprocessing(
    raw_data,
    is_onehot=False,
    is_normalize=True,
)
# Bare last expression: rendered as the cell's rich output.
features

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.000000,0.0,0.000000,0.0,3.558064e-07,0.000000e+00,0.0,0.0,0.0,0.0,...,0.588235,0.098039,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00
1,0.000000,0.5,0.014493,0.0,1.057999e-07,0.000000e+00,0.0,0.0,0.0,0.0,...,1.000000,0.003922,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00
2,0.000000,0.0,0.028986,0.1,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,...,1.000000,0.101961,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00
3,0.000000,0.0,0.043478,0.0,1.681203e-07,6.223962e-06,0.0,0.0,0.0,0.0,...,0.117647,1.000000,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01
4,0.000000,0.0,0.043478,0.0,1.442067e-07,3.206260e-07,0.0,0.0,0.0,0.0,...,1.000000,1.000000,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125959,0.000000,0.0,0.028986,0.1,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,...,1.000000,0.098039,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00
125960,0.000186,0.5,0.028986,0.0,7.608895e-08,1.106923e-07,0.0,0.0,0.0,0.0,...,1.000000,0.956863,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00
125961,0.000000,0.0,0.217391,0.0,1.616709e-06,2.931438e-07,0.0,0.0,0.0,0.0,...,1.000000,0.117647,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00
125962,0.000000,0.0,0.507246,0.1,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,...,1.000000,0.031373,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00


In [7]:
# Accumulator filled by the cross-validation cells: model name -> {fold number -> metrics}.
total_details = {}

In [8]:
# 5-fold cross-validation of logistic regression. Per-fold metrics and
# wall-clock timings are stored under total_details['lr'], keyed by fold (1-5).
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_idx, test_idx) in enumerate(kf.split(features, target), start=1):
    # KFold yields positional row indices, so select with .iloc; label-based
    # .loc only happens to work while the index is a default RangeIndex.
    x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    lr = LogisticRegression(max_iter=100000)
    train_start_time = time.perf_counter()
    lr.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = lr.predict(x_test)
    test_end_time = time.perf_counter()

    # binary_evaluate comes from the project's `metrics` module; its result is
    # used as a mutable mapping so the timings can be added next to the metrics.
    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[fold] = detail
total_details['lr'] = cv_details

In [9]:
# 5-fold cross-validation of a Gini decision tree (CART). Per-fold metrics and
# wall-clock timings are stored under total_details['CART'], keyed by fold (1-5).
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_idx, test_idx) in enumerate(kf.split(features, target), start=1):
    # KFold yields positional row indices, so select with .iloc; label-based
    # .loc only happens to work while the index is a default RangeIndex.
    x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    # Fixed seed: tree construction is randomized, so an unseeded tree gives
    # different numbers on every run.
    cart = DecisionTreeClassifier(random_state=1)
    train_start_time = time.perf_counter()
    cart.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = cart.predict(x_test)
    test_end_time = time.perf_counter()

    # binary_evaluate comes from the project's `metrics` module; its result is
    # used as a mutable mapping so the timings can be added next to the metrics.
    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[fold] = detail
total_details['CART'] = cv_details

In [10]:
# 5-fold cross-validation of an entropy-criterion decision tree, recorded as
# 'C4.5'. Per-fold metrics and wall-clock timings are stored under
# total_details['C4.5'], keyed by fold (1-5).
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_idx, test_idx) in enumerate(kf.split(features, target), start=1):
    # KFold yields positional row indices, so select with .iloc; label-based
    # .loc only happens to work while the index is a default RangeIndex.
    x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    # Fixed seed: tree construction is randomized, so an unseeded tree gives
    # different numbers on every run.
    c45 = DecisionTreeClassifier(criterion='entropy', random_state=1)
    train_start_time = time.perf_counter()
    c45.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = c45.predict(x_test)
    test_end_time = time.perf_counter()

    # binary_evaluate comes from the project's `metrics` module; its result is
    # used as a mutable mapping so the timings can be added next to the metrics.
    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[fold] = detail
total_details['C4.5'] = cv_details

In [11]:
# 5-fold cross-validation of k-nearest neighbours (library defaults). Per-fold
# metrics and wall-clock timings are stored under total_details['KNN'],
# keyed by fold (1-5).
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_idx, test_idx) in enumerate(kf.split(features, target), start=1):
    # KFold yields positional row indices, so select with .iloc; label-based
    # .loc only happens to work while the index is a default RangeIndex.
    x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    knn = KNeighborsClassifier()
    train_start_time = time.perf_counter()
    knn.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = knn.predict(x_test)
    test_end_time = time.perf_counter()

    # binary_evaluate comes from the project's `metrics` module; its result is
    # used as a mutable mapping so the timings can be added next to the metrics.
    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[fold] = detail
total_details['KNN'] = cv_details

In [12]:
# 5-fold cross-validation of Gaussian naive Bayes. Per-fold metrics and
# wall-clock timings are stored under total_details['NB'], keyed by fold (1-5).
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_idx, test_idx) in enumerate(kf.split(features, target), start=1):
    # KFold yields positional row indices, so select with .iloc; label-based
    # .loc only happens to work while the index is a default RangeIndex.
    x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    nb = GaussianNB()
    train_start_time = time.perf_counter()
    nb.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = nb.predict(x_test)
    test_end_time = time.perf_counter()

    # binary_evaluate comes from the project's `metrics` module; its result is
    # used as a mutable mapping so the timings can be added next to the metrics.
    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[fold] = detail
total_details['NB'] = cv_details

In [13]:
# 5-fold cross-validation of a random forest. Per-fold metrics and wall-clock
# timings are stored under total_details['RF'], keyed by fold (1-5).
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_idx, test_idx) in enumerate(kf.split(features, target), start=1):
    # KFold yields positional row indices, so select with .iloc; label-based
    # .loc only happens to work while the index is a default RangeIndex.
    x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    # Fixed seed: bootstrap sampling and feature sub-sampling are random, so an
    # unseeded forest gives different numbers on every run.
    rf = RandomForestClassifier(random_state=1)
    train_start_time = time.perf_counter()
    rf.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = rf.predict(x_test)
    test_end_time = time.perf_counter()

    # binary_evaluate comes from the project's `metrics` module; its result is
    # used as a mutable mapping so the timings can be added next to the metrics.
    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[fold] = detail
total_details['RF'] = cv_details

In [14]:
# 5-fold cross-validation of a linear SVM. Per-fold metrics and wall-clock
# timings are stored under total_details['SVC'], keyed by fold (1-5).
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for fold, (train_idx, test_idx) in enumerate(kf.split(features, target), start=1):
    # KFold yields positional row indices, so select with .iloc; label-based
    # .loc only happens to work while the index is a default RangeIndex.
    x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    # Fixed seed: the solver may shuffle the data internally, so pin it to keep
    # the reported numbers reproducible.
    svc = LinearSVC(random_state=1)
    train_start_time = time.perf_counter()
    svc.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = svc.predict(x_test)
    test_end_time = time.perf_counter()

    # binary_evaluate comes from the project's `metrics` module; its result is
    # used as a mutable mapping so the timings can be added next to the metrics.
    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[fold] = detail
total_details['SVC'] = cv_details

In [16]:
# Flatten {model: {fold: metrics}} into one record per (model, fold) pair, then
# write a table with one row per pair to the results CSV.
flat_details = {
    (model_name, fold): metrics
    for model_name, folds in total_details.items()
    for fold, metrics in folds.items()
}
pd.DataFrame.from_dict(flat_details, orient='index').to_csv('./result/nslkdd/binary/baseline.csv')