In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from metrics import binary_evaluate
import time
import copy

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from metrics import binary_evaluate

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
def preprocessing(raw_data, is_onehot=True, is_normalize=False):
    """Split a raw frame into model-ready features and an integer-coded target.

    ``raw_data`` is deep-copied first, so the caller's frame is never modified.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain a ``'label'`` column; every other column is a feature.
    is_onehot : bool, default True
        Replace each categorical column by one 0/1 indicator column per
        category, named ``'<column>_<code>'``.
    is_normalize : bool, default False
        Min-max scale the numerical columns, plus the integer-coded
        categorical columns when they are still present (``is_onehot=False``).

    Returns
    -------
    features : pandas.DataFrame
        Feature columns, carrying the same index as ``raw_data``.
    target : pandas.Series
        The ``'label'`` column with each distinct value replaced by an integer
        code assigned in order of first appearance.
    """
    df = copy.deepcopy(raw_data)
    label = 'label'
    target = df.pop(label)

    # Only columns stored exactly as float64 / int64 are treated as numerical.
    numerical_features = [x for x in df.columns if df[x].dtype == np.float64 or df[x].dtype == np.int64]
    # Object columns, plus pandas string dtypes, are treated as categorical.
    categorical_features = [
        x for x in df.columns
        if pd.api.types.is_object_dtype(df[x]) or pd.api.types.is_string_dtype(df[x])
    ]

    def discretization(x):
        # Codes follow first-appearance order; a value missing from the
        # mapping falls back to -1.
        mapp = {value: code for code, value in enumerate(x.unique().tolist())}

        def mapfunction(y):
            return mapp.get(y, -1)
        return mapfunction

    for i in categorical_features:
        df[i] = df[i].apply(discretization(df[i]))
    target = target.apply(discretization(target))

    # Skip the encoder when there is nothing to encode: it rejects empty input.
    if is_onehot and categorical_features:
        enc = OneHotEncoder()
        encoded = enc.fit_transform(df[categorical_features]).toarray()
        onehot_names = [
            '{}_{}'.format(column, category)
            for column, categories in zip(categorical_features, enc.categories_)
            for category in categories
        ]
        # Reuse df's index so concat aligns row-for-row even when the input
        # index is not 0..n-1, and use string labels so the result does not mix
        # integer and string column names.
        onehot = pd.DataFrame(encoded, index=df.index, columns=onehot_names)
        features = pd.concat([df.drop(columns=categorical_features), onehot], axis=1)
    else:
        features = df

    if is_normalize:
        # After one-hot encoding the categorical columns no longer exist, so
        # scale only the columns that are still part of the feature frame.
        scale_columns = [x for x in numerical_features + categorical_features if x in features.columns]
        if scale_columns:
            mms = MinMaxScaler()
            features[scale_columns] = mms.fit_transform(features[scale_columns])

    return features, target

In [5]:
# CSV location, relative to the notebook's working directory.
# NOTE(review): the file name suggests the preprocessed KDD Cup 99 10% subset - confirm.
train_path = './data/kdd99/kddcup_10p_preprocessing.csv'
raw_data = pd.read_csv(filepath_or_buffer=train_path)

In [6]:
# Integer-code the categorical columns (no one-hot expansion) and min-max scale
# the numerical and categorical feature columns; the trailing expression
# renders the resulting frame.
features, target = preprocessing(raw_data=raw_data, is_onehot=False, is_normalize=True)
features

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,0.0,0.0,0.0,2.610418e-07,0.001057,0.0,0.0,0.0,0.0,...,0.035294,0.035294,1.0,0.0,0.11,0.00,0.00,0.00,0.0,0.0
1,0.0,0.0,0.0,0.0,3.446905e-07,0.000094,0.0,0.0,0.0,0.0,...,0.074510,0.074510,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0
2,0.0,0.0,0.0,0.0,3.389216e-07,0.000259,0.0,0.0,0.0,0.0,...,0.113725,0.113725,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0
3,0.0,0.0,0.0,0.0,3.158461e-07,0.000259,0.0,0.0,0.0,0.0,...,0.152941,0.152941,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0
4,0.0,0.0,0.0,0.0,3.129617e-07,0.000394,0.0,0.0,0.0,0.0,...,0.192157,0.192157,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145580,0.0,0.0,0.0,0.0,4.470881e-07,0.000365,0.0,0.0,0.0,0.0,...,0.337255,1.000000,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0
145581,0.0,0.0,0.0,0.0,4.067060e-07,0.000443,0.0,0.0,0.0,0.0,...,0.023529,1.000000,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0
145582,0.0,0.0,0.0,0.0,2.927706e-07,0.000233,0.0,0.0,0.0,0.0,...,0.062745,1.000000,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0
145583,0.0,0.0,0.0,0.0,4.196859e-07,0.000233,0.0,0.0,0.0,0.0,...,0.101961,1.000000,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0


In [7]:
# Results registry filled by the cross-validation cells: model name -> {fold number -> metrics}.
total_details = {}

In [8]:
# 5-fold cross-validation of logistic regression. Each fold's metrics plus its
# fit / predict wall-clock times are stored under total_details['lr'], keyed by
# fold number starting at 1.
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional row numbers, so select with .iloc; the
    # label-based .loc only works when the index happens to be 0..n-1.
    x_train, y_train, x_test, y_test = features.iloc[train_idx], target.iloc[train_idx], features.iloc[test_idx], target.iloc[test_idx]
    lr = LogisticRegression(max_iter=100000)
    train_start_time = time.perf_counter()
    lr.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = lr.predict(x_test)
    test_end_time = time.perf_counter()

    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail
total_details['lr'] = cv_details

In [9]:
# 5-fold cross-validation of a decision tree with the default split criterion.
# Each fold's metrics plus its fit / predict wall-clock times are stored under
# total_details['CART'], keyed by fold number starting at 1.
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional row numbers, so select with .iloc; the
    # label-based .loc only works when the index happens to be 0..n-1.
    x_train, y_train, x_test, y_test = features.iloc[train_idx], target.iloc[train_idx], features.iloc[test_idx], target.iloc[test_idx]
    # Seeded so the fitted tree, and therefore the recorded metrics, are
    # reproducible between runs.
    cart = DecisionTreeClassifier(random_state=1)
    train_start_time = time.perf_counter()
    cart.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = cart.predict(x_test)
    test_end_time = time.perf_counter()

    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail
total_details['CART'] = cv_details

In [10]:
# 5-fold cross-validation of a decision tree using the entropy split criterion.
# Each fold's metrics plus its fit / predict wall-clock times are stored under
# total_details['C4.5'], keyed by fold number starting at 1.
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional row numbers, so select with .iloc; the
    # label-based .loc only works when the index happens to be 0..n-1.
    x_train, y_train, x_test, y_test = features.iloc[train_idx], target.iloc[train_idx], features.iloc[test_idx], target.iloc[test_idx]
    # Seeded so the fitted tree, and therefore the recorded metrics, are
    # reproducible between runs.
    c45 = DecisionTreeClassifier(criterion='entropy', random_state=1)
    train_start_time = time.perf_counter()
    c45.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = c45.predict(x_test)
    test_end_time = time.perf_counter()

    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail
total_details['C4.5'] = cv_details

In [11]:
# 5-fold cross-validation of k-nearest neighbours with default settings. Each
# fold's metrics plus its fit / predict wall-clock times are stored under
# total_details['KNN'], keyed by fold number starting at 1.
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional row numbers, so select with .iloc; the
    # label-based .loc only works when the index happens to be 0..n-1.
    x_train, y_train, x_test, y_test = features.iloc[train_idx], target.iloc[train_idx], features.iloc[test_idx], target.iloc[test_idx]
    knn = KNeighborsClassifier()
    train_start_time = time.perf_counter()
    knn.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = knn.predict(x_test)
    test_end_time = time.perf_counter()

    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail
total_details['KNN'] = cv_details

In [12]:
# 5-fold cross-validation of Gaussian naive Bayes. Each fold's metrics plus its
# fit / predict wall-clock times are stored under total_details['NB'], keyed by
# fold number starting at 1.
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional row numbers, so select with .iloc; the
    # label-based .loc only works when the index happens to be 0..n-1.
    x_train, y_train, x_test, y_test = features.iloc[train_idx], target.iloc[train_idx], features.iloc[test_idx], target.iloc[test_idx]
    nb = GaussianNB()
    train_start_time = time.perf_counter()
    nb.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = nb.predict(x_test)
    test_end_time = time.perf_counter()

    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail
total_details['NB'] = cv_details

In [13]:
# 5-fold cross-validation of a random forest with default settings. Each fold's
# metrics plus its fit / predict wall-clock times are stored under
# total_details['RF'], keyed by fold number starting at 1.
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional row numbers, so select with .iloc; the
    # label-based .loc only works when the index happens to be 0..n-1.
    x_train, y_train, x_test, y_test = features.iloc[train_idx], target.iloc[train_idx], features.iloc[test_idx], target.iloc[test_idx]
    # Seeded so the bootstrap samples and feature draws, and therefore the
    # recorded metrics, are reproducible between runs.
    rf = RandomForestClassifier(random_state=1)
    train_start_time = time.perf_counter()
    rf.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = rf.predict(x_test)
    test_end_time = time.perf_counter()

    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail
total_details['RF'] = cv_details

In [14]:
# 5-fold cross-validation of a linear support vector classifier. Each fold's
# metrics plus its fit / predict wall-clock times are stored under
# total_details['SVC'], keyed by fold number starting at 1.
cv_details = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional row numbers, so select with .iloc; the
    # label-based .loc only works when the index happens to be 0..n-1.
    x_train, y_train, x_test, y_test = features.iloc[train_idx], target.iloc[train_idx], features.iloc[test_idx], target.iloc[test_idx]
    # Seeded so any data shuffling done by the solver is repeatable between runs.
    svc = LinearSVC(random_state=1)
    train_start_time = time.perf_counter()
    svc.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = svc.predict(x_test)
    test_end_time = time.perf_counter()

    detail = binary_evaluate(y_test, y_pred)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail
total_details['SVC'] = cv_details

In [15]:
# Flatten the nested {model: {fold: metrics}} results into one row per
# (model, fold) pair and write them to CSV.
pd.DataFrame.from_dict(
    {
        (model, fold): metrics
        for model, folds in total_details.items()
        for fold, metrics in folds.items()
    },
    orient='index',
).to_csv('./result/kdd99/binary/baseline.csv')

In [16]:
# Render the per-(model, fold) results table as the cell output.
pd.DataFrame.from_dict(
    {
        (model, fold): metrics
        for model, folds in total_details.items()
        for fold, metrics in folds.items()
    },
    orient='index',
)

Unnamed: 0,Unnamed: 1,precision,recall,f1_score,fpr,acc,training_time,testing_time
lr,1,0.979526,0.993496,0.986462,0.031412,0.983583,2.036082,0.005429
lr,2,0.982075,0.992518,0.987269,0.02839,0.984373,2.010949,0.00523
lr,3,0.981568,0.992647,0.987076,0.028253,0.984339,2.003623,0.004981
lr,4,0.98306,0.992482,0.987749,0.025488,0.985266,2.067944,0.005427
lr,5,0.981032,0.992597,0.986781,0.02916,0.983961,1.851368,0.005574
CART,1,0.998916,0.998859,0.998888,0.00164,0.998661,0.678102,0.006857
CART,2,0.998706,0.998425,0.998565,0.002028,0.998248,0.602858,0.006181
CART,3,0.999315,0.998119,0.998717,0.001037,0.998455,0.494066,0.00575
CART,4,0.999082,0.99891,0.998996,0.001368,0.998798,0.745399,0.006102
CART,5,0.998861,0.998861,0.998861,0.001731,0.998626,0.657902,0.006004
