In [1]:
import copy
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold

from metrics import binary_evaluate

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from metrics import multi_evaluate

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
def preprocessing(raw_data, label='label'):
    """Split off the target column and encode/scale the remaining features.

    The input frame is copied first, so ``raw_data`` itself is not modified.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Table holding the feature columns plus the target column.
    label : str, default 'label'
        Name of the target column that is popped from the copy.

    Returns
    -------
    df : pandas.DataFrame
        Feature frame in which object-dtype columns are replaced by
        ``LabelEncoder`` integer codes and integer/float columns are
        rescaled with ``MinMaxScaler``.
    target : pandas.Series
        The popped target column.
    numerical_features : list
        Names of the columns that were scaled.
    categorical_features : list
        Names of the columns that were label-encoded.

    Raises
    ------
    KeyError
        If ``label`` is not a column of ``raw_data``.

    Notes
    -----
    Encoder and scaler are fitted on every row passed in. When the result is
    later split into train/test folds, the scaling statistics therefore also
    reflect the held-out rows.
    """
    df = raw_data.copy()
    target = df.pop(label)

    # Accept any integer/float width (int32, float32, unsigned, ...), not only
    # int64/float64; boolean columns are deliberately left untouched.
    numerical_features = [
        col for col in df.columns
        if pd.api.types.is_integer_dtype(df[col].dtype)
        or pd.api.types.is_float_dtype(df[col].dtype)
    ]
    categorical_features = [col for col in df.columns if df[col].dtype == object]

    # convert object to int
    if categorical_features:
        lbe = LabelEncoder()
        for feat in categorical_features:
            # fit_transform refits the encoder, so one instance can be reused.
            df[feat] = lbe.fit_transform(df[feat])

    # normalize the features; skipped when there is nothing to scale because
    # the scaler rejects a frame with zero columns
    if numerical_features:
        mms = MinMaxScaler()
        df[numerical_features] = mms.fit_transform(df[numerical_features])

    return df, target, numerical_features, categorical_features

In [7]:
# Load the dataset CSV.
train_path = './data/kdd99/kddcup_10p_preprocessing_five.csv'
raw_data = pd.read_csv(train_path)
# NOTE: an earlier variant of this cell copied a 'types' column into 'label'
# and dropped 'types'; that step is disabled, so the CSV is expected to
# already contain a 'label' column.

# Build both lookup tables from one pass over the distinct labels; codes
# follow the order in which each class first appears in the file.
idx2type = dict(enumerate(raw_data['label'].unique()))
type2idx = {name: code for code, name in idx2type.items()}

# Replace the class names by their integer codes.
raw_data['label'] = raw_data['label'].apply(lambda name: type2idx[name]).astype(int)

In [8]:
# Separate the label column and encode/scale the features once, up front;
# the cross-validation cells below all reuse `features` and `target`.
features, target, numerical_features, categorical_features = preprocessing(raw_data)

In [9]:
# Accumulates every learner's per-fold results, keyed by learner name.
total_details = {}

In [10]:
# 5-fold cross-validation of logistic regression. Per-fold metrics plus
# wall-clock fit/predict times are stored under total_details['lr'].
cv_details = dict()
# Same KFold settings as the other learner cells, so every learner is
# evaluated on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # .loc is label-based and only matches while the index is a default RangeIndex.
    x_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
    x_test, y_test = features.iloc[test_idx], target.iloc[test_idx]
    lr = LogisticRegression(max_iter=100000)
    train_start_time = time.perf_counter()
    lr.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = lr.predict(x_test)
    test_end_time = time.perf_counter()

    # multi_evaluate is a project-local helper; its result is used as a
    # mapping here and the two timings are added as extra keys.
    detail = multi_evaluate(y_test, y_pred, idx2type)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail  # folds are numbered from 1
total_details['lr'] = cv_details

In [11]:
# 5-fold cross-validation of a decision tree with the default split
# criterion. Per-fold metrics plus wall-clock fit/predict times are stored
# under total_details['CART'].
cv_details = dict()
# Same KFold settings as the other learner cells, so every learner is
# evaluated on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # .loc is label-based and only matches while the index is a default RangeIndex.
    x_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
    x_test, y_test = features.iloc[test_idx], target.iloc[test_idx]
    # Seeded so that repeated runs of this cell give the same tree.
    cart = DecisionTreeClassifier(random_state=1)
    train_start_time = time.perf_counter()
    cart.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = cart.predict(x_test)
    test_end_time = time.perf_counter()

    # multi_evaluate is a project-local helper; its result is used as a
    # mapping here and the two timings are added as extra keys.
    detail = multi_evaluate(y_test, y_pred, idx2type)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail  # folds are numbered from 1
total_details['CART'] = cv_details

In [12]:
# 5-fold cross-validation of a decision tree using the entropy criterion.
# Per-fold metrics plus wall-clock fit/predict times are stored under
# total_details['C4.5'].
cv_details = dict()
# Same KFold settings as the other learner cells, so every learner is
# evaluated on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # .loc is label-based and only matches while the index is a default RangeIndex.
    x_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
    x_test, y_test = features.iloc[test_idx], target.iloc[test_idx]
    # Seeded so that repeated runs of this cell give the same tree.
    c45 = DecisionTreeClassifier(criterion='entropy', random_state=1)
    train_start_time = time.perf_counter()
    c45.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = c45.predict(x_test)
    test_end_time = time.perf_counter()

    # multi_evaluate is a project-local helper; its result is used as a
    # mapping here and the two timings are added as extra keys.
    detail = multi_evaluate(y_test, y_pred, idx2type)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail  # folds are numbered from 1
total_details['C4.5'] = cv_details

In [13]:
# 5-fold cross-validation of k-nearest neighbours with default settings.
# Per-fold metrics plus wall-clock fit/predict times are stored under
# total_details['KNN'].
cv_details = dict()
# Same KFold settings as the other learner cells, so every learner is
# evaluated on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # .loc is label-based and only matches while the index is a default RangeIndex.
    x_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
    x_test, y_test = features.iloc[test_idx], target.iloc[test_idx]
    knn = KNeighborsClassifier()
    train_start_time = time.perf_counter()
    knn.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = knn.predict(x_test)
    test_end_time = time.perf_counter()

    # multi_evaluate is a project-local helper; its result is used as a
    # mapping here and the two timings are added as extra keys.
    detail = multi_evaluate(y_test, y_pred, idx2type)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail  # folds are numbered from 1
total_details['KNN'] = cv_details

In [14]:
# 5-fold cross-validation of Gaussian naive Bayes. Per-fold metrics plus
# wall-clock fit/predict times are stored under total_details['NB'].
cv_details = dict()
# Same KFold settings as the other learner cells, so every learner is
# evaluated on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # .loc is label-based and only matches while the index is a default RangeIndex.
    x_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
    x_test, y_test = features.iloc[test_idx], target.iloc[test_idx]
    nb = GaussianNB()
    train_start_time = time.perf_counter()
    nb.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = nb.predict(x_test)
    test_end_time = time.perf_counter()

    # multi_evaluate is a project-local helper; its result is used as a
    # mapping here and the two timings are added as extra keys.
    detail = multi_evaluate(y_test, y_pred, idx2type)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail  # folds are numbered from 1
total_details['NB'] = cv_details

In [15]:
# 5-fold cross-validation of a random forest with default hyper-parameters.
# Per-fold metrics plus wall-clock fit/predict times are stored under
# total_details['RF'].
cv_details = dict()
# Same KFold settings as the other learner cells, so every learner is
# evaluated on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # .loc is label-based and only matches while the index is a default RangeIndex.
    x_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
    x_test, y_test = features.iloc[test_idx], target.iloc[test_idx]
    # Seeded so that repeated runs of this cell give the same forest.
    rf = RandomForestClassifier(random_state=1)
    train_start_time = time.perf_counter()
    rf.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = rf.predict(x_test)
    test_end_time = time.perf_counter()

    # multi_evaluate is a project-local helper; its result is used as a
    # mapping here and the two timings are added as extra keys.
    detail = multi_evaluate(y_test, y_pred, idx2type)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail  # folds are numbered from 1
total_details['RF'] = cv_details

In [17]:
# 5-fold cross-validation of a linear SVM. Per-fold metrics plus wall-clock
# fit/predict times are stored under total_details['SVC'].
cv_details = dict()
# Same KFold settings as the other learner cells, so every learner is
# evaluated on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold.split yields positional indices, so rows are selected with .iloc;
    # .loc is label-based and only matches while the index is a default RangeIndex.
    x_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
    x_test, y_test = features.iloc[test_idx], target.iloc[test_idx]
    # Seeded so that repeated runs of this cell give the same model.
    svc = LinearSVC(random_state=1)
    train_start_time = time.perf_counter()
    svc.fit(x_train, y_train)
    train_end_time = time.perf_counter()

    test_start_time = time.perf_counter()
    y_pred = svc.predict(x_test)
    test_end_time = time.perf_counter()

    # multi_evaluate is a project-local helper; its result is used as a
    # mapping here and the two timings are added as extra keys.
    detail = multi_evaluate(y_test, y_pred, idx2type)
    detail['training_time'] = train_end_time - train_start_time
    detail['testing_time'] = test_end_time - test_start_time
    cv_details[idx+1] = detail  # folds are numbered from 1
total_details['SVC'] = cv_details



In [18]:
# Flatten learner -> fold -> entry into columns keyed by a 3-tuple, then
# transpose so that each (learner, fold, entry) triple becomes one row.
# Scalar entries such as the timings end up repeated across every column.
result = pd.DataFrame.from_dict(
    {
        (learner, fold, entry): value
        for learner, folds in total_details.items()
        for fold, detail in folds.items()
        for entry, value in detail.items()
    }
).T


In [19]:
# Name the three index levels: learner key, fold number, and the per-fold
# entry (a class name or a timing key).
result.index.names = ['learner', 'cv', 'type']

In [20]:
# Show the assembled results table.
result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,acc,fpr,precision,recall,f1_score
learner,cv,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
lr,1,normal,0.988358,0.003179,0.995032,0.975391,0.985114
lr,1,u2r,0.999760,0.777778,0.999760,1.000000,0.999880
lr,1,dos,0.993337,0.015709,0.990782,0.998688,0.994719
lr,1,r2l,0.997459,0.169725,0.998720,0.998720,0.998720
lr,1,probe,0.996703,0.188470,0.997043,0.999616,0.998328
...,...,...,...,...,...,...,...
SVC,5,dos,0.993234,0.015932,0.990430,0.998787,0.994591
SVC,5,r2l,0.998077,0.133005,0.999066,0.998997,0.999032
SVC,5,probe,0.996325,0.243108,0.996633,0.999652,0.998140
SVC,5,training_time,48.960213,48.960213,48.960213,48.960213,48.960213


In [21]:
# Persist the results table. The output directory is created first so that a
# missing folder cannot fail the write after all models have been trained.
result_path = './result/kdd99/five/baseline.csv'
os.makedirs(os.path.dirname(result_path), exist_ok=True)
result.to_csv(result_path)