In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from xgboost import XGBClassifier

In [19]:
read_file = './data/kdd99/kddcup_10p_preprocessing_five.csv'
df = pd.read_csv(read_file)
target = df.pop('label')

In [20]:
idx2label = {idx: label for idx, label in enumerate(target.unique())}
label2idx = {label: idx for idx, label in enumerate(target.unique())}
target = target.apply(lambda x: label2idx[x])

In [21]:
numerical_features = [x for x in df.columns if df[x].dtype == np.float64 or df[x].dtype == np.int64]
categorical_features = [x for x in df.columns if df[x].dtype == object]

In [22]:
lbe = LabelEncoder()
for feat in categorical_features:
    df[feat] = lbe.fit_transform(df[feat])

In [23]:
features = df

In [7]:
clf = XGBClassifier()
clf.fit(features, target)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=40, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [8]:
importance = pd.Series(clf.feature_importances_, index=features.columns).sort_values(ascending=False)

In [9]:
selected_features = importance[:21].index
selected_features

Index(['same_srv_rate', 'wrong_fragment', 'srv_serror_rate', 'diff_srv_rate',
       'num_compromised', 'count', 'hot', 'serror_rate',
       'dst_host_diff_srv_rate', 'protocol_type', 'src_bytes',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'logged_in',
       'dst_host_srv_serror_rate', 'num_failed_logins', 'srv_count',
       'dst_host_same_src_port_rate', 'root_shell', 'dst_bytes', 'service'],
      dtype='object')

In [10]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from metrics import multi_evaluate
import time
import copy

In [11]:
device = 'cuda'

In [12]:
class CustomDataset(Dataset):
    def __init__(self, data, target, transform=None, target_transform=None):
        self.data = data
        self.target = target
        self.transform = transform
        self.target_transform = target_transform
    
    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        features = self.data[idx]
        target = self.target[idx]
        # sample = {'features': features, 'target': target}
        return features, target

In [13]:
def create_dataloader(features, target, batch_size=64):
    dataset = CustomDataset(features.values, target.values, transform=ToTensor(), target_transform=ToTensor())

    dataloader = DataLoader(dataset, batch_size=batch_size)
    return dataloader

In [31]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)

    start_time = time.perf_counter()
    
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()

        # Compute prediction error
        pred = model(X.float())
        loss = loss_fn(pred, y)
        
        # Backpropagation
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            # print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

    end_time = time.perf_counter()

    return end_time - start_time

In [32]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0

    start_time = time.perf_counter()

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X.float())
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    end_time = time.perf_counter()
        
    test_loss /= size
    correct /= size
    
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return end_time - start_time

In [16]:
def evaluate(dataloader, model, device, idx2label):
    with torch.no_grad():
        input = torch.from_numpy(dataloader.dataset.data).float().to(device)
        out = model(input)
    y_pred = out.argmax(1).to('cpu').numpy()
    y_test = dataloader.dataset.target
    return multi_evaluate(y_test, y_pred, idx2label)

In [24]:
class NeuralNetwork(nn.Module):
    def __init__(self, num_features):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(num_features, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Linear(1024, 5),
            nn.Softmax()
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [28]:
mms = MinMaxScaler()
features[selected_features] = mms.fit_transform(features[selected_features])
features = features[selected_features]

In [35]:
cv_result = dict()
BATCH_SIZE = 1024
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    result = dict()
    x_train, x_test, y_train, y_test = features.loc[train_idx], features.loc[test_idx], target.loc[train_idx], target.loc[test_idx]
    train_dataloader = create_dataloader(x_train, y_train, batch_size=BATCH_SIZE)
    test_dataloader = create_dataloader(x_test, y_test, batch_size=BATCH_SIZE)
    device = 'cuda'
    model = NeuralNetwork(features.shape[1]).to(device)
    
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    epochs = 10
    training_time = 0.0
    testing_time = 0.0
    for t in range(epochs):
        # print(f'----------Epoch {t+1}----------')
        training_time += train(train_dataloader, model, loss_fn, optimizer)
        testing_time += test(test_dataloader, model, loss_fn)
    result = evaluate(test_dataloader, model, device, idx2label)
    result['training_time'] = training_time
    result['testing_time'] = testing_time
    cv_result[idx+1] = result

loss: 1.609658  [    0/116468]
loss: 1.904831  [102400/116468]
Test Error: 
 Accuracy: 60.5%, Avg loss: 0.001306 

loss: 0.904833  [    0/116468]
loss: 1.904831  [102400/116468]
Test Error: 
 Accuracy: 60.5%, Avg loss: 0.001306 

loss: 0.904833  [    0/116468]
loss: 1.904831  [102400/116468]
Test Error: 
 Accuracy: 60.5%, Avg loss: 0.001306 

loss: 0.904833  [    0/116468]
loss: 1.904831  [102400/116468]
Test Error: 
 Accuracy: 60.5%, Avg loss: 0.001306 

loss: 0.904833  [    0/116468]
loss: 1.904831  [102400/116468]
Test Error: 
 Accuracy: 60.5%, Avg loss: 0.001306 

loss: 0.904833  [    0/116468]
loss: 1.904831  [102400/116468]
Test Error: 
 Accuracy: 60.5%, Avg loss: 0.001306 

loss: 0.904833  [    0/116468]
loss: 1.904831  [102400/116468]
Test Error: 
 Accuracy: 60.5%, Avg loss: 0.001306 

loss: 0.904833  [    0/116468]
loss: 1.904831  [102400/116468]
Test Error: 
 Accuracy: 60.5%, Avg loss: 0.001306 

loss: 0.904833  [    0/116468]
loss: 1.904831  [102400/116468]
Test Error: 
 Acc

  p_tmp = multi_matrix[i][0,0] / (multi_matrix[i][0,0] + multi_matrix[i][1,0])
  input = module(input)


loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.001305 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.001305 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.001305 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.001305 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.001305 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.001305 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.001305 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.001305 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.6%, Avg loss: 0.00130

  p_tmp = multi_matrix[i][0,0] / (multi_matrix[i][0,0] + multi_matrix[i][1,0])
  input = module(input)


loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.3%, Avg loss: 0.001308 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.3%, Avg loss: 0.001308 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.3%, Avg loss: 0.001308 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.3%, Avg loss: 0.001308 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.3%, Avg loss: 0.001308 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.3%, Avg loss: 0.001308 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.3%, Avg loss: 0.001308 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.3%, Avg loss: 0.001308 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.3%, Avg loss: 0.00130

  p_tmp = multi_matrix[i][0,0] / (multi_matrix[i][0,0] + multi_matrix[i][1,0])
  input = module(input)


loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.1%, Avg loss: 0.001311 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.1%, Avg loss: 0.001311 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.1%, Avg loss: 0.001311 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.1%, Avg loss: 0.001311 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.1%, Avg loss: 0.001311 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.1%, Avg loss: 0.001311 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.1%, Avg loss: 0.001311 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.1%, Avg loss: 0.001311 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.1%, Avg loss: 0.00131

  p_tmp = multi_matrix[i][0,0] / (multi_matrix[i][0,0] + multi_matrix[i][1,0])
  input = module(input)


loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.001309 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.001309 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.001309 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.001309 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.001309 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.001309 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.001309 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.001309 

loss: 0.904833  [    0/116469]
loss: 1.904831  [102400/116469]
Test Error: 
 Accuracy: 60.2%, Avg loss: 0.00130

  p_tmp = multi_matrix[i][0,0] / (multi_matrix[i][0,0] + multi_matrix[i][1,0])


In [36]:
result = pd.DataFrame.from_dict({(j, i): cv_result[i][j] for i in cv_result.keys() for j in cv_result[i].keys()}).T
result.index.names = ['type', 'cv']

In [41]:
result.groupby(['type']).mean().to_csv('./result/kdd99/five/xgboost_dnn.csv')