In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from metrics import binary_evaluate
from xgboost import XGBClassifier

In [3]:
read_file = './data/kdd99/kddcup_10p_preprocessing.csv'
# test_file = './data/nslkdd/KDDTest_binary.csv'
df = pd.read_csv(read_file)
# test_df = pd.read_csv(test_file)

In [4]:
numerical_features = [x for x in df.columns if df[x].dtype == np.float64 or df[x].dtype == np.int64]
categorical_features = [x for x in df.columns if df[x].dtype == object]
def discretization(x):
    mapp = dict(zip(x.unique().tolist(),
         range(len(x.unique().tolist()))))
    def mapfunction(y):
        if y in mapp:
            return mapp[y]
        else:
            return -1
    return mapfunction
for i in categorical_features:
    df[i] = df[i].apply(discretization(df[i]))
    # test_df[i] = test_df[i].apply(discretization(test_df[i]))
categorical_features.remove('label')

In [8]:
features = df
target = features.pop('label')

In [9]:
clf = XGBClassifier()
clf.fit(features, target)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=40, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [16]:
importance = pd.Series(clf.feature_importances_, index=features.columns).sort_values(ascending=False)

In [20]:
selected_features = importance[:21].index

In [21]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from metrics import binary_evaluate
import time
import copy

In [22]:
device = 'cuda'

In [23]:
class CustomDataset(Dataset):
    def __init__(self, data, target, transform=None, target_transform=None):
        self.data = data
        self.target = target
        self.transform = transform
        self.target_transform = target_transform
    
    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        features = self.data[idx]
        target = self.target[idx]
        # sample = {'features': features, 'target': target}
        return features, target

In [24]:
def preprocessing(raw_data, cols):
    df = copy.deepcopy(raw_data)
    label = 'label'
    target = df.pop(label)
    df = df[cols]  
    
    numerical_features = [x for x in df.columns if df[x].dtype == np.float64 or df[x].dtype == np.int64]
    categorical_features = [x for x in df.columns if df[x].dtype == object]
    
    # convert object to int
    lbe = LabelEncoder()
    for feat in categorical_features:
        df[feat] = lbe.fit_transform(df[feat])
    
    # normalize the features
    mms = MinMaxScaler()
    df[numerical_features] = mms.fit_transform(df[numerical_features])
    
    return df, target, numerical_features, categorical_features

In [25]:
def create_dataloader(features, target, batch_size=64):
    dataset = CustomDataset(features.values, target.values, transform=ToTensor(), target_transform=ToTensor())

    dataloader = DataLoader(dataset, batch_size=batch_size)
    return dataloader

In [26]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)

    start_time = time.perf_counter()
    
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        
        # Compute prediction error
        pred = model(X.float())
        loss = loss_fn(pred, y)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            # print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

    end_time = time.perf_counter()

    return end_time - start_time

In [27]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0

    start_time = time.perf_counter()

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X.float())
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    end_time = time.perf_counter()
        
    test_loss /= size
    correct /= size
    
    # print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return end_time - start_time

In [28]:
def evaluate(dataloader, model, device):
    with torch.no_grad():
        input = torch.from_numpy(dataloader.dataset.data).float().to(device)
        out = model(input)
    y_pred = out.argmax(1).to('cpu').numpy()
    y_test = dataloader.dataset.target
    return binary_evaluate(y_test, y_pred)

In [29]:
class NeuralNetwork(nn.Module):
    def __init__(self, num_features):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(num_features, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 2),
            nn.Softmax()
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [37]:
mms = MinMaxScaler()
features[categorical_features] = mms.fit_transform(features[categorical_features])

In [40]:
features = features[selected_features]

In [42]:
features.shape

(145585, 21)

In [56]:
cv_result = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    result = dict()
    x_train, x_test, y_train, y_test = features.loc[train_idx], features.loc[test_idx], target.loc[train_idx], target.loc[test_idx]
    train_dataloader = create_dataloader(x_train, y_train, batch_size=1024)
    test_dataloader = create_dataloader(x_test, y_test, batch_size=1024)
    device = 'cuda'
    model = NeuralNetwork(features.shape[1]).to(device)
    
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    epochs = 20
    training_time = 0.0
    testing_time = 0.0
    for t in range(epochs):
        # print(f'----------Epoch {t+1}----------')
        training_time += train(train_dataloader, model, loss_fn, optimizer)
        testing_time += test(test_dataloader, model, loss_fn)
    result = evaluate(test_dataloader, model, device)
    result['training_time'] = training_time
    result['testing_time'] = testing_time
    cv_result[idx+1] = result

----------Epoch 1----------


  input = module(input)


----------Epoch 2----------
----------Epoch 3----------
----------Epoch 4----------
----------Epoch 5----------
----------Epoch 6----------
----------Epoch 7----------
----------Epoch 8----------
----------Epoch 9----------
----------Epoch 10----------
----------Epoch 11----------
----------Epoch 12----------
----------Epoch 13----------
----------Epoch 14----------
----------Epoch 15----------
----------Epoch 16----------
----------Epoch 17----------
----------Epoch 18----------
----------Epoch 19----------
----------Epoch 20----------
----------Epoch 1----------


  input = module(input)


----------Epoch 2----------
----------Epoch 3----------
----------Epoch 4----------
----------Epoch 5----------
----------Epoch 6----------
----------Epoch 7----------
----------Epoch 8----------
----------Epoch 9----------
----------Epoch 10----------
----------Epoch 11----------
----------Epoch 12----------
----------Epoch 13----------
----------Epoch 14----------
----------Epoch 15----------
----------Epoch 16----------
----------Epoch 17----------
----------Epoch 18----------
----------Epoch 19----------
----------Epoch 20----------
----------Epoch 1----------


  input = module(input)


----------Epoch 2----------
----------Epoch 3----------
----------Epoch 4----------
----------Epoch 5----------
----------Epoch 6----------
----------Epoch 7----------
----------Epoch 8----------
----------Epoch 9----------
----------Epoch 10----------
----------Epoch 11----------
----------Epoch 12----------
----------Epoch 13----------
----------Epoch 14----------
----------Epoch 15----------
----------Epoch 16----------
----------Epoch 17----------
----------Epoch 18----------
----------Epoch 19----------
----------Epoch 20----------
----------Epoch 1----------


  input = module(input)


----------Epoch 2----------
----------Epoch 3----------
----------Epoch 4----------
----------Epoch 5----------
----------Epoch 6----------
----------Epoch 7----------
----------Epoch 8----------
----------Epoch 9----------
----------Epoch 10----------
----------Epoch 11----------
----------Epoch 12----------
----------Epoch 13----------
----------Epoch 14----------
----------Epoch 15----------
----------Epoch 16----------
----------Epoch 17----------
----------Epoch 18----------
----------Epoch 19----------
----------Epoch 20----------
----------Epoch 1----------


  input = module(input)


----------Epoch 2----------
----------Epoch 3----------
----------Epoch 4----------
----------Epoch 5----------
----------Epoch 6----------
----------Epoch 7----------
----------Epoch 8----------
----------Epoch 9----------
----------Epoch 10----------
----------Epoch 11----------
----------Epoch 12----------
----------Epoch 13----------
----------Epoch 14----------
----------Epoch 15----------
----------Epoch 16----------
----------Epoch 17----------
----------Epoch 18----------
----------Epoch 19----------
----------Epoch 20----------


In [59]:
pd.DataFrame(cv_result).mean(1).to_csv('./result/kdd99/binary_embedding/xgboost_dnn.csv')