In [None]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from metrics import multi_evaluate
import time
import copy
from torch_geometric.nn import GCNConv,GATConv,SAGEConv
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Prefer the GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing a feature container with a target container.

    ``transform`` and ``target_transform`` are kept as attributes but are not
    applied when an item is fetched.
    """

    def __init__(self, data, target, transform=None, target_transform=None):
        self.data, self.target = data, target
        self.transform, self.target_transform = transform, target_transform

    def __len__(self):
        # The sample count is taken from the targets, not from the features.
        return len(self.target)

    def __getitem__(self, idx):
        # Items come back as a plain (features, target) tuple.
        return self.data[idx], self.target[idx]

In [None]:
def preprocessing(raw_data, cols):
    """Select feature columns, integer-encode the object ones and scale the numeric ones.

    :param raw_data: DataFrame holding a ``'label'`` column plus the features;
        it is copied, not modified
    :param cols: column labels to keep as features
    :return: tuple ``(df, target, numerical_features, categorical_features)``
        where ``df`` is the processed feature frame, ``target`` the popped
        label column and the two lists name the columns treated as numeric
        (float64/int64) and categorical (object) respectively
    """
    label = 'label'
    df = raw_data.copy()
    target = df.pop(label)
    # Work on an explicit copy of the selection so the column assignments
    # below write to an independent frame.
    df = df[cols].copy()

    # Column kinds are decided before encoding, so encoded categoricals are
    # not min-max scaled afterwards.
    numerical_features = [x for x in df.columns if df[x].dtype == np.float64 or df[x].dtype == np.int64]
    categorical_features = [x for x in df.columns if df[x].dtype == object]

    # convert object to int
    lbe = LabelEncoder()
    for feat in categorical_features:
        df[feat] = lbe.fit_transform(df[feat])

    # normalize the features; skipped when no numeric column was selected,
    # because the scaler cannot be fitted on a frame with zero columns
    if numerical_features:
        mms = MinMaxScaler()
        df[numerical_features] = mms.fit_transform(df[numerical_features])

    return df, target, numerical_features, categorical_features

In [None]:
def create_dataloader(features, target, batch_size=64):
    """Wrap features and targets (objects exposing ``.values``) in a ``DataLoader``.

    :param features: feature container; its ``.values`` become the dataset data
    :param target: target container; its ``.values`` become the dataset targets
    :param batch_size: int, number of samples per batch
    :return: DataLoader over a ``CustomDataset``
    """
    return DataLoader(
        CustomDataset(
            features.values,
            target.values,
            transform=ToTensor(),
            target_transform=ToTensor(),
        ),
        batch_size=batch_size,
    )

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader`` and time it.

    :param dataloader: iterable yielding ``(X, y)`` batches of tensors
    :param model: module called as ``model(X.float())``
    :param loss_fn: callable ``loss_fn(pred, y)`` returning a scalar loss tensor
    :param optimizer: optimizer stepping the model's parameters
    :return: float, wall-clock seconds spent in the batch loop
    """
    # test() leaves the model in eval mode; switch back so dropout and similar
    # layers act as training layers on every epoch, not only the first one.
    model.train()

    start_time = time.perf_counter()

    for X, y in dataloader:
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        loss = loss_fn(model(X.float()), y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return time.perf_counter() - start_time

In [None]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0

    start_time = time.perf_counter()

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X.float())
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    end_time = time.perf_counter()
        
    test_loss /= size
    correct /= size
    
    # print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return end_time - start_time

In [None]:
def evaluate(dataloader, model, device, idx2label):
    """Predict the whole dataset in a single forward pass and score the result.

    :param dataloader: loader whose ``dataset`` exposes a numpy ``data`` array
        and a ``target`` container
    :param model: module called with the full feature tensor; switched to
        eval mode
    :param device: device the feature tensor is moved to
    :param idx2label: passed through to ``multi_evaluate`` unchanged
    :return: whatever ``multi_evaluate(y_test, y_pred, idx2label)`` returns
    """
    # Scoring must not depend on the mode the previous caller left the model
    # in; with train mode active, dropout would randomise the predictions.
    model.eval()
    dataset = dataloader.dataset
    with torch.no_grad():
        inputs = torch.from_numpy(dataset.data).float().to(device)
        out = model(inputs)
    # Predicted class = index of the largest score along dim 1.
    y_pred = out.argmax(1).to('cpu').numpy()
    y_test = dataset.target
    return multi_evaluate(y_test, y_pred, idx2label)

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: three hidden ReLU layers followed by a softmax.

    :param num_features: int, width of each input row
    :param num_classes: int, number of output classes (default 5)
    """

    def __init__(self, num_features, num_classes=5):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(num_features, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
            # The axis is given explicitly: leaving it implicit is deprecated
            # and warns on every call. -1 normalises over the class axis for
            # a single row as well as for a batch of rows.
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Return per-class probabilities for ``x`` (softmax is the last layer)."""
        # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself; if this
        # model is trained with that loss the Softmax layer is probably
        # unwanted - confirm before use.
        return self.linear_relu_stack(x)

In [None]:
def linear(inp, out, dropout):
    """
    Build a dense block: affine layer, LeakyReLU activation, then dropout.
    :param inp: int, input dimension of the linear layer
    :param out: int, output dimension of the linear layer
    :param dropout: float, dropout probability applied after the activation
    :return: nn.Sequential holding the three layers in that order
    """
    layers = [nn.Linear(inp, out), nn.LeakyReLU(), nn.Dropout(dropout)]
    return nn.Sequential(*layers)

In [None]:
class GNN_embedding(nn.Module):
    """Stack of graph-convolution layers whose hidden widths can be grown in place.

    ``channels_gnn`` lists the input width followed by the hidden widths;
    ``num_class`` is added as the width of the last layer, and one convolution
    is built per consecutive pair of widths. The MLP head that used to follow
    the convolutions is disabled, so ``channels_mlp`` and ``bias_mlp`` are
    accepted but ignored.

    :param channels_gnn: list of int, input width then hidden widths; the
        caller's list is not modified
    :param channels_mlp: unused
    :param num_class: int, output width of the last convolution
    :param heads: int, number of attention heads (used by ``"gat"`` only)
    :param mp_nn: str, layer type: ``"gcn"``, ``"gat"`` or ``"sg"``
    :param bias_gnn: bool, whether the convolutions carry a bias
    :param bias_mlp: unused
    """

    def __init__(self,
                 channels_gnn,
                 channels_mlp=None,
                 num_class=None,
                 heads=1,
                 mp_nn="gcn",
                 bias_gnn=True,
                 bias_mlp=True):
        super().__init__()

        # Build a private width list rather than appending to the caller's
        # one, which would grow on every construction from the same list.
        self.channels_gnn = list(channels_gnn) + [num_class]
        self.mp_nn = mp_nn
        self.bias_gnn = bias_gnn
        self.heads = heads
        self.gnn = nn.ModuleList([
            self._make_conv(c_in, c_out)
            for c_in, c_out in zip(self.channels_gnn[:-1], self.channels_gnn[1:])
        ])

    def _make_conv(self, in_channels, out_channels):
        """Instantiate one convolution of the configured type.

        :raises ValueError: if ``mp_nn`` is not ``"gcn"``, ``"gat"`` or ``"sg"``
        """
        if self.mp_nn == "gcn":
            return GCNConv(in_channels=in_channels, out_channels=out_channels, bias=self.bias_gnn)
        if self.mp_nn == "gat":
            return GATConv(in_channels=in_channels, out_channels=out_channels, heads=self.heads, bias=self.bias_gnn, concat=False)
        if self.mp_nn == "sg":
            return SAGEConv(in_channels=in_channels, out_channels=out_channels, bias=self.bias_gnn)
        raise ValueError("Check GNN type!")

    def model_parameters(self, model):
        """Return ``model.state_dict()``."""
        return model.state_dict()

    def weight_update_gnn(self, wgt_add):
        """Widen every hidden layer and carry the existing weights over.

        Each convolution is rebuilt with the new widths on the device of the
        layer it replaces; every tensor of the old layer is then copied into
        the leading block of the same-named tensor of the new layer, so the
        added rows/columns keep their fresh initialisation.

        The rebuilt layers own new parameter objects: an optimizer created
        before this call still refers to the replaced ones.

        :param wgt_add: sequence of int, one width increment per hidden layer
        """
        assert len(self.gnn) -1== len(wgt_add), "Match number of GNNs and feature additions"

        # Only the hidden widths grow; the input width and the class width
        # at either end stay fixed.
        for i in range(1, len(self.channels_gnn) - 1):
            self.channels_gnn[i] = self.channels_gnn[i] + wgt_add[i - 1]

        for i in range(len(self.gnn)):
            old_layer = self.gnn[i]
            old_state = self.model_parameters(old_layer)

            new_layer = self._make_conv(self.channels_gnn[i], self.channels_gnn[i + 1])
            # A freshly built layer lives on the default device; keep it where
            # the layer it replaces was, so a model already moved to the GPU
            # stays usable.
            old_param = next(old_layer.parameters(), None)
            if old_param is not None:
                new_layer = new_layer.to(old_param.device)

            # state_dict() tensors share storage with the parameters, so
            # writing into them updates the new layer in place.
            # NOTE(review): for "gat" with heads > 1 the projection rows are
            # presumably grouped per head, so a leading-block copy may not
            # line heads up once the width changes - confirm against the
            # GATConv weight layout.
            new_state = new_layer.state_dict()
            with torch.no_grad():
                for name, old_tensor in old_state.items():
                    if name in new_state:
                        block = tuple(slice(0, n) for n in old_tensor.shape)
                        new_state[name][block] = old_tensor

            self.gnn[i] = new_layer

    def weight_update(self, wgt_gnn=None, wgt_mlp=None):
        """Grow the convolution stack; a ``None`` ``wgt_gnn`` leaves it untouched.

        :param wgt_gnn: width increments forwarded to ``weight_update_gnn``
        :param wgt_mlp: unused
        """
        if wgt_gnn is not None:
            self.weight_update_gnn(wgt_gnn)

    def forward(self,x,edge_index):
        """Apply every convolution, each followed by leaky-ReLU and dropout.

        :param x: node feature tensor passed to the first convolution
        :param edge_index: passed unchanged to every convolution
        :return: output of the last convolution after activation and dropout
        """
        # The activation and dropout are applied after the last layer as well.
        for conv in self.gnn:
            x = conv(x, edge_index)
            x = F.leaky_relu(x)
            x = F.dropout(x, training=self.training)
        return x


# def equal_(self):
#     for (n,p),(_,pc) in zip(self.model.named_parameters(),self.mc.named_parameters()):
#         # print(i,n,p.grad,sep='\t')
#         if not torch.all(p.eq(pc)).data:
#             print(n,"\n", p.eq(pc),sep='\t')
#         else:
#             return True
# print("Initial parameters")

# edge_index = torch.tensor([[0, 1],
#                            [1, 0],
#                            [1, 2],
#                            [2, 1],
#                            ], dtype=torch.long).t().contiguous()

# x = torch.tensor([[-1,2], [0,3], [1,5]], dtype=torch.float)

# geo = GraphLayer(channels_gnn = [x.shape[1],3,5], channels_mlp=[8,3], num_class=5)

# for n,p in geo.named_parameters():
#     print(n,p,end="\n")

# out_ini = geo(x,edge_index)
# print("Output",out_ini)

# print("--"*120,"\nUpdated parameters")
# wgt_add_gnn = [1,2]
# wgt_add_mlp = [7,3]
# geo.weight_update(wgt_add_gnn,wgt_add_mlp)

# for n,p in geo.named_parameters():
#     print(n,p,end="\n")

# out_upd = geo(x,edge_index)
# print(out_upd)

# print("*"*120)
# print("Update again!")
# wgt_add_gnn = [4,2]
# wgt_add_mlp = [7,5]
# geo.weight_update(wgt_add_gnn,wgt_add_mlp)

# for n,p in geo.named_parameters():
#     print(n,p,end="\n")
# print("-"*100)
# out_upd = geo(x,edge_index)
# print(out_upd)

In [None]:
# Load the preprocessed table and replace each label name by an integer id.
raw_data = pd.read_csv('./data/kdd99/kddcup_10p_preprocessing_five.csv')

# Ids follow the order in which the label names first appear in the column.
idx2type = dict(enumerate(raw_data['label'].unique()))
type2idx = {name: idx for idx, name in idx2type.items()}
raw_data['label'] = raw_data['label'].apply(lambda name: type2idx[name]).astype(int)

In [None]:
# Feature names are the index of the importance table; keep the first 20 rows
# (presumably the table is ordered by importance - verify against the file).
importance = pd.read_csv('./importance/kdd99/five/importance_70.csv', index_col=0)
cols = importance.index[:20]

In [None]:
# Split off the label column and encode/scale the selected feature columns.
features, target, numerical_features, categorical_features = preprocessing(raw_data, cols)

In [None]:
# Column name -> position of that column in the feature frame.
feat_dict = {name: position for position, name in enumerate(features.columns)}
embedding_dim = 5
# Per categorical column: (number of distinct non-null values, embedding width).
embedding_feat = {
    name: (features[name].value_counts().count(), embedding_dim)
    for name in categorical_features
}

In [None]:
# 5-fold cross-validation on the currently selected features: every fold
# trains a fresh model and stores its per-class metrics plus timings.
BATCH_SIZE = 1024
cv_result = dict()
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
    # KFold yields positional row numbers, so rows are selected by position;
    # label-based .loc only agrees with that for a default 0..n-1 index.
    x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]
    train_dataloader = create_dataloader(x_train, y_train, batch_size=BATCH_SIZE)
    test_dataloader = create_dataloader(x_test, y_test, batch_size=BATCH_SIZE)

    # NOTE(review): GNN_embedding as defined in this notebook takes
    # (channels_gnn, channels_mlp, num_class, heads, mp_nn, ...) and its
    # forward needs (x, edge_index), while this call passes two dicts, two
    # lists and 5, and train/test/evaluate call the model with features only.
    # This cell was presumably written for a different model class - confirm
    # before running.
    model = GNN_embedding(feat_dict, embedding_feat, [512, 512, 512], [0, 0, 0], 5).to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    epochs = 20
    training_time = 0.0
    testing_time = 0.0
    for t in range(epochs):
        print(f'----------Epoch {t+1}----------')
        training_time += train(train_dataloader, model, loss_fn, optimizer)
        testing_time += test(test_dataloader, model, loss_fn)
    result = evaluate(test_dataloader, model, device, idx2type)
    # Attach the accumulated timings to every per-class entry of the result.
    for key in type2idx.keys():
        result[key]['training_time'] = training_time
        result[key]['testing_time'] = testing_time
    cv_result[idx+1] = result

In [None]:
# Sweep over sample rates and, for each, over the number of leading features
# of the importance table; every setting is scored with 5-fold
# cross-validation and written to one CSV per sample rate.
total_result = dict()
embedding_dim = 5
BATCH_SIZE = 1024
epochs = 10
# With an integer random_state every split() call yields the same folds, so a
# single splitter serves all settings.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for sample_rate in range(10, 110, 10):
    importance = pd.read_csv('./importance/kdd99/five/importance_' + str(sample_rate) + '.csv', index_col=0)
    cer_sample_result = dict()
    print(f'----------Sample rate is {sample_rate}----------')

    for i in range(1, len(importance)+1):
        print(f'----------Feature number is {i}----------')
        cols = importance.index[:i]
        features, target, numerical_features, categorical_features = preprocessing(raw_data, cols)

        feat_dict = {feat : idx for idx, feat in enumerate(features.columns)}
        embedding_feat = {feat: (features[feat].value_counts().count(),embedding_dim) for feat in categorical_features}
        cv_result = dict()
        for idx, (train_idx, test_idx) in enumerate(kf.split(features, target)):
            # KFold yields positional row numbers, so rows are selected by
            # position; label-based .loc only agrees with that for a default
            # 0..n-1 index.
            x_train, x_test = features.iloc[train_idx], features.iloc[test_idx]
            y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]
            train_dataloader = create_dataloader(x_train, y_train, batch_size=BATCH_SIZE)
            test_dataloader = create_dataloader(x_test, y_test, batch_size=BATCH_SIZE)

            # NOTE(review): GNN_embedding as defined in this notebook takes
            # (channels_gnn, channels_mlp, num_class, heads, mp_nn, ...) and
            # its forward needs (x, edge_index), while this call passes two
            # dicts, two lists and 5, and train/test/evaluate call the model
            # with features only. This cell was presumably written for a
            # different model class - confirm before running.
            model = GNN_embedding(feat_dict, embedding_feat, [512, 512, 512], [0, 0, 0], 5).to(device)

            loss_fn = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
            training_time = 0.0
            testing_time = 0.0
            for t in range(epochs):
                training_time += train(train_dataloader, model, loss_fn, optimizer)
                testing_time += test(test_dataloader, model, loss_fn)
            result = evaluate(test_dataloader, model, device, idx2type)
            # Attach the accumulated timings to every per-class entry.
            for key in type2idx.keys():
                result[key]['training_time'] = training_time
                result[key]['testing_time'] = testing_time
            cv_result[idx+1] = result
        cer_sample_result[i] = cv_result
    total_result[sample_rate] = cer_sample_result
    # One row per (feature count, fold, class); to_csv returns None, so its
    # result is not kept.
    pd.DataFrame.from_dict(
        {
            (n_feat, fold, label): metrics
            for n_feat, per_fold in cer_sample_result.items()
            for fold, per_label in per_fold.items()
            for label, metrics in per_label.items()
        },
        orient='index',
    ).to_csv('./result_sample_rate_' + str(sample_rate) + '.csv')

In [None]:
# Flatten the nested results into one row per
# (sample rate, feature count, fold, class) and write them to a single CSV.
pd.DataFrame.from_dict(
    {
        (rate, n_feat, fold, label): metrics
        for rate, per_feat in total_result.items()
        for n_feat, per_fold in per_feat.items()
        for fold, per_label in per_fold.items()
        for label, metrics in per_label.items()
    },
    orient='index',
).to_csv('./total_result_kdd99.csv')