# Do some imports

In [1]:
import numpy as np
import pandas as pd
#!pip install --user pandas

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import math

from dataloader import UNSW_NB15

### Inspired by [this github file](https://github.com/alik604/cyber-security/blob/master/Intrusion-Detection/UNSW_NB15%20-%20Torch%20MLP%20and%20autoEncoder.ipynb)

# Get UNSW_NB15 train and test set

In [None]:
#!wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_training-set.csv

In [None]:
#!wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_testing-set.csv

# Define the Neural Network class

In [None]:
# define NN architecture
class Net(nn.Module):
    """Fully connected network with three hidden layers and a sigmoid output.

    Each hidden stage is Linear -> ReLU -> Dropout(p=0.5); the output stage is
    Linear -> Sigmoid. One shared ``nn.Dropout`` module is applied after every
    hidden stage. The three ``BatchNorm1d`` modules are constructed (so they
    appear in ``state_dict``) but ``forward`` does not apply them.
    """

    def __init__(self, input_size, hidden1, hidden2, hidden3, num_classes):
        super().__init__()

        # first hidden stage
        self.fc1 = nn.Linear(input_size, hidden1)
        self.batchnorm1 = nn.BatchNorm1d(hidden1)
        self.relu1 = nn.ReLU()
        self.dout = nn.Dropout(0.5)

        # second hidden stage
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.batchnorm2 = nn.BatchNorm1d(hidden2)
        self.relu2 = nn.ReLU()

        # third hidden stage
        self.fc3 = nn.Linear(hidden2, hidden3)
        self.batchnorm3 = nn.BatchNorm1d(hidden3)
        self.relu3 = nn.ReLU()

        # output stage
        self.out = nn.Linear(hidden3, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch ``x`` to sigmoid activations of the output layer."""
        hidden_stages = (
            (self.fc1, self.relu1),
            (self.fc2, self.relu2),
            (self.fc3, self.relu3),
        )
        signal = x
        for linear, activation in hidden_stages:
            # batch norm is intentionally not applied here (see class docstring)
            signal = self.dout(activation(linear(signal)))
        return self.sigmoid(self.out(signal))

### Define the `train`, `test` and `display_loss_plot` functions

In [None]:
def train(model, device, train_loader, optimizer, criterion):
    """Run one pass over ``train_loader`` and return the per-batch losses.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is moved to ``device`` and put in training mode.
    device : str or torch.device
        Device on which the forward and backward passes run.
    train_loader : iterable
        Yields ``(inputs, target)`` batches.
    optimizer : torch.optim.Optimizer
        Optimizer holding the parameters of ``model``.
    criterion : callable
        Loss invoked as ``criterion(output, target)``.

    Returns
    -------
    list of float
        One loss value per batch, in iteration order.
    """
    model.to(device)
    model.train()
    losses = []

    for inputs, target in train_loader:
        inputs = inputs.to(device).float()
        # Cast labels to float (integer or double labels are rejected by BCELoss)
        # and add a trailing dimension so the target lines up with the model output.
        target = target.to(device).float().unsqueeze(1)

        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, target)

        loss.backward()
        optimizer.step()

        # .item() yields a plain Python float regardless of the device in use
        losses.append(loss.item())

    return losses

In [None]:
#TESTING THE MODEL
def test(model, device, test_loader):
    #model in eval mode skips Dropout etc
    model.eval()
    y_true = []
    y_pred = []
    
    # set the requires_grad flag to false as we are in the test mode
    with torch.no_grad():
        for data in test_loader:
            
            #LOAD THE DATA IN A BATCH
            inputs ,target = data
            
            # the model on the data
            output = model(inputs.float())
                       
            #PREDICTIONS
            pred = np.round(output)
            #pred = output.detach().numpy() > 0.5 
            pred = pred * 1
            target = target.float()
            y_true.extend(target.tolist()) 
            y_pred.extend(pred.reshape(-1).tolist())
        
    return accuracy_score(y_true, y_pred)

In [None]:
def display_loss_plot(losses):
    """Plot ``losses`` against their position in the sequence and show the figure."""
    positions = list(range(len(losses)))
    plt.plot(positions, losses)
    plt.xlabel('iterations')
    plt.ylabel('Cross entropy loss')
    plt.title('Loss of the model')
    plt.show()

# Define some parameters first

In [None]:
# --- where to run ---
device = 'cpu'

# --- network shape ---
# 196 input features; the author's note says 42 when integer encoding is used instead
input_size = 196
# neurons in the first, second and third hidden layer
hidden1, hidden2, hidden3 = 128, 64, 32
# a single output unit: binary classification
num_classes = 1

# --- optimisation (other values the author noted are listed on the right) ---
num_epochs = 1000  # 500, 1000, 100, 100
batch_size = 500   # 100, 1000, 100, 500
lr = 0.01          # 0.01, 0.01, 0.005, 0.1

# Initialize Neural Network class

In [None]:
# Build the classifier from the layer sizes configured in the parameters cell.
model = Net(
    input_size=input_size,
    hidden1=hidden1,
    hidden2=hidden2,
    hidden3=hidden3,
    num_classes=num_classes,
)

# Define loss and optimizer 

In [None]:
# Binary cross-entropy, computed on the sigmoid outputs of the network.
criterion = nn.BCELoss()
# Adam optimizer with the momentum coefficients spelled out explicitly.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    betas=(0.9, 0.999),
    lr=lr,
)

# Initialize UNSW_NB15 class

In [None]:
# Training data: the CSV is used as-is, it is not split into train and validation parts.
train_dataset = UNSW_NB15(file_path='UNSW_NB15_training-set.csv')
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

# Test data, loaded from its own CSV with the same batching settings.
test_dataset = UNSW_NB15(file_path='UNSW_NB15_testing-set.csv')
test_loader = DataLoader(
    test_dataset,
    shuffle=True,
    batch_size=batch_size,
)


# Let's train and test the model and look at the loss

In [None]:
# running_loss collects one list of per-batch losses for every epoch.
running_loss = []
for epoch in tqdm(range(num_epochs)):
    running_loss.append(
        train(model, device, train_loader, optimizer, criterion)
    )

In [None]:
# Accuracy of the trained model on the test loader (shown as the cell output).
test(model=model, device=device, test_loader=test_loader)

In [None]:
# Reduce every epoch's batch losses to their mean, then plot one point per epoch.
loss_per_epoch = [np.mean(epoch_losses) for epoch_losses in running_loss]
display_loss_plot(loss_per_epoch)

**********************************************************************************************************************

# Results

    num_epochs = 200
    batch_size = 8000 
    lr = 0.001
    accuracy on test set = 0.7572389836272653
<img src="data/loss_function_75.723898_acc.PNG">

# Quantization


In [3]:
# Load both CSV files as DataFrames; the bit-encoding cell further down concatenates them.
# NOTE(review): these assignments rebind the names `train` and `test`, which earlier
# cells define as functions. After this cell has run, the training / evaluation cells
# cannot be re-run until the function cells are executed again.
train = pd.read_csv('UNSW_NB15_training-set.csv')
test = pd.read_csv('UNSW_NB15_testing-set.csv')


In [2]:
from sklearn import preprocessing
def integer_encoding(df_):
    """Return a copy of ``df_`` with its object-dtype columns integer-encoded.

    Every column selected by ``select_dtypes('object')`` is replaced by the
    result of ``LabelEncoder.fit_transform`` on that column; all other columns
    are left untouched. The input frame itself is not modified.

    Parameters
    ----------
    df_ : pd.DataFrame
        Frame to encode.

    Returns
    -------
    pd.DataFrame
        Encoded copy of ``df_``.
    """
    df = df_.copy()
    # a single encoder is reused; fit_transform refits it for every column
    le = preprocessing.LabelEncoder()

    for column in df.select_dtypes('object').columns:
        df[column] = le.fit_transform(df[column])
    return df

In [4]:
# adapted from here https://stackoverflow.com/questions/51471097/vectorized-conversion-of-decimal-integer-array-to-binary-array-in-numpy
def dec2bin(column: pd.Series, number_of_bits: int, left_msb:bool= True )-> pd.Series: 
    """Turn every integer of ``column`` into a fixed-width base-2 string.

    Adapted from
    https://stackoverflow.com/questions/51471097/vectorized-conversion-of-decimal-integer-array-to-binary-array-in-numpy

    NOTE: a later cell of this notebook defines ``dec2bin`` again; once that
    cell has run, the later definition is the one in effect.

    Parameters
    ----------
    column : pd.Series
        Integers to convert.
    number_of_bits : int
        Width passed to ``np.binary_repr``; shorter representations are
        zero-padded to this width.
    left_msb : bool
        When False the string is exactly what ``np.binary_repr`` returns.
        When True that string is reversed before being returned.

    Returns
    -------
    numpy.ndarray
        Array of strings, one per element of ``column``.
    """

    def reversed_binary_repr(number, nbits):
        # np.binary_repr output, read back to front
        return np.binary_repr(number, nbits)[::-1]

    if left_msb:
        converter = reversed_binary_repr
    else:
        converter = np.binary_repr

    vectorized_converter = np.vectorize(converter)
    return vectorized_converter(column.values, number_of_bits)

In [5]:
# adapted from here https://stackoverflow.com/questions/51471097/vectorized-conversion-of-decimal-integer-array-to-binary-array-in-numpy
def dec2bin(column: pd.Series, number_of_bits: int, left_msb: bool = True) -> np.ndarray:
    """Convert each integer of ``column`` to a space-separated string of bits.

    Adapted from
    https://stackoverflow.com/questions/51471097/vectorized-conversion-of-decimal-integer-array-to-binary-array-in-numpy

    Parameters
    ----------
    column : pd.Series
        Integers to convert.
    number_of_bits : int
        Width passed to ``np.binary_repr``; shorter representations are
        zero-padded to this width. It should be at least the number of bits
        needed for the largest value in ``column``.
    left_msb : bool
        When False, the bits appear in the order ``np.binary_repr`` produces
        them (most significant bit first). When True, that order is reversed.
        NOTE(review): this is the behaviour the existing lambdas implement and
        the notebook's caller (which passes ``left_msb=False``) relies on, but
        it reads as the opposite of what the parameter name suggests - confirm
        before renaming or flipping it.

    Returns
    -------
    numpy.ndarray
        1-D array of strings such as ``'0 1 0 1'``, one per element of
        ``column``; an empty string array for an empty ``column``.
    """
    width = int(number_of_bits)

    bit_strings = []
    # .tolist() hands plain Python scalars to np.binary_repr
    for value in column.to_numpy().tolist():
        bits = np.binary_repr(value, width)
        if left_msb:
            bits = bits[::-1]
        bit_strings.append(" ".join(bits))

    # explicit dtype keeps an empty result a string array as well
    return np.array(bit_strings, dtype=str)

In [9]:
# Stack the two CSV frames (training file first, then testing file).
df = pd.concat([train, test])
# Columns that are not bit-encoded; this matches the reference script and might need to change.
skip_cols = ['id', 'attack_cat']


def get_min_positive_number(vector):
    """Return the smallest strictly positive value of ``vector`` (NaN if there is none)."""
    return vector[vector > 0].min()


def get_min_bits(vector):
    """Return the number of bits needed to represent the largest value of ``vector``."""
    return math.ceil(math.log2(float(vector.max()) + 1))


# integer-encode the string columns first, as the reference script does
df = integer_encoding(df)

# one (n_rows, 1) array of bit strings per encoded column
binary_columns = []
for column in df.columns:
    if column in skip_cols:
        continue

    # Scale fractional columns so their smallest positive value becomes 1,
    # which turns them into (approximately) whole numbers before the integer cast.
    scale = 1 / get_min_positive_number(df[column])
    if scale > 1:
        df[column] = df[column] * scale

    maxbits = get_min_bits(df[column])

    # NOTE(review): values above 2**32 - 1 do not fit in uint32, although maxbits
    # can exceed 32 - confirm how such columns should be handled.
    binary_vector = dec2bin(df[column].astype(np.uint32), maxbits, left_msb=False)
    binary_columns.append(binary_vector.reshape((-1, 1)))

# final matrix of bit vectors, stacked once instead of inside the loop
binary_matrix = np.hstack(binary_columns)

# NOTE(review): unlike the reference MATLAB script, the rows are not shuffled
# before splitting, so the splits follow the original file order.
n_rows = binary_matrix.shape[0]
id_ = round((2 * n_rows) / 3)
id6 = round((2 * n_rows) / 3 / 6)

# 0-based, contiguous splits: no row is dropped at the boundaries
infeat_train = binary_matrix[:id_ - id6, :]
infeat_valid = binary_matrix[id_ - id6:id_, :]
infeat_test = binary_matrix[id_:, :]

# keep the first tenth of every split
infeat_train = infeat_train[:round(infeat_train.shape[0] / 10), :]
infeat_valid = infeat_valid[:round(infeat_valid.shape[0] / 10), :]
infeat_test = infeat_test[:round(infeat_test.shape[0] / 10), :]

# each file receives its own split
np.savetxt("fds_unswb15_train.txt", infeat_train, fmt="%s", delimiter=" ")
np.savetxt('fds_unswb15_valid.txt', infeat_valid, fmt="%s", delimiter=" ")
np.savetxt('fds_unswb15_test.txt', infeat_test, fmt="%s", delimiter=" ")

In [11]:
# First row of the encoded matrix: one space-separated bit string per encoded column.
binary_matrix[0]

array(['0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 1 0',
       '0 1 1 1 0 0 0 1', '0 0 0 0', '0 1 0 0',
       '0 0 0 0 0 0 0 0 0 0 0 1 1 0', '0 0 0 0 0 0 0 0 0 0 0 1 0 0',
       '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0',
       '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0',
       '0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 1 1 0',
       '1 1 1 1 1 1 0 0', '1 1 1 1 1 1 1 0',
       '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 1 1 1 1 0 0 0',
       '0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 1 1 1',
       '0 0 0 0 0 0 0 0 0 0 0 0 0', '0 0 0 0 0 0 0 0 0 0 0 0 0',
       '0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 0 1 1 1',
       '0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 1',
       '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 1 0 0 0 0 1',
       '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1 1 1',
       '1 1 1 1 1 1 1 1',
       '0 0 1 0 0 1 0 1 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 0 0',
       '1 0 0 0 0 0 1 1 0 1

In [12]:
# (number of rows, number of encoded columns) of the bit-string matrix
binary_matrix.shape

(257673, 43)

# Differences between the MATLAB script and what I have:
    Order of the train and test files: I did not swap them the way they do. They shuffle everything afterwards, so the order doesn't matter.
    Putting a space between every bit takes up a lot of time.
    The output files are different because they shuffle everything.
    I dropped the attack_cat and id columns, like they did; I am not sure whether the label column also needs to be dropped.
    

In [None]:
# Reference MATLAB script that the Python quantization cells above are modelled on.
# It reads both CSV files, converts every column except "id" and "attack_cat" to a
# fixed-width binary representation, shuffles the rows, splits them into
# train / valid / test, keeps the first tenth of each split and writes three text files.
# NOTE(review): comments here start with '#'; in MATLAB the comment character is '%'.
# It looks like '%' was replaced by '#' throughout this listing (see the fprintf
# format strings below) - confirm before running it.

#string array with the 2 file names
s = ["UNSW_NB15_testing-set.csv", "UNSW_NB15_training-set.csv"]; #The names are the other way around

opts = detectImportOptions(char(s(1)));
T0 = readtable(char(s(1)),opts);
opts = detectImportOptions(char(s(2)));
T1 = readtable(char(s(2)),opts);
T = [T0; T1];
#T is a table holding both the test set and the train set
binfeat = [];
for i = 1:size(T,2)
    if i == 1 || i == (size(T,2)-1) ## for columns "id" and "attack_cat" do nothing
        continue;
    end
    
    M = table2array(T(:,i)); # M is column i (neither "id" nor "attack_cat")
    
    if iscell(M)  #check if M is a cell array (presumably a text column); a numeric column is not a cell array
        ue = string(unique(M));
        # map every distinct string to an integer code 0..length(ue)-1
        # NOTE(review): new_M is not reset between columns - confirm this is intended
        for j = 1:length(ue)
            new_M(strcmp(string(M),(ue(j)))) = j-1;
        end
        M = new_M;
    end
    
    m = min(M(M~=0)); #m = minimum value of column M, ignoring zeros
    m = 1/m;    
    if m > 1
        M = M*m; #scale all column values, e.g. by 1000000 when the smallest nonzero value is 0.000001
    end
    
    #max(M): maximum value of the column
    #double(): cast to double
    #log2(): base-2 logarithm of the maximum plus 1 (don't forget the +1)
    #ceil(): round up; the result b is the number of bits for this column
    #   (e.g. a maximum of about 6e7 gives b = 26)
    #NOTE(review): the original comment here described de2bi() with the most significant
    #digit leftmost and b output columns, but the code calls dec2bin(). dec2bin presumably
    #returns characters rather than numbers, which matters for the numeric printing
    #further down - confirm which function is intended.
    b = ceil(log2(double(max(M))+1));# number of bits for this column
    feat = dec2bin(uint32(M),b); 
    binfeat = [binfeat feat];
end

binfeat = binfeat(randperm(length(binfeat)),:); # shuffle the rows
pause(2.0);
# split points: the first 2/3 of the rows are train+valid, and 1/6 of that part is valid
id  = round(2*length(binfeat)/3);
id6 = round(2*length(binfeat)/3/6);
infeat_train = binfeat(1:id-id6,:);
infeat_valid = binfeat((id-id6+1):id,:);
infeat_test  = binfeat(id+1:end,:);

# keep only the first tenth of every split
infeat_train = infeat_train(1:round(size(infeat_train,1)/10),:);
infeat_valid = infeat_valid(1:round(size(infeat_valid,1)/10),:);
infeat_test  = infeat_test(1:round(size(infeat_test,1)/10),:);

# [total, train, valid, test] sizes; no trailing semicolon, so the value is displayed
dataset_size = [length(infeat_train)+length(infeat_valid)+length(infeat_test) length(infeat_train) length(infeat_valid) length(infeat_test)]

# Write the train split: one row per line, every element followed by a space.
# NOTE(review): the format string '#d ' is presumably a mangled '%d ' - confirm.
fileID = fopen('fds_unswb15_train.txt','w');
for i = 1:size(infeat_train,1)
    i/size(infeat_train,1)
    for j = 1:size(infeat_train,2)
        fprintf(fileID,'#d ',infeat_train(i,j));
    end
    fprintf(fileID,"\n");
end
fclose(fileID);

# Write the valid split in the same format (same '#d ' caveat as above).
fileID = fopen('fds_unswb15_valid.txt','w');
for i = 1:size(infeat_valid,1)
    i/size(infeat_valid,1)
    for j = 1:size(infeat_valid,2)
        fprintf(fileID,'#d ',infeat_valid(i,j));
    end
    fprintf(fileID,"\n");
end
fclose(fileID);

# Write the test split in the same format (same '#d ' caveat as above).
fileID = fopen('fds_unswb15_test.txt','w');
for i = 1:size(infeat_test,1)
    i/size(infeat_test,1)
    for j = 1:size(infeat_test,2)
        fprintf(fileID,'#d ',infeat_test(i,j));
    end
    fprintf(fileID,"\n");
end
fclose(fileID);
