# Do some imports

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.utils.data
import torchvision.transforms as transforms
from torchvision import datasets

from sklearn import preprocessing

#needed to create the Neural Network
import torch.nn as nn
import torch.nn.functional as F

#general
pd.set_option('display.max_columns', 500)

### Inspired by [this github file](https://github.com/alik604/cyber-security/blob/master/Intrusion-Detection/UNSW_NB15%20-%20Torch%20MLP%20and%20autoEncoder.ipynb)

# Get UNSW_NB15 train and test set

In [2]:
#!wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_training-set.csv

In [3]:
#!wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_testing-set.csv

# Define UNSW_NB15 class

In [4]:
class UNSW_NB15(torch.utils.data.Dataset):
    def __init__(self, file_path, sequence_length=25, transform=None):
        #TODO have a sequence_overlap=True flag? Does overlap matter?
        self.transform = transform
        self.sequence_length = sequence_length
        self.columns = ['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label']
        self.dtypes = dtypes = {"id":"int32",
                                "scrip": "string",
                                #"sport": "int32",
                                "dstip": "string",
                                #"dsport": "int32",
                                "proto": "string",
                                "state": "string",
                                "dur": "float64",
                                "sbytes": "int32",
                                "dbytes": "int32",
                                "sttl": "int32",
                                "dttl": "int32",
                                "sloss": "int32",
                                "dloss": "int32",
                                "service": "string",
                                "sload": "float64",
                                "dload": "float64",
                                "spkts": "int32",
                                "dpkts": "int32",
                                "swin": "int32",
                                "dwin": "int32",
                                "stcpb": "int32",
                                "dtcpb": "int32", 
                                #"smeansz": "int32",
                                #"dmeansz": "int32",
                                "trans_depth": "int32",
                                #"res_bdy_len": "int32",
                                "sjit": "float64",
                                "djit": "float64",
                                #"stime": "int64",
                                #"ltime": "int64",
                                #"sintpkt": "float64",
                                #"dintpkt": "float64",
                                "tcprtt": "float64",
                                "synack": "float64",
                                "ackdat": "float64",

                                #commenting these because they have mixed values and we aren't going to generate them anyway
                                #"is_sm_ips_ports": "int32",
                                #"ct_state_ttl": "int32",
                                #"ct_flw_httpd_mthd": "int32",
                                #"is_ftp_login": "int32",
                                #"is_ftp_cmd": "int32",
                                #"ct_ftp_cmd": "int32",
                                #"ct_srv_src": "int32",
                                ##"ct_dst_ltm": "int32", 
                                #"ct_src_ltm": "int32",
                                #"ct_src_dport_ltm": "int32",
                                #"ct_dst_sport_ltm": "int32",
                                #"ct_dst_src_ltm": "int32",
                                "attack_cat": "string",
                                "label": "int32"}
        self.categorical_column_values = {"proto":None, "state":None, "service":None, "attack_cat":None}

        self.dataframe = pd.read_csv(file_path, encoding="latin-1", names=self.columns,header=0, dtype=self.dtypes)
        #self.dataframe.sort_values(by=['stime']) #sort chronologically upon loading
        
        #load all the unique values of categorical features at the start
        #and make these accessible via a fast function call.
        for key in self.categorical_column_values:
            self.categorical_column_values[key] = self.dataframe[key].unique()

        #cache all the maximum values in numeric columns since we'll be using these for feature extraction
        self.maximums = {}
        for key in self.dtypes:
            if "int" in self.dtypes[key] or "float" in self.dtypes[key]:
                self.maximums[key] = max(self.dataframe[key])
        
        #------------------------------------------------
        self.dataframe = self.dataframe.drop(['id'],1)
               
       
        ##------Encoding string columns with value between 0 and n_classes-1----
        le = preprocessing.LabelEncoder()
        self.dataframe['attack_cat'] = le.fit_transform(self.dataframe['attack_cat'])
        self.dataframe['proto'] = le.fit_transform(self.dataframe['proto'])
        self.dataframe['service'] = le.fit_transform(self.dataframe['service'])
        self.dataframe['state'] = le.fit_transform(self.dataframe['state'])
        
        # ----------Normalising all numerical features--------------
        #cols_to_normalise = list(self.dataframe.columns.values)[:39]
        #self.dataframe[cols_to_normalise] = self.dataframe[cols_to_normalise].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
        #self.dataframe[cols_to_normalise] = self.dataframe[cols_to_normalise].apply(lambda x: (x - x.min()) / (x.max() - x.min()))               
        
        #-----------Create pytorch tensor----------------
        self.tensor = torch.Tensor(self.dataframe.values)
        
        
        
    def get_tensor(self):
        return self.tensor
    
    def get_dataframe(self):
        return self.dataframe
    
    def __len__(self):
        return len(self.dataframe.index) - self.sequence_length
    
    def __getitem__(self, index):
        #TODO need error checking for out of bounds?
        #TODO return x,y where y is the category of the example
        #since none corresponds to "normal" data
        
        list_of_dicts = []
        for i in range(index,index+self.sequence_length):
            list_of_dicts.append(self.dataframe.loc[i, :].to_dict())
        
        if self.transform is not None:
            return self.transform(self, list_of_dicts)
        
        return list_of_dicts
    
    #get a list of all the unique labels in the dataset
    def get_labels(self):
        return self.dataframe['label'].unique().tolist()
    
    #get a list of all the unique attack categories in the dataset
    def get_attack_categories(self):
        return self.dataframe['attack_cat'].unique().tolist()
    
    def get_list_of_categories(self, column_name):
        pass #TODO

    #limit the dataset to only examples in the specified category
    def use_only_category(self, category_name):
        if category_name not in self.get_attack_categories():
            return False
        
        new_dataframe = self.dataframe[self.dataframe['attack_cat'] == category_name]
        new_dataframe = new_dataframe.reset_index()
        self.dataframe = new_dataframe
        return True
    
    #limit the dataset to only examples with the specified label
    def use_only_label(self, label):
        if label not in self.get_labels():
            return False
        
        new_dataframe = self.dataframe[self.dataframe['label'] == label]
        new_dataframe = new_dataframe.reset_index()
        self.dataframe = new_dataframe
        return True

# Define the Neural Network class

In [5]:
# define NN architecture
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, hidden_size_2, num_classes):
        super(Net,self).__init__()
        self.input_size = input_size
        # linear layer (input_size -> hidden_size)
        self.fc1 = nn.Linear(input_size, hidden_size)
        # linear layer (hidden_size -> hidden_2)
        self.fc2 = nn.Linear(hidden_size, hidden_size_2)
        # linear layer (hidden_size_2 -> num_classes)
        self.fc3 = nn.Linear(hidden_size_2, num_classes)
        # dropout layer (p=0.2)
        # dropout prevents overfitting of data
        self.droput = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.elu = nn.ELU()
        
    def forward(self,x):
        #x is the input tensor
        out = self.fc1(x)
        #add hidden layer, with relu activation function
        out = self.relu(out)
        out = self.fc2(out)
        # add hidden layer, with relu activation function
        out = self.relu(out)
        # add dropout instead of relu or not..?
        #out = self.droput(out)
        out = self.fc3(out)
        return out


## Initialize UNSW_NB15 class

In [31]:
unsw_nb15_training = UNSW_NB15(file_path ='UNSW_NB15_training-set.csv')
training = unsw_nb15_training.get_dataframe()
x_training = training.iloc[:, 0:-1].values
y_training = training.iloc[:, -1].values # Last two (or 1 ?) columns are categories and labels


unsw_nb15_testing = UNSW_NB15(file_path ='UNSW_NB15_testing-set.csv')
testing = unsw_nb15_testing.get_dataframe()
x_testing = testing.iloc[:, 0:-1].values
y_testing = testing.iloc[:, -1].values # Last two (or 1 ?) columns are categories and labels

training.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,0.121478,113,0,2,6,4,258,172,74.08749,252,254,14158.94238,8495.365234,0,0,24.2956,8.375,30.177547,11.830604,255,621772692,-2092433665,255,0.0,0.0,0.0,43,43,0,0,1,0,1,1,1,1,0,0,0,1,1,0,6,0
1,0.649902,113,0,2,14,38,734,42014,78.473372,62,252,8395.112305,503571.3125,2,17,49.915,15.432865,61.426934,1387.77833,255,1417884146,-1217579325,255,0.0,0.0,0.0,52,1106,0,0,43,1,1,1,1,2,0,0,0,1,6,0,6,0
2,1.623129,113,0,2,8,16,364,13186,14.170161,62,252,1572.271851,60929.23047,1,6,231.875571,102.737203,17179.58686,11420.92623,255,2116150707,-1331852323,255,0.111897,0.061458,0.050439,46,824,0,0,7,1,2,1,1,3,0,0,0,2,6,0,6,0
3,1.681642,113,3,2,12,12,628,770,13.677108,62,252,2740.178955,3358.62207,1,3,152.876547,90.235726,259.080172,4991.784669,255,1107119177,1047442890,255,0.0,0.0,0.0,52,64,0,0,1,1,2,1,1,3,1,1,0,2,1,0,6,0
4,0.449454,113,0,2,10,6,534,268,33.373826,254,252,8561.499023,3987.059814,2,1,47.750333,75.659602,2415.837634,115.807,255,-1858829747,1977154190,255,0.128381,0.071147,0.057234,53,45,0,0,43,1,2,2,1,40,0,0,0,2,39,0,6,0


## Define some parameters first

In [23]:
input_size = 43
hidden_size = 64      # 1st layer number of neurons
hidden_size_2 = 64    # 2nd layer number of neurons
num_classes = 10      # There are 9 different types of malicious packets + Normal

num_epochs = 40
batch_size = 32
learning_rate = 0.001

n_total_steps = len(x_training)

device = 'cpu'

## Initialize Neural Network class

In [19]:
model = Net(input_size, hidden_size, hidden_size_2, num_classes).to(device)
print(model)

Net(
  (fc1): Linear(in_features=43, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=10, bias=True)
  (droput): Dropout(p=0.2)
  (relu): ReLU()
  (elu): ELU(alpha=1.0)
)


## Define loss and optimizer 

In [20]:
criterion = nn.CrossEntropyLoss() # This criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

## Train the model

In [24]:
for epoch in range(num_epochs):
   
    for i in range(0, x_training.shape[0], batch_size):


        x = torch.as_tensor(x_training[i:i+batch_size], dtype=torch.float).to(device)
        y = torch.as_tensor(y_training[i:i+batch_size], dtype=torch.long).to(device)
        
        outputs = model(x)
        loss = criterion(outputs, y)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

Epoch [10/40], Step [175329/175341], Loss: 0.0527
Epoch [20/40], Step [175329/175341], Loss: 0.0519
Epoch [30/40], Step [175329/175341], Loss: 0.0528
Epoch [40/40], Step [175329/175341], Loss: 0.0528


# Test the model

In [29]:

# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
    n_correct = 0
    n_samples = 0  
    for i in range(0, x_testing.shape[0], batch_size):
        x = torch.as_tensor(x_testing[i:i+batch_size], dtype=torch.float).to(device)
        y = torch.as_tensor(y_testing[i:i+batch_size], dtype=torch.long).to(device)
        
        outputs = model(x)
        if len(outputs.data) > 0:
            # max returns (value ,index)
            _, predicted = torch.max(outputs.data, dim=1)
            n_samples += y.size(0)
            n_correct += (predicted == y).sum().item()
        else:
            print("what???")
            print(x, outputs.data)
    acc = 100.0 * n_correct / (n_samples+1)
    print(f'Accuracy of the network: {acc} %')

Accuracy of the network: 55.134636172616084 %


# Try with an auto-encoder ?