# Do some imports

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.utils.data
import torchvision.transforms as transforms
from torchvision import datasets
from torch.utils.data import DataLoader, Dataset

#needed to create the Neural Network
import torch.nn as nn
import torch.nn.functional as F

#needed to preprocess the dataset
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from dataloader import UNSW_NB15

# Raise pandas' display limits so wide/long frames are rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# Load the TensorBoard notebook extension
from torch.utils.tensorboard import SummaryWriter

### Inspired by [this github file](https://github.com/alik604/cyber-security/blob/master/Intrusion-Detection/UNSW_NB15%20-%20Torch%20MLP%20and%20autoEncoder.ipynb)

# Get UNSW_NB15 train and test set

In [2]:
# Uncomment to download UNSW_NB15_training-set.csv (the file the training dataset is built from) into the working directory.
#!wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_training-set.csv

In [3]:
# Uncomment to download UNSW_NB15_testing-set.csv (the file the test dataset is built from) into the working directory.
#!wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_testing-set.csv

# Define the Neural Network class

In [50]:
# define NN architecture
class Net(nn.Module):
    """Small fully connected network with a single sigmoid output.

    Layer stack:
    Linear(input_size, 50) -> ReLU -> Dropout(0.2) -> Linear(50, 100)
    -> PReLU -> Linear(100, 1) -> Sigmoid.
    """

    def __init__(self, input_size):
        """Create the layers; ``input_size`` is the width of the first Linear layer."""
        super().__init__()
        # First hidden stage.
        self.fc1 = nn.Linear(input_size, 50)
        self.relu1 = nn.ReLU()
        self.dout = nn.Dropout(0.2)
        # Second hidden stage (PReLU with one learnable slope).
        self.fc2 = nn.Linear(50, 100)
        self.prelu = nn.PReLU(1)
        # Output stage: one unit squashed into [0, 1] by the sigmoid.
        self.out = nn.Linear(100, 1)
        self.out_act = nn.Sigmoid()

    def forward(self, input_):
        """Run ``input_`` through the layer stack and return the sigmoid output."""
        hidden = self.dout(self.relu1(self.fc1(input_)))
        hidden = self.prelu(self.fc2(hidden))
        return self.out_act(self.out(hidden))

### Define the `train`, `test` and `display_loss_plot` functions

In [27]:
def train(model, device, train_loader, optimizer, loss_fn=None):
    """Train ``model`` for one pass over ``train_loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise. It must already live on ``device``; this function
        moves only the batches, not the model.
    device : str or torch.device
        Device each batch is moved to before the forward pass.
    train_loader : iterable
        Yields ``(inputs, target)`` batches.
    optimizer : torch.optim.Optimizer
        Optimizer that updates ``model``'s parameters.
    loss_fn : callable, optional
        Loss applied as ``loss_fn(output, target)``. When omitted, the
        notebook-level ``criterion`` is used, so that variable must be defined
        before calling.

    Returns
    -------
    list of float
        The loss value of every batch, in iteration order.

    Side effects
    ------------
    Prints the accuracy of the rounded predictions collected over the pass.
    """
    # Fall back to the notebook-level loss so existing four-argument calls keep working.
    loss_fn = criterion if loss_fn is None else loss_fn

    loss_history = []
    y_true = []
    y_pred = []

    model.train()
    for inputs, target in train_loader:
        # Honour the `device` argument (it used to be ignored).
        inputs = inputs.to(device).float()
        target = target.to(device)

        optimizer.zero_grad()

        # Forward pass
        output = model(inputs)
        loss = loss_fn(output, target)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Round the outputs to hard 0/1 predictions; bring everything back to the
        # CPU and flatten so labels and predictions are plain 1-D lists.
        pred = torch.round(output.detach()).cpu()
        y_true.extend(target.detach().float().cpu().reshape(-1).tolist())
        y_pred.extend(pred.reshape(-1).tolist())

        loss_history.append(loss.item())

    print("Accuracy on training set is" , accuracy_score(y_true,y_pred))
    return loss_history

In [54]:
#TESTING THE MODEL
def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print its accuracy.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate. It must already live on ``device``; this function
        moves only the batches, not the model.
    device : str or torch.device
        Device each input batch is moved to before the forward pass.
    test_loader : iterable
        Yields ``(data, target)`` batches.

    Returns
    -------
    None
        The accuracy of the rounded predictions is printed, followed by a
        separator line.
    """
    # eval mode disables Dropout
    model.eval()
    y_true = []
    y_pred = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            # Honour the `device` argument (it used to be ignored).
            data = data.to(device).float()

            output = model(data)

            # Round with torch (as a tensor op) instead of handing a tensor to
            # NumPy, then move to the CPU and flatten to plain 1-D lists.
            pred = torch.round(output).cpu()
            y_true.extend(target.float().reshape(-1).tolist())
            y_pred.extend(pred.reshape(-1).tolist())

    print("Accuracy on test set is" , accuracy_score(y_true,y_pred))
    print("***********************************************************")

In [55]:
def display_loss_plot(losses):
    """Plot ``losses`` against their position index using pyplot and show the figure."""
    iterations = list(range(len(losses)))
    plt.plot(iterations, losses)
    plt.xlabel('iterations')
    plt.ylabel('Cross entropy loss')
    plt.title('Loss of the model')
    plt.show()

## Define some parameters first

In [56]:
# Width of the network's first layer. The datasets loaded in this notebook report
# 57 columns, presumably 56 features plus the label (verify against the dataloader).
# Author's note on other encodings: 42 for integer encoding; 196 (presumably full
# one-hot encoding, TODO confirm).
input_size = 56
# NOTE: the three values below are not passed to Net as defined in this notebook,
# which hard-codes 50 and 100 hidden units and a single output.
hidden_size = 100      # 1st layer number of neurons
hidden_size_2 = 100    # 2nd layer number of neurons
num_classes = 1        # binary classification

# Number of training epochs; kept in sync with the value the training cell uses.
num_epochs = 20
BATCH_SIZE_1 = 5000  # train_loader batch size (training set has 175341 observations)
BATCH_SIZE_2 = 5000  # test_loader batch size (test set has 82332 observations)

device = 'cpu'

## Initialize UNSW_NB15 class

In [57]:
# Training split: dataset built from the CSV, served in shuffled batches.
train_dataset = UNSW_NB15(file_path='UNSW_NB15_training-set.csv')
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE_1, shuffle=True)

# Test split: same construction, with its own batch size.
test_dataset = UNSW_NB15(file_path='UNSW_NB15_testing-set.csv')
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE_2, shuffle=True)


torch.Size([175341, 57])
torch.Size([82332, 57])


## Initialize Neural Network class

In [66]:
# Instantiate the network for `input_size` features and show its layer summary.
model = Net(input_size=input_size)
print(model)

Net(
  (fc1): Linear(in_features=56, out_features=50, bias=True)
  (relu1): ReLU()
  (dout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=50, out_features=100, bias=True)
  (prelu): PReLU(num_parameters=1)
  (out): Linear(in_features=100, out_features=1, bias=True)
  (out_act): Sigmoid()
)


## Define loss and optimizer 

In [67]:
# Binary cross-entropy on the network's sigmoid output.
criterion = nn.BCELoss()
# Adam over all model parameters (learning rate 1e-3, betas 0.9 / 0.999).
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3, betas=(0.9, 0.999))

## Let's train the model, test it and look at the loss

In [68]:
# NOTE: this assignment replaces any `num_epochs` value set earlier in the notebook.
num_epochs = 20
# One entry per epoch; each entry is the list of per-batch losses returned by train().
running_loss = []
for epoch in tqdm(range(num_epochs)):
    running_loss.append(train(model, device, train_loader, optimizer))


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

Accuracy on training set is 0.328502746077643
Accuracy on training set is 0.521395452290109
Accuracy on training set is 0.7292475804289926
Accuracy on training set is 0.748706805595953
Accuracy on training set is 0.7397471213235923
Accuracy on training set is 0.7447259910688316
Accuracy on training set is 0.7567083568589206
Accuracy on training set is 0.7565885902327465
Accuracy on training set is 0.7573357058531661
Accuracy on training set is 0.7571075789461678
Accuracy on training set is 0.7581341500276604
Accuracy on training set is 0.7577862564944878
Accuracy on training set is 0.7575296137241148
Accuracy on training set is 0.75748969151539
Accuracy on training set is 0.7588128275759806
Accuracy on training set is 0.7568281234850948
Accuracy on training set is 0.7585276689422326
Accuracy on training set is 0.758459230870133
Accuracy on training set is 0.7596112717504748
Accuracy on training set is 0.7603469810255445



In [69]:
print("Done training and testing, showing the loss now...")
# Collapse each epoch's per-batch losses into a single mean value, then plot them.
loss_per_epoch = [np.mean(epoch_losses) for epoch_losses in running_loss]
display_loss_plot(loss_per_epoch)


Done training and testing, showing the loss now...


In [None]:
# Evaluate the trained network on the test loader; the accuracy is printed by test().
test(model=model, device=device, test_loader=test_loader)


# Is the model trainable?

Let's check the performance of a Logistic Regression baseline.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Work on the dataframes held by the two datasets; 'label' is the target column.
train_frame = train_loader.dataset.dataframe
test_frame = test_loader.dataset.dataframe

X_train, y_train = train_frame.drop(columns='label'), train_frame.label
X_test, y_test = test_frame.drop(columns='label'), test_frame.label

# Fit a default-configured logistic regression on the training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Accuracy on the test split is the cell's displayed result.
preds = lr.predict(X_test)
accuracy_score(y_test, preds)

### Conclusion
    Logistic Regression reaches about 70% accuracy on the test set.

## Let's see which categories matter and should therefore be one-hot encoded

In [None]:
# Draw on a fresh figure: with `%matplotlib notebook`, a bare seaborn call would be
# drawn into whichever figure is still active from an earlier cell.
fig_proto, ax = plt.subplots()
sns.countplot(data=train_loader.dataset.dataframe, x="proto", hue="label", ax=ax)
ax.set_title("Observations per proto value, split by label");

In [None]:
# Draw on a fresh figure: with `%matplotlib notebook`, a bare seaborn call would be
# drawn into whichever figure is still active from an earlier cell.
fig_state, ax1 = plt.subplots()
sns.countplot(data=train_loader.dataset.dataframe, x="state", hue="label", ax=ax1)
ax1.set_title("Observations per state value, split by label");

In [None]:
# Draw on a fresh figure: with `%matplotlib notebook`, a bare seaborn call would be
# drawn into whichever figure is still active from an earlier cell.
fig_service, ax2 = plt.subplots()
sns.countplot(data=train_loader.dataset.dataframe, x="service", hue="label", ax=ax2)
ax2.set_title("Observations per service value, split by label");

### Recreate this paper's results
    https://www.researchgate.net/publication/332100759_Intrusion_Detection_Using_Big_Data_and_Deep_Learning_Techniques