# Do some imports

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.utils.data
import torchvision.transforms as transforms
from torchvision import datasets
from torch.utils.data import DataLoader, Dataset

#needed to create the Neural Network
import torch.nn as nn
import torch.nn.functional as F

#needed to preprocess the dataset
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from dataloader import UNSW_NB15

#general
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Load the TensorBoard notebook extension
from torch.utils.tensorboard import SummaryWriter

### Inspired by [this github file](https://github.com/alik604/cyber-security/blob/master/Intrusion-Detection/UNSW_NB15%20-%20Torch%20MLP%20and%20autoEncoder.ipynb)

# Get UNSW_NB15 train and test set

In [2]:
#!wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_training-set.csv

In [3]:
#!wget https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/a%20part%20of%20training%20and%20testing%20set/UNSW_NB15_testing-set.csv

# Define the Neural Network class

In [6]:
# define NN architecture
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, hidden_size_2, num_classes):
        super(Net,self).__init__()
       
        self.fc1 = nn.Linear(input_size, hidden_size)     # linear layer (input_size -> hidden_size)       
        self.fc2 = nn.Linear(hidden_size, hidden_size_2)  # linear layer (hidden_size -> hidden_2)       
        self.fc3 = nn.Linear(hidden_size_2, num_classes)  # linear layer (hidden_size_2 -> num_classes)
        
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        
    def forward(self,x):        
        out = self.fc1(x)     #x is the input tensor       
        out = self.relu1(out) #add hidden layer, with relu activation function
        
        out = self.fc2(out)        
        out = self.relu2(out)   # add hidden layer, with relu activation function
        
        out = torch.sigmoid(self.fc3(out)) #sigmoid as we use BCELoss
        
        return out


### Define Train,   Test   and    Display_Loss_Plot    methods

In [7]:
def train(model, device, train_loader, optimizer):
    
    loss_history = list()
    model.train()
    y_true = []
    y_pred = []
    
    for i, data in enumerate(train_loader, 0):
        
        # get the inputs; data is a list of [inputs, target ( or labels)]
        inputs , target = data
        
        optimizer.zero_grad()
        
        #MOVING THE TENSORS TO THE CONFIGURED DEVICE
        inputs, target = inputs.to(device), target.to(device)
        
        #FORWARD PASS
        output = model(inputs.float())
        loss = criterion(output, target)
        
        #BACKWARD AND OPTIMIZE
        
        loss.backward()
        optimizer.step()
        
        # PREDICTIONS
        pred = np.round(output.detach().numpy())
        target = target.float()
        y_true.extend(target.tolist()) 
        y_pred.extend(pred.reshape(-1).tolist())
        
        loss_history.append(loss.item())     
    print("Accuracy on training set is" , accuracy_score(y_true,y_pred))
    return loss_history

In [8]:
#TESTING THE MODEL
def test(model, device, test_loader):
    #model in eval mode skips Dropout etc
    model.eval()
    y_true = []
    y_pred = []
    
    # set the requires_grad flag to false as we are in the test mode
    with torch.no_grad():
        for i in test_loader:
            
            #LOAD THE DATA IN A BATCH
            data,target = i
            
            # moving the tensors to the configured device
            #data, target = data.to(device), target.to(device)
            
            # the model on the data
            output = model(data.float())
                       
            #PREDICTIONS
            pred = np.round(output)
            target = target.float()
            y_true.extend(target.tolist()) 
            y_pred.extend(pred.reshape(-1).tolist())
    
            
    print("Accuracy on test set is" , accuracy_score(y_true,y_pred))
    print("***********************************************************")

In [9]:
def display_loss_plot(losses):
    x_axis = [i for i in range(len(losses))]
    plt.plot(x_axis,losses)
    plt.title('Loss of the model')
    plt.xlabel('iterations')
    plt.ylabel('Cross entropy loss')
    plt.show()

## Define some parameters first

In [10]:
input_size = 56      # 42 for integer encoding 196
hidden_size = 64      # 1st layer number of neurons
hidden_size_2 = 64    # 2nd layer number of neurons
num_classes = 1    # binary classification

num_epochs = 5
BATCH_SIZE_1 = 100 #train_loader as it has  175341  observations
BATCH_SIZE_2 = 50 #test_loader as it has  82332  observations

device = 'cpu'

## Initialize UNSW_NB15 class

In [11]:
#get the train dataframe
train_dataset = UNSW_NB15(file_path ='UNSW_NB15_training-set.csv')
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE_1, shuffle=not False)

#get the test dataframe
test_dataset = UNSW_NB15(file_path ='UNSW_NB15_testing-set.csv')
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE_2, shuffle=not False)


torch.Size([175341, 57])
torch.Size([82332, 57])


## Initialize Neural Network class

In [12]:
model = Net(input_size, hidden_size, hidden_size_2, num_classes)#.to(device)
#model = Net(input_size, hidden_size, num_classes)
print(model)

Net(
  (fc1): Linear(in_features=56, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (relu1): ReLU()
  (relu2): ReLU()
)


## Define loss and optimizer 

In [13]:
criterion = nn.BCELoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))

## Lets Train, Test the model and see the loss

In [14]:
running_loss = []
num_epochs=10
for epoch in tqdm(range(num_epochs)):
        loss_epoch = train(model, device, train_loader, optimizer)
        running_loss.append(loss_epoch)


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Accuracy on training set is 0.6933061862314005
Accuracy on training set is 0.6934886877569992
Accuracy on training set is 0.7198772677240349
Accuracy on training set is 0.7102845312847537
Accuracy on training set is 0.695051357069938
Accuracy on training set is 0.6878767658448395
Accuracy on training set is 0.6894166224670785
Accuracy on training set is 0.6891428701786804
Accuracy on training set is 0.689610530338027
Accuracy on training set is 0.6878767658448395
Accuracy on training set is 0.6842438448508905
Accuracy on training set is 0.6825785184298025
Accuracy on training set is 0.6825899247751525
Accuracy on training set is 0.6826013311205024
Accuracy on training set is 0.6825899247751525
Accuracy on training set is 0.6826013311205024
Accuracy on training set is 0.6826070342931773
Accuracy on training set is 0.6826013311205024
Accuracy on training set is 0.6864908948848244
Accuracy on training set is 0.6926617277191301



In [15]:
print("Done training and testing, showing the loss now...")    
loss_per_epoch = [np.mean(loss_per_epoch) for loss_per_epoch in running_loss]
display_loss_plot(loss_per_epoch)


Done training and testing, showing the loss now...


<IPython.core.display.Javascript object>

In [17]:
test(model,device,test_loader)


Accuracy on test set is 0.6853835689646796
***********************************************************


# Is the model trainable?

Lets check the performance of a Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train = train_loader.dataset.dataframe.drop(columns='label')
y_train = train_loader.dataset.dataframe.label
X_test = test_loader.dataset.dataframe.drop(columns='label')
y_test = test_loader.dataset.dataframe.label

lr = LogisticRegression()
lr.fit(X_train,y_train)


preds = lr.predict(X_test)
accuracy_score(y_test, preds)

### Conclusion
    70% accuracy in test set with Logistic Regression

## Lets try to see which categories matter, therefore should enter the 1hot encoding

In [6]:
ax = sns.countplot(data=train_loader.dataset.dataframe, x = "proto", hue="label")
ax

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x151a2bd0d90>

In [7]:
ax1 = sns.countplot(data=train_loader.dataset.dataframe, x = "state", hue="label")
ax1

<matplotlib.axes._subplots.AxesSubplot at 0x151a2bd0d90>

In [8]:
ax2 = sns.countplot(data=train_loader.dataset.dataframe, x = "service", hue="label")
ax2

<matplotlib.axes._subplots.AxesSubplot at 0x151a2bd0d90>

### Recreate this paper's results
    https://www.researchgate.net/publication/332100759_Intrusion_Detection_Using_Big_Data_and_Deep_Learning_Techniques