In [None]:
import torch
import torch.utils.data
import torchvision.transforms as transforms
from torchvision import datasets
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder


import matplotlib.pyplot as plt
%matplotlib inline
%matplotlib notebook

import numpy as np
import pandas as pd

#needed to preprocess the dataset
from sklearn import preprocessing
from sklearn.metrics import accuracy_score



from tqdm.notebook import tqdm
import itertools

#general
# Let pandas render up to 500 columns / 500 rows before truncating the display.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)



# NOTE: the triple-quoted string below is never executed. It holds a disabled
# snippet that builds a synthetic two-class dataset: two 1000x50 blobs drawn
# with torch.randn (the second shifted by +1.5), labelled 0 and 1.
"""
X1 = torch.randn(1000, 50)
X2 = torch.randn(1000, 50) + 1.5
X = torch.cat([X1, X2], dim=0)
Y1 = torch.zeros(1000, 1)
Y2 = torch.ones(1000, 1)
Y = torch.cat([Y1, Y2], dim=0)
print(X.size())
print(Y.size()) 
"""

In [None]:
def one_hot_encoding_select_categ(df):
    """One-hot encode selected categories of the proto, service and state columns.

    Only the categories listed in ``selected_categories`` get an indicator
    column (the original author picked them as the most influential ones from
    a seaborn countplot). Any other value, including a missing one, is encoded
    as all zeros for that column group, which mirrors
    ``OneHotEncoder(handle_unknown='ignore')``.

    The encoding is done with plain pandas comparisons, so it does not depend
    on the scikit-learn version (the ``sparse`` keyword was renamed to
    ``sparse_output``) and it keeps the index of ``df``. Building the
    indicator frame with a fresh RangeIndex would misalign rows in the
    column-wise concat whenever ``df`` is indexed differently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``proto``, ``service`` and ``state`` columns.
        It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three string columns removed and the float
        (0.0 / 1.0) indicator columns prepended. The ``state`` indicators
        come first, then ``service``, then ``proto``, then the remaining
        original columns.

    Raises
    ------
    KeyError
        If one of the three expected columns is missing from ``df``.
    """
    selected_categories = {
        "proto": ['tcp', 'udp', 'arp', 'ospf'],
        "service": ['-', 'ftp', 'smtp', 'snmp', 'http', 'ftp-data', 'dns', 'ssh'],
        "state": ['FIN', 'INT', 'CON', 'ECO', 'REQ'],
    }

    dataframe = df.copy()
    for column, categories in selected_categories.items():
        # One 0.0/1.0 column per selected category, aligned on the input index.
        indicators = pd.DataFrame(
            {category: (dataframe[column] == category).astype(float)
             for category in categories},
            index=dataframe.index,
        )
        # Newly encoded columns are placed in front of the existing ones.
        dataframe = pd.concat([indicators, dataframe], axis=1, sort=False)

    # Delete the raw proto, service and state columns (keyword form works on
    # every pandas version; the positional axis argument was removed in 2.0).
    return dataframe.drop(columns=list(selected_categories))

In [None]:
# Load the training CSV, discard the 'attack_cat' and 'id' columns and one-hot
# encode the categorical features.
df = one_hot_encoding_select_categ(
    pd.read_csv('UNSW_NB15_training-set.csv').drop(columns=['attack_cat', 'id'])
)
# The first 35000 rows become the validation split, the remainder is used for training.
df_val = df.iloc[:35000, :]
df_train = df.iloc[35000:, :]
# The held-out test file goes through the same preprocessing.
df_test = one_hot_encoding_select_categ(
    pd.read_csv('UNSW_NB15_testing-set.csv').drop(columns=['attack_cat', 'id'])
)

In [3]:
# Shape of the training split as (rows, columns); the column count still includes 'label'.
df_train.shape

(140341, 40)

In [4]:
# Shuffle the training rows. A fixed random_state keeps the row order (and
# therefore the composition of every mini-batch) reproducible across kernel
# restarts; without it each run trains on a different ordering.
df_train = df_train.sample(frac=1, random_state=42)

# Separate the features from the 'label' target column for each split.
X_train = df_train.drop(columns='label')
y_train = df_train.label

X_val = df_val.drop(columns='label')
y_val = df_val.label

X_test = df_test.drop(columns='label')
y_test = df_test.label

Convert to torch tensors

In [5]:
# Turn every split into float32 tensors. Feature frames keep their 2-D layout;
# label series gain a trailing axis so they become (n_samples, 1) column vectors.
X_train = torch.tensor(X_train.values.astype('float'), dtype=torch.float32)
y_train = torch.tensor(y_train.values.astype('float'), dtype=torch.float32).unsqueeze(1)

X_val = torch.tensor(X_val.values.astype('float'), dtype=torch.float32)
y_val = torch.tensor(y_val.values.astype('float'), dtype=torch.float32).unsqueeze(1)

X_test = torch.tensor(X_test.values.astype('float'), dtype=torch.float32)
y_test = torch.tensor(y_test.values.astype('float'), dtype=torch.float32).unsqueeze(1)

# Model

In [6]:
# define NN architecture
class Net(nn.Module):
    """Fully connected binary classifier: input -> 128 -> 64 -> 32 -> 1.

    Each hidden layer is followed by ReLU and dropout. The output layer is
    followed by a sigmoid, so ``forward`` returns one value in [0, 1] per
    sample.

    Parameters
    ----------
    input_size : int
        Number of input features per sample.
    dropout : float, optional
        Dropout probability applied after every hidden layer (default 0.5,
        the value that used to be hard-coded).
    """

    def __init__(self, input_size, dropout=0.5):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu1 = nn.ReLU()
        # Dropout has no parameters, so one module is shared by all hidden
        # layers. (The attribute used to be re-assigned three times, which
        # only ever kept a single module anyway.)
        self.dout = nn.Dropout(dropout)

        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(64, 32)
        self.relu3 = nn.ReLU()

        self.out = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of the network for the batch ``x``."""
        h1 = self.dout(self.relu1(self.fc1(x)))
        h2 = self.dout(self.relu2(self.fc2(h1)))
        h3 = self.dout(self.relu3(self.fc3(h2)))
        return self.sigmoid(self.out(h3))

In [7]:
def train_epoch(model, opt, criterion, batch_size, X=None, y=None):
    """Train ``model`` for one pass over the data in consecutive mini-batches.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train. It is put into training mode and is the network
        actually used for the forward pass (previously the notebook-level
        ``net`` was called regardless of this argument).
    opt : torch.optim.Optimizer
        Optimizer that updates the parameters of ``model``.
    criterion : callable
        Loss function called as ``criterion(y_hat, y_batch)``.
    batch_size : int
        Number of rows per mini-batch; the last batch may be smaller.
    X, y : torch.Tensor, optional
        Features and targets to train on. When omitted, the notebook-level
        ``X_train`` / ``y_train`` tensors are used, which keeps existing
        four-argument calls working.

    Returns
    -------
    list of float
        Loss value of every mini-batch, in processing order.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    model.train()
    losses = []
    for start in range(0, X.size(0), batch_size):
        stop = start + batch_size
        x_batch = X[start:stop]
        y_batch = y[start:stop]

        opt.zero_grad()
        # (1) Forward
        y_hat = model(x_batch)
        # (2) Compute diff
        loss = criterion(y_hat, y_batch)
        # (3) Compute gradients
        loss.backward()
        # (4) Update weights
        opt.step()
        # .item() yields a plain Python float, detached from the graph.
        losses.append(loss.item())
    return losses

In [8]:
def test(X_test,y_test):
    y_pred = net(X_test)
    y_pred = y_pred.detach().numpy().flatten()
    y_pred = y_pred == 1
    y_true = y_test.numpy().flatten(); y_true = y_true == 1
    return accuracy_score(y_true, y_pred)

# Create dataframe with parameters to be tried out

In [11]:
# Hyper-parameter grid: every combination of epochs x batch size x learning rate.
param_grid = {
    'epochs': [100, 500, 1_000],
    'batch': [10_000, 5_000, 1_000, 500, 100],
    'lr': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
}

# Build all rows in one go. Growing the frame row by row with DataFrame.append
# is quadratic, and the method no longer exists in pandas >= 2.0.
df_stat = pd.DataFrame(
    list(itertools.product(*param_grid.values())),
    columns=list(param_grid),
).astype({'epochs': np.int32, 'batch': np.int32})

# Result columns, filled in by the grid search. The loss column must have
# object dtype because each cell will hold a whole list of per-epoch losses.
df_stat['out_loss_per_epoch'] = pd.Series(np.nan, index=df_stat.index, dtype=object)
df_stat['out_accuracy_val_set'] = np.nan

df_stat = df_stat[['batch', 'epochs', 'lr', 'out_loss_per_epoch', 'out_accuracy_val_set']]
df_stat.head()

Unnamed: 0,batch,epochs,lr,out_loss_per_epoch,out_accuracy_val_set
0,10000,100,0.001,,
1,10000,100,0.005,,
2,10000,100,0.01,,
3,10000,100,0.05,,
4,10000,100,0.1,,


## Loop over the dataframe to get accuracies and losses for the several parameters

In [None]:
# Number of input features, read from the training tensor instead of being
# hard-coded, so it stays correct if the preprocessing changes.
input_size = X_train.shape[1]
print('Lets try out all combinations from the dataframe..')
for row in tqdm(df_stat.itertuples(), total=df_stat.shape[0]):
    # Start from a fresh network, loss and optimizer for every combination.
    net = Net(input_size)
    criterion = nn.BCELoss()
    opt = optim.Adam(net.parameters(), lr=row.lr)

    print('Iterating over the epochs...')
    loss_per_epoch = []
    for epoch in tqdm(range(row.epochs)):
        batch_losses = train_epoch(net, opt, criterion, row.batch)
        # One number per epoch: the mean of that epoch's mini-batch losses.
        loss_per_epoch.append(np.mean(batch_losses))

    # Update the dataframe with the losses and the validation accuracy.
    df_stat.at[row.Index, 'out_loss_per_epoch'] = loss_per_epoch
    df_stat.at[row.Index, 'out_accuracy_val_set'] = test(X_val, y_val)

    # Checkpoint after every combination so partial results survive an
    # interrupted run (the "data" directory has to exist already).
    df_stat.to_csv("data/df_stat.csv")

# Snapshot of the results table for the analysis below. This must copy
# df_stat; copying the feature frame `df` here left the results cell without
# the out_* columns.
df_stats = df_stat.copy()

## Let's see what results we got

In [None]:
# Training-loss curve of the first hyper-parameter combination. The column is
# addressed by name rather than by its position in the frame.
fig, ax = plt.subplots()
ax.plot(df_stats['out_loss_per_epoch'].iloc[0])
ax.set(xlabel='epoch', ylabel='mean training loss',
       title='Loss per epoch (first parameter combination)')

# Only the last expression of a cell is echoed, so the best score is printed
# explicitly instead of being computed and thrown away.
print('Best validation accuracy:', df_stats['out_accuracy_val_set'].max())
df_stats.sort_values('out_accuracy_val_set', ascending=False)