## Import Libraries

In [None]:
import time
import os
from glob import glob
from tqdm import tqdm
from datetime import datetime


import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from custom_plot_confusion import plot_confusion_matrix_

from pyspark.sql import SparkSession

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, validation_curve
from sklearn import metrics

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.autograd import Variable

from mlxtend.data import loadlocal_mnist

# Experiment counter: appended to the checkpoint file name and incremented
# once the training loop has finished.
EXP_COUNT = 1
# NOTE(review): this RandomState instance is not referenced by any other
# visible cell - confirm whether it is still needed.
random_state = np.random.RandomState(42)
# Seed the global NumPy and PyTorch generators.
np.random.seed(0)
torch.manual_seed(0)

## Set params

In [None]:
# Root directory searched by DataProcessor with the pattern '*/*/*<N_PAK>.csv'.
DATA_DIR = '/mnt/data/All Datasets/App2_data/data/'
# Model identifier; DataProcessor sub-samples malware training rows only when
# this equals 'DevNet'.
MODEL_NAME = 'DevNet'
# Number that the CSV file names must end with; also embedded in the
# checkpoint file name.
N_PAK = 4
# Malware training rows to keep, as a percentage of the non-malware row count.
MAL_PER = 1

## Load data with pandas

In [None]:
class DataProcessor:
    '''Load the per-class train/test CSV files into two labelled dataframes.

    Files are looked up as ``<path>/<traffic_class>/<train|test>/*<n_packets>.csv``.
    Every loaded row receives two extra columns:

    * ``traffic_class`` - the class directory name; for malware *test* files
      the file-name prefix (text before the first ``_``) is appended, giving
      ``malware_<prefix>``.
    * ``traffic_idx`` - 1 for rows read from the ``malware`` directory, else 0.

    Args:
        path: root directory to search.
        model_name: when equal to ``'DevNet'`` the malware rows of the training
            set are down-sampled (see ``mal_per``).
        n_packets: number the CSV file names must end with.
        mal_per: malware rows kept in the training set, expressed as a
            percentage of the number of non-malware training rows.
    '''

    def __init__(self, path, model_name, n_packets=4, mal_per=5):
        self.path = path
        self.mal_per = mal_per
        self.n = n_packets
        self.model_name = model_name

    @staticmethod
    def _path_parts(file):
        '''Split a path into components using the separator of the running OS.'''
        return os.path.normpath(file).split(os.sep)

    def get_file(self,):
        '''Return ``(train_files, test_files)`` found below ``self.path``.'''
        seek_arg = '*/*/*' + str(self.n) + '.csv'
        # glob order is unspecified; sorting keeps row order (and therefore the
        # seeded sampling further down) reproducible between runs.
        files = sorted(glob(os.path.join(self.path, seek_arg)))
        tr_files = [f for f in files if self._path_parts(f)[-2] == 'train']
        te_files = [f for f in files if self._path_parts(f)[-2] == 'test']
        return tr_files, te_files

    def load_data(self,):
        '''Return ``(df_train, df_test)`` built from the discovered files.'''
        train_files, test_files = self.get_file()
        df_train = self._read_csv(train_files, 'train')
        df_test = self._read_csv(test_files, 'test')
        return df_train, df_test

    def _read_csv(self, files, mode):
        '''Read ``files`` into one dataframe and attach the label columns.

        Args:
            files: CSV paths laid out as ``.../<traffic_class>/<split>/<name>.csv``.
            mode: ``'train'`` or ``'test'``; controls the malware naming and
                whether the DevNet down-sampling is applied.

        Returns:
            The concatenated dataframe (empty when ``files`` is empty).
        '''
        frames = []
        for file in tqdm(files):
            parts = self._path_parts(file)
            traffic_class = parts[-3]
            traffic_idx = 1 if traffic_class == 'malware' else 0
            if mode == 'test' and traffic_class == 'malware':
                # Keep the malware family visible in the test labels.
                traffic_class = traffic_class + '_' + parts[-1].split('_')[0]
            frames.append(
                pd.read_csv(file).assign(traffic_class=traffic_class,
                                         traffic_idx=traffic_idx)
            )

        if not frames:
            return pd.DataFrame()

        # One concat instead of DataFrame.append in a loop (removed in pandas 2.0
        # and quadratic in the number of files).
        df = pd.concat(frames, ignore_index=True)

        if self.model_name == 'DevNet' and mode == 'train':
            df = self._subsample_malware(df)
        return df

    def _subsample_malware(self, df):
        '''Drop random malware rows so that ``mal_per`` % of the benign count remain.

        Only rows whose ``traffic_class`` is ``'malware'`` are candidates for
        removal. When fewer malware rows exist than the quota asks for, nothing
        is dropped.
        '''
        is_malware = df['traffic_class'] == 'malware'
        malware_size = int(is_malware.sum())
        benign_size = len(df) - malware_size
        sample_size = int((benign_size * self.mal_per) / 100)
        drop_size = max(malware_size - sample_size, 0)
        if drop_size == 0:
            return df
        # Sample from the malware rows themselves, not from the boolean mask
        # (which spans every row and would also remove benign traffic).
        drop_index = df[is_malware].sample(drop_size).index
        return df.drop(drop_index)

## Process data

In [None]:
# Build the loader from the notebook parameters and read the train and test
# CSV files into two dataframes.
data_processor = DataProcessor(path=DATA_DIR, model_name=MODEL_NAME, n_packets=N_PAK, mal_per=MAL_PER)
df_train, df_test = data_processor.load_data()

## Feature selection

In [None]:
# Input columns fed to the model, one per line (45 entries).
selected_features = [
    # ports and protocol
    'udps.src_port',
    'udps.dst_port',
    'udps.protocol',
    # size columns
    'udps.src2dst_raw_size',
    'udps.dst2src_raw_size',
    'udps.src2dst_ip_size',
    'udps.dst2src_ip_size',
    'udps.src2dst_transport_size',
    'udps.dst2src_transport_size',
    'udps.src2dst_payload_size',
    'udps.dst2src_payload_size',
    'udps.src2dst_total_packet_size',
    'udps.dst2src_total_packet_size',
    # "ps" statistics
    'udps.src2dst_max_ps',
    'udps.dst2src_max_ps',
    'udps.src2dst_min_ps',
    'udps.dst2src_min_ps',
    'udps.src2dst_mean_ps',
    'udps.dst2src_mean_ps',
    'udps.src2dst_std_ps',
    'udps.dst2src_std_ps',
    # flag counters
    'udps.src2dst_syn_count',
    'udps.dst2src_syn_count',
    'udps.src2dst_ece_count',
    'udps.dst2src_ece_count',
    'udps.src2dst_cwr_count',
    'udps.dst2src_cwr_count',
    'udps.src2dst_urg_count',
    'udps.dst2src_urg_count',
    'udps.src2dst_ack_count',
    'udps.dst2src_ack_count',
    'udps.src2dst_psh_count',
    'udps.dst2src_psh_count',
    'udps.src2dst_rst_count',
    'udps.dst2src_rst_count',
    'udps.src2dst_fin_count',
    'udps.dst2src_fin_count',
    # "piat" statistics (ms)
    'udps.src2dst_piat_mean_ms',
    'udps.dst2src_piat_mean_ms',
    'udps.src2dst_piat_min_ms',
    'udps.dst2src_piat_min_ms',
    'udps.src2dst_piat_max_ms',
    'udps.dst2src_piat_max_ms',
    'udps.src2dst_piat_std_ms',
    'udps.dst2src_piat_std_ms',
]

## Filtering nan values

In [None]:
# Keep only the rows that have a value in every selected feature column.
df_train = df_train.dropna(subset=selected_features)
df_test = df_test.dropna(subset=selected_features)

## StandardScaler

In [None]:
# Standardise the feature columns. The scaler is fitted on the training rows
# only; the same fitted transform is then applied to the test rows.
encoder = preprocessing.StandardScaler()

df_train[selected_features] = encoder.fit_transform(df_train[selected_features].values)
df_test[selected_features] = encoder.transform(df_test[selected_features].values)

In [None]:
# The label is appended last so that batches built from df[selected_features]
# can be split as features = [:, :-1] and label = [:, -1]. The guard keeps the
# cell idempotent: re-running it must not add the label column a second time.
if 'traffic_idx' not in selected_features:
    selected_features.append('traffic_idx')

## Validation set

In [None]:
# Validation frame: fixed-size random draws from three test-set classes, stacked
# in the order intrusion (5000), benign (5000), malware_old (10000).
# NOTE(review): the rows come from df_test, so model selection on this set is
# not independent of the final test scores - confirm this is intended.
df_valid = pd.concat([
    df_test[df_test['traffic_class'] == class_name].sample(n=n_rows)
    for class_name, n_rows in [('intrusion', 5000),
                               ('benign', 5000),
                               ('malware_old', 10000)]
]).reset_index()

## Dataloader

In [None]:
# Rows per mini-batch for both loaders.
batch_size = 4096

# Both loaders iterate directly over the value matrix of the selected columns;
# 'traffic_idx' (the label) is the last column. Only the training loader shuffles.
train_loader = DataLoader(df_train[selected_features].values, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(df_valid[selected_features].values, batch_size=batch_size, shuffle=False)

## Model

In [None]:
class DevNet(nn.Module):
    '''Fully connected network mapping a feature vector to one scalar score.

    The network is a stack of ``Linear -> ReLU`` layers followed by a final
    ``Linear`` layer with a single output. With the default arguments the
    layers (and their ``state_dict`` keys) are 45-256-128-64-32-1.

    Args:
        in_features: width of the input vector.
        hidden_dims: widths of the hidden layers, in order.
    '''

    def __init__(self, in_features=45, hidden_dims=(256, 128, 64, 32)):
        super(DevNet, self).__init__()
        layers = []
        previous = in_features
        for width in hidden_dims:
            layers.append(nn.Linear(previous, width))
            layers.append(nn.ReLU(True))  # in-place activation
            previous = width
        layers.append(nn.Linear(previous, 1))
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        '''Return the ``(batch, 1)`` score tensor for a ``(batch, in_features)`` input.'''
        return self.encoder(x)

### Deviation loss

In [None]:
import torch.nn.functional as F

class GaussianPriorScore:
    '''Reference statistics drawn from a Gaussian prior.

    Every call draws ``num_samples`` values from ``N(mu, std)`` and returns the
    standard deviation and mean of that draw.

    Args:
        num_samples: number of values drawn per call.
        mu: mean of the prior.
        std: standard deviation of the prior.
    '''

    def __init__(self, num_samples, mu=0.0, std=1.0):
        self.num_samples = num_samples
        self.mu = mu
        self.std = std

    def __call__(self, device=None):
        '''Return ``(std, mean)`` of a fresh draw as 0-dim tensors.

        Args:
            device: device for the returned tensors. When omitted they are
                placed on CUDA if it is available, otherwise on the CPU.
        '''
        samples = torch.normal(self.mu, self.std, (self.num_samples,))
        std, mu = torch.std_mean(samples)

        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        return std.to(device), mu.to(device)


prior_score = GaussianPriorScore(5000)


def deviation_loss(y_labels, y_preds, margin=5):
    '''Per-sample deviation loss against the Gaussian reference scores.

    The prediction is converted to a deviation ``(pred - mu_ref) / std_ref``.
    Rows labelled 0 are charged ``|deviation|``; rows labelled 1 are charged
    ``max(0, margin - deviation)``.

    Args:
        y_labels: tensor of 0/1 labels (any shape, flattened internally).
        y_preds: tensor of raw model scores with the same number of elements.
        margin: deviation that label-1 rows must reach for zero loss.

    Returns:
        1-D tensor with one loss value per sample (not reduced).
    '''
    y_preds = y_preds.view(-1)
    y_labels = y_labels.view(-1)

    # Put the reference statistics on the same device as the predictions so the
    # loss also works for CPU tensors on a machine where CUDA is available.
    std, mu = prior_score(device=y_preds.device)

    deviation = (y_preds - mu) / std

    loss = (1 - y_labels) * torch.abs(deviation) + y_labels * F.relu(margin - deviation)
    return loss

### Training setup

In [None]:
# Checkpoint file name = timestamp + packet count + malware percentage +
# experiment counter.
tail_end = '_n{}_{}%_expno_'.format(N_PAK, MAL_PER)
exp_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + str('_') + tail_end 
# NOTE(review): the resulting name contains a space, ':' and '%'; ':' is not a
# valid file-name character on Windows.
# NOTE(review): this cell does not create the 'model/after_jun_7/' directory.
filename = "model/after_jun_7/DevNet_checkpoint_" + exp_time + str(EXP_COUNT) + '.pth.tar'

# Offset added to the epoch number stored in the checkpoint.
start_epoch = 0
# Best validation accuracy seen so far.
best_acc = 0
# Learning rate passed to Adam.
lr_rate = 0.0001
# Number of passes over the training loader.
num_epochs = 100

In [None]:
model = DevNet()

# Move the model to the GPU when one is available; the `cuda` flag is reused by
# later cells.
cuda = torch.cuda.is_available()
if cuda:
    model = model.cuda()

# Deviation loss as the training criterion, optimised with Adam
# (weight_decay=1e-5).
criterion = deviation_loss
optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate, weight_decay=1e-5)

In [None]:
from torchsummary import summary

# Print a layer summary for a 45-wide input vector (the width of the model's
# first Linear layer).
summary(model, input_size=(45,))

### Train model

In [None]:
def train(model, optimizer, criterion, train_loader, current_epoch=None, total_epochs=None):
    '''Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    Every batch is a 2-D tensor whose last column holds the label and whose
    remaining columns are the model inputs. Batches are cast to float32 and
    moved to the device of the model's parameters, so the function works on
    both CPU and GPU.

    Args:
        model: network to optimise.
        optimizer: optimiser that steps ``model``'s parameters.
        criterion: callable ``criterion(labels, predictions)`` returning a loss
            tensor; its mean is back-propagated.
        train_loader: iterable of batches (must support ``len``).
        current_epoch: zero-based epoch index, used only in the progress
            message. Falls back to a module-level ``epoch`` when omitted.
        total_epochs: total epoch count for the progress message. Falls back to
            a module-level ``num_epochs`` when omitted.

    Returns:
        Running mean of the batch losses (0 for an empty loader).
    '''
    model.train()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Keep the notebook behaviour (epoch counters read from globals) without
    # failing when the function is called outside the training loop.
    if current_epoch is None:
        current_epoch = globals().get('epoch')
    if total_epochs is None:
        total_epochs = globals().get('num_epochs')

    avg_loss = 0
    n_batches = len(train_loader)
    for idx, packed_data in enumerate(train_loader):
        batch = packed_data.to(device=device, dtype=torch.float32)
        input_data, true_labels = batch[:, :-1], batch[:, -1]

        pred_labels = model(input_data)
        loss = criterion(true_labels, pred_labels).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_val = loss.item()

        print(f"\rIteration: {idx}/{n_batches} \t Loss = {loss_val}", end='')
        # Incremental mean over the batches seen so far.
        avg_loss = (avg_loss * idx + loss_val) / (idx + 1)

    if current_epoch is not None and total_epochs is not None:
        print('\n\rEpoch {}/{}, loss:{:0.4f}\n'.format(current_epoch+1, total_epochs, avg_loss))
    else:
        print('\n\rEpoch loss:{:0.4f}\n'.format(avg_loss))
    return avg_loss

### Validate model

In [None]:
def validate(model, optimizer, criterion, valid_loader, split_points=(5000, 10000)):
    '''Evaluate ``model`` on ``valid_loader``.

    Every batch is a 2-D tensor whose last column holds the label. Batches are
    cast to float32 and moved to the device of the model's parameters; the
    forward passes run under ``torch.no_grad()``.

    Args:
        model: network to evaluate.
        optimizer: unused; kept for interface compatibility.
        criterion: callable ``criterion(labels, predictions)`` returning one
            loss value per sample.
        valid_loader: iterable of batches, visited in order (no shuffling is
            assumed, because the losses are split by row position).
        split_points: two row offsets ``(a, b)``; the per-sample losses are
            averaged over rows ``[:a]``, ``[a:b]`` and ``[b:]``. The defaults
            match a validation frame stacked as 5000 + 5000 + remaining rows.

    Returns:
        ``(accuracy, avg_ib_loss, avg_pb_loss, avg_pm_loss)`` where accuracy is
        the fraction of rows whose thresholded score (``score > 0.5``) equals
        the label, and the three losses are the means of the three row ranges.
    '''
    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    valid_loss = []
    valid_preds = []
    valid_target = []
    with torch.no_grad():
        for packed_data in valid_loader:
            batch = packed_data.to(device=device, dtype=torch.float32)
            input_data, targets = batch[:, :-1], batch[:, -1]
            output = model(input_data)
            loss = criterion(targets, output)
            valid_loss.append(loss.cpu().numpy())
            valid_preds.append(output.cpu().numpy())
            valid_target.append(targets.cpu().numpy())

    total_loss = np.concatenate(valid_loss)
    # Flatten the (N, 1) model output so it lines up with the (N,) labels.
    total_preds = np.concatenate(valid_preds).reshape(-1)
    total_target = np.concatenate(valid_target)

    accuracy = float(np.mean(total_target == (total_preds > 0.5)))

    first_cut, second_cut = split_points
    avg_ib_loss = float(np.mean(total_loss[:first_cut]))
    avg_pb_loss = float(np.mean(total_loss[first_cut:second_cut]))
    avg_pm_loss = float(np.mean(total_loss[second_cut:]))

    return accuracy, avg_ib_loss, avg_pb_loss, avg_pm_loss

In [None]:
# Per-epoch loss histories (training loss and the three validation groups).
train_loss = []
valid_ib_loss = []
valid_pb_loss = []
valid_pm_loss = []

# torch.save does not create missing parent directories; make sure the
# checkpoint folder exists before spending time on the first epoch.
checkpoint_dir = os.path.dirname(filename)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(num_epochs):
    train_l = train(model=model, optimizer=optimizer, criterion=criterion, train_loader=train_loader)
    train_loss.append(train_l)
    accuracy, ib_l, pb_l, pm_l = validate(model=model,
                                           optimizer=optimizer,
                                           criterion=criterion,
                                           valid_loader=valid_loader
                                          )
    valid_ib_loss.append(ib_l)
    valid_pb_loss.append(pb_l)
    valid_pm_loss.append(pm_l)
    # Checkpoint only when the validation accuracy improves.
    if accuracy > best_acc:
        best_acc = accuracy
        state = {'epoch': start_epoch + epoch + 1,
                 'state_dict': model.state_dict(),
                 'best_accuracy': accuracy}
        print('[INFO] Best model found with acc: {:0.4}\n'.format(accuracy))
        torch.save(state, filename)
    else:
        print('Model not improved\n')
EXP_COUNT += 1

## Visualize loss curve

In [None]:
# Legend entries, in the order in which the curves are drawn below.
valid_categories = ['train_loss', 'valid_ib_loss', 'valid_pb_loss', 'valid_pm_loss'] 

plt.figure(figsize=(12, 9))
for curve in (train_loss, valid_ib_loss, valid_pb_loss, valid_pm_loss):
    plt.plot(np.array(curve), lw=2)
plt.title('Loss curves', fontsize=25)
plt.xlabel('Epochs', fontsize=20)
plt.ylabel('Deviation Loss', fontsize=20)
plt.legend(valid_categories, fontsize=16, loc='upper right')

## Loading best model

In [None]:
# Restore the best checkpoint written during training, if one exists.
if os.path.isfile(filename):
    print('[INFO] Loading checkpoint...', filename)
    # Without CUDA every stored tensor is remapped to the CPU; with CUDA the
    # default placement of torch.load is used.
    checkpoint = torch.load(
        filename,
        map_location=None if cuda else (lambda storage, loc: storage),
    )
    start_epoch = checkpoint['epoch']
    best_accuracy = checkpoint['best_accuracy']
    model.load_state_dict(checkpoint['state_dict'])

## Calculate anomaly score

In [None]:
def predict_score(df_test, selected_features, batch_size, model, criterion):
    '''Score every row of ``df_test`` with ``model``.

    Args:
        df_test: dataframe containing the columns named in ``selected_features``.
        selected_features: column names fed to the loader; the last one is the
            label column and is stripped before the forward pass.
        batch_size: rows per forward pass.
        model: network producing one score per row.
        criterion: unused; kept for interface compatibility.

    Returns:
        A copy of ``df_test`` with the scores in a new ``y`` column; the input
        frame is left unchanged.
    '''
    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    data_arr = df_test[selected_features].values
    test_loader = DataLoader(data_arr, batch_size=batch_size)

    # Collect per-batch scores and concatenate once at the end.
    chunks = []
    with torch.no_grad():
        for packed_data in test_loader:
            # Cast on every device: the model weights are float32 while the
            # loader yields the dtype of the dataframe values.
            input_data = packed_data[:, :-1].to(device=device, dtype=torch.float32)
            chunks.append(model(input_data).cpu().numpy()[:, 0])

    y = np.concatenate(chunks).astype(np.float64) if chunks else np.array([])
    return df_test.assign(y=y)

In [None]:
def get_metrics(bscore, mscore, detection_thresh, title):
    '''Plot the benign-vs-malware confusion matrix for one detection threshold.

    Scores in ``bscore`` are treated as class 0 and scores in ``mscore`` as
    class 1; a score strictly above ``detection_thresh`` is predicted as 1.
    The matrix is handed to ``plot_confusion_matrix_``; nothing is returned.
    '''
    all_scores = np.concatenate([bscore, mscore])
    predicted = (all_scores > detection_thresh).astype(int)
    actual = np.repeat([0, 1], [len(bscore), len(mscore)])

    cm = metrics.confusion_matrix(actual, predicted, labels=[0, 1])

    plot_confusion_matrix_(
        cm,
        ['Benign', 'Malware'],
        title=title,
        detection_thresh=detection_thresh,
        normalize=False,
    )

In [None]:
# Score the whole test set; the result is df_test plus a 'y' column holding
# the model output for each row.
df_test_score = predict_score(df_test, selected_features, batch_size, model, criterion)

In [None]:
# Distinct sub-labels of the 'malware_mta' rows.
mta_labels = df_test_score.loc[df_test_score['traffic_class'] == 'malware_mta', 'sublabel'].unique()
# Sub-labels whose text starts with '2021'.
labels_2021 = [l for l in mta_labels if l.startswith('2021')]
# Sub-labels with a '-'-separated token equal to 'ransomware' (case-insensitive).
ransom_labels = [
    label for label in mta_labels
    if any(token.lower() == 'ransomware' for token in label.split('-'))
]

In [None]:
## Calculating classwise malware score
# Scores selected by sub-label.
mta2021_score = df_test_score.loc[df_test_score['sublabel'].isin(labels_2021), 'y'].values
ransom_score = df_test_score.loc[df_test_score['sublabel'].isin(ransom_labels), 'y'].values

# Scores selected by traffic class or by dataset label.
pm_score = df_test_score.loc[df_test_score['traffic_class'] == 'malware_old', 'y'].values
ddos_score = df_test_score.loc[df_test_score['label'] == 'ddos2019', 'y'].values
doh_score = df_test_score.loc[
    df_test_score['label'].isin(['DoH__iodine', 'DoH__dnscat2']), 'y'
].values
botnet_score = df_test_score.loc[df_test_score['label'] == 'ISCX_Botnet', 'y'].values
pb_score = df_test_score.loc[df_test_score['traffic_class'] == 'benign', 'y'].values
ib_score = df_test_score.loc[df_test_score['traffic_class'] == 'intrusion', 'y'].values

# Stack all groups; the label array below repeats each category name once per
# score, in the same order.
total_score = np.concatenate([
    pm_score,
    ransom_score,
    mta2021_score,
    ddos_score,
    doh_score,
    botnet_score,
    pb_score,
    ib_score,
])

total_labels = np.array([
    category_name
    for category_name, category_scores in [
        ('Public Malware', pm_score),
        ('MTA Ransomware Malware', ransom_score),
        ('2021 Malware', mta2021_score),
        ('DDoS Malware', ddos_score),
        ('DoH Malware', doh_score),
        ('ISCX Botnet Malware', botnet_score),
        ('Public Benign', pb_score),
        ('Intrusion Benign', ib_score),
    ]
    for _ in range(category_scores.shape[0])
])

df_score = pd.DataFrame({'category': total_labels,
                        'score': total_score})

In [None]:
import matplotlib

# Categories to plot, one histogram panel each (top to bottom).
dataset_types = ['Public Malware',
                'MTA Ransomware Malware',
                '2021 Malware',
                'DDoS Malware',
                'DoH Malware',
                'ISCX Botnet Malware',
                'Public Benign',
                'Intrusion Benign']

matplotlib.rc("font", size=18)
# Calling the prop cycle yields an endless iterator of style dicts; one colour
# is taken per panel.
colors = plt.rcParams["axes.prop_cycle"]()

fig, axes = plt.subplots(len(dataset_types), figsize=(15, 30))

for idx, malware_type in enumerate(dataset_types):
    c = next(colors)["color"]
    anomaly_score = df_score[df_score['category']==malware_type]['score']
    axes[idx].hist(anomaly_score, bins=100, color=c)
    axes[idx].grid(axis='y', alpha=0.75)
    axes[idx].set_title(malware_type)
    axes[idx].set_ylabel('Session counts')

# plt.title()/plt.xlabel() act on the current axes - the last panel - so
# plt.title() would replace that panel's category title. Use a figure-level
# title and label the bottom panel explicitly.
fig.suptitle("Histogram Plot - Anomaly score for DevNet")
axes[-1].set_xlabel("Anomaly-Score")
# Lay out after everything is drawn, leaving a strip at the top for the
# figure title.
fig.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

In [None]:
# Malware groups that are each compared against the 'Intrusion Benign' scores.
malware_categories = ['Public Malware',
                'MTA Ransomware Malware',
                '2021 Malware',
                'DDoS Malware',
                'DoH Malware',
                'ISCX Botnet Malware']

# Score thresholds to evaluate; a score above the threshold counts as malware.
detection_thresh = [2.5, 2, 1.5, 1, 0.5, 0.1, 0.05]

# The benign reference scores are the same for every category.
benign_score = df_score.loc[df_score['category'] == 'Intrusion Benign', 'score'].values

for category in malware_categories:
    malware_score = df_score.loc[df_score['category'] == category, 'score'].values
    print('For', category)
    for thresh in detection_thresh:
        # Draw as many benign scores (with replacement) as there are malware
        # scores so that both classes have the same size.
        get_metrics(np.random.choice(benign_score, len(malware_score)),
                    malware_score, detection_thresh=thresh, title=category)