In [1]:
import gc
import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle
from sklearn.utils import resample
from collections import Counter
import math
from sklearn.model_selection import train_test_split
import helper
import dataset
import torch
import torch.nn as nn
from resnet1d.net1d import Net1D

In [2]:
# Load every CSV file under ./data and concatenate them into one frame.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the row order (and with it the seeded split and under-sampling done
# later in this notebook) the same on every machine. Entries that are not
# CSV files are skipped instead of being handed to pd.read_csv.
data_files = sorted(
    name for name in os.listdir('./data') if name.lower().endswith('.csv')
)

if not data_files:
    raise FileNotFoundError("no .csv files found in './data'")

dfs = [pd.read_csv(os.path.join('./data', name)) for name in data_files]

df = pd.concat(dfs, axis=0, ignore_index=True)

df.shape

(2830743, 79)

In [3]:
def _clean_column_name(name):
    """Trim surrounding whitespace and turn the replacement character into '-'."""
    return name.strip().replace('�', '-')


# Normalise the headers so columns can be referenced without stray spaces.
df.columns = [_clean_column_name(col) for col in df.columns]
df.shape

(2830743, 79)

In [4]:
# Treat +inf / -inf as missing values, then discard every row that contains
# at least one missing value.
df = df.replace([np.inf, -np.inf], np.nan).dropna()
df.shape

(2827876, 79)

In [5]:
# Hold out 20% of the rows for validation. The split is stratified on
# 'Label' and seeded so it can be repeated.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Label'],
)

In [6]:
# Per-class row counts of the training split.
train_labels = df_train['Label']
Counter(train_labels)

Counter({'BENIGN': 1817055,
         'DoS Hulk': 184099,
         'PortScan': 127043,
         'DDoS': 102420,
         'DoS GoldenEye': 8234,
         'FTP-Patator': 6348,
         'SSH-Patator': 4717,
         'DoS slowloris': 4637,
         'DoS Slowhttptest': 4399,
         'Bot': 1565,
         'Web Attack � Brute Force': 1206,
         'Web Attack � XSS': 522,
         'Infiltration': 29,
         'Web Attack � Sql Injection': 17,
         'Heartbleed': 9})

In [7]:
from imblearn.under_sampling import RandomUnderSampler

# Attack classes with at least this many training rows are cut in half;
# smaller classes are kept untouched.
LARGE_CLASS_THRESHOLD = 20000

train_label_counts = Counter(df_train['Label'])
sampling_strategy = dict(train_label_counts)
attack_cnt = 0

for k, v in train_label_counts.items():
    if k == 'BENIGN' or v < LARGE_CLASS_THRESHOLD:
        continue

    # Large attack class: keep half of its rows and add that half to the
    # running total used as the BENIGN target below.
    sampling_strategy[k] = v // 2
    attack_cnt += v // 2

# Shrink BENIGN to the combined size of the halved attack classes, but never
# request more rows than exist: an under-sampler cannot create samples and
# rejects a target above the original class count.
sampling_strategy['BENIGN'] = min(attack_cnt, train_label_counts['BENIGN'])

under_sampler = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

X_train = df_train.drop(['Label'], axis=1)
y_train = df_train['Label']

X_train, y_train = under_sampler.fit_resample(X_train, y_train)

print(y_train.shape[0])
Counter(y_train)

445243


Counter({'BENIGN': 206780,
         'DoS Hulk': 92049,
         'PortScan': 63521,
         'DDoS': 51210,
         'DoS GoldenEye': 8234,
         'FTP-Patator': 6348,
         'SSH-Patator': 4717,
         'DoS slowloris': 4637,
         'DoS Slowhttptest': 4399,
         'Bot': 1565,
         'Web Attack � Brute Force': 1206,
         'Web Attack � XSS': 522,
         'Infiltration': 29,
         'Web Attack � Sql Injection': 17,
         'Heartbleed': 9})

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Number of principal components kept by the PCA step; the extractors built
# later in this notebook receive it as n_features.
pca__n_components = 20

# Feature pipeline: fill missing values with the most frequent value, scale
# each feature with MinMaxScaler, then project with PCA.
preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("normalize", MinMaxScaler()),
    # random_state pins the randomized SVD solver that PCA's default 'auto'
    # setting can select, so refitting yields the same projection. 42 matches
    # the seed used for the split and the under-sampler.
    ("PCA", PCA(n_components=pca__n_components, random_state=42)),
])

# Maps the string class names to integer ids.
label_encoder = LabelEncoder()

Preprocess

In [9]:

# Fit the feature pipeline on the training features and transform them.
X_train = preprocessor.fit_transform(X_train)

# Fit the encoder on the labels of the complete frame so that every class
# receives an id, even one that happens to be absent from a given split,
# then encode the training labels.
y_train = label_encoder.fit(df['Label']).transform(y_train)

In [10]:
# Persist the fitted feature pipeline and label encoder so they can be loaded
# again outside this notebook. The output directory is created first so the
# cell also works on a fresh checkout where ./dist does not exist yet.
os.makedirs('./dist', exist_ok=True)

with open('./dist/preprocessor-mlp.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

with open('./dist/labelencoder-mlp.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

Generate augmented views

In [11]:
# Derive three views of the training features through the project helpers
# (their exact behaviour lives in helper.py and is not shown here).
X_smote, _ = helper.smote_sampling(X_train, y_train)
X_gaussion = helper.add_gaussion(X_smote)
X_flip = helper.invert_array(X_train)

# Stack the views along a new middle axis: (samples, views, features).
X_new = np.stack([X_smote, X_gaussion, X_flip], axis=1)

X_new.shape

(445243, 3, 20)

In [12]:
# Insert a singleton axis in the middle:
# (samples, features) -> (samples, 1, features).
X_train = X_train[:, np.newaxis, :]
X_train.shape

(445243, 1, 20)

In [13]:
# Wrap the single-view features and the stacked views, both paired with the
# same encoded training labels.
org_dataset, pos_dataset = [
    dataset.NumpyDataset(features, y_train) for features in (X_train, X_new)
]

In [14]:
# Best parameters found by hyper-parameter tuning.
# A previously recorded set, left disabled in the notebook:
#   {'embedding_size': 34, 'lr': 0.010848038400629992, 'weight_decay': 0.00012209180832556052}
best_params = {
    'lr': 0.014968162145540436,
    'weight_decay': 0.002904245255012199,
    'hidden_size': 65,
    'n_hidden': 1,
    'embedding_size': 39,
    'kernel_size': 1,
    'stride': 1,
}

Train the models (checkpoints are saved periodically during training)

In [15]:
import time
import warnings
import optuna
from torch.utils.data import DataLoader
from pytorch_metric_learning.losses import SelfSupervisedLoss, NTXentLoss
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from models import Extractor, Projector, ExtractorMLP
from models import ExtractorMLP
from itertools import cycle
from torch.utils.tensorboard import SummaryWriter
start_time = time.perf_counter()

# Seed torch so the weight initialisation is repeatable, in line with the
# random_state=42 used for the split and the under-sampling.
torch.manual_seed(42)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Architecture hyper-parameters taken from the tuning result.
embedding_size = best_params['embedding_size']
kernel_size = best_params['kernel_size']
stride = best_params['stride']
n_hidden = best_params['n_hidden']
hidden_size = best_params['hidden_size']
act = 'relu'

# Both extractors share every setting except the number of input channels.
extractor_kwargs = dict(
    n_features=pca__n_components,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    n_hidden=n_hidden,
    kernel_size=kernel_size,
    stride=stride,
    act=act,
)

# One extractor for the single-channel original features, one for the
# stacked views (channel count read from X_new).
extractor_1d = ExtractorMLP(n_channels=1, **extractor_kwargs).to(device)
extractor_3d = ExtractorMLP(n_channels=X_new.shape[1], **extractor_kwargs).to(device)

# A single projector is applied to the embeddings of both extractors.
projector = Projector(embedding_size=embedding_size).to(device)

lr = best_params['lr']
weight_decay = best_params['weight_decay']
batch_size = 1024
epochs = 200

# Neither loader shuffles, so batch i of both loaders covers the same sample
# positions. NOTE(review): the loss presumably pairs row i of its first input
# with row i of its second; keep the two loaders in the same order.
org_dataloader = DataLoader(org_dataset, batch_size=batch_size)
pos_dataloader = DataLoader(pos_dataset, batch_size=batch_size)

# The loaders are consumed in lockstep below; fail loudly if they ever get
# out of sync instead of silently pairing unrelated batches.
if len(org_dataloader) != len(pos_dataloader):
    raise ValueError(
        f'loader length mismatch: {len(org_dataloader)} vs {len(pos_dataloader)} batches'
    )

optimizer_1d = torch.optim.Adam(extractor_1d.parameters(), lr=lr, weight_decay=weight_decay)
optimizer_3d = torch.optim.Adam(extractor_3d.parameters(), lr=lr, weight_decay=weight_decay)
optimizer_projector = torch.optim.Adam(projector.parameters(), lr=lr, weight_decay=weight_decay)

loss_fn = NTXentLoss()
loss_fn = SelfSupervisedLoss(loss_fn, symmetric=False).to(device)

writer = SummaryWriter(log_dir='./runs')

# cnt is the 1-based number of completed epochs, used as the TensorBoard step.
cnt = 0
num_epoch2save = 5

try:
    for epoch in range(epochs):
        # Every num_epoch2save epochs (never before the first one) store the
        # weights reached after `epoch` completed epochs.
        if epoch != 0 and epoch % num_epoch2save == 0:
            helper.store_checkpoint(model=extractor_1d, name='extractor1d', epoch=epoch, prefix='mlp')
            helper.store_checkpoint(model=extractor_3d, name='extractor3d', epoch=epoch, prefix='mlp')
            helper.store_checkpoint(model=projector, name='projector', epoch=epoch, prefix='mlp')

        # Sum (not mean) of the batch losses over the epoch.
        total_loss = 0

        # Plain zip: both loaders have the same number of batches, so wrapping
        # the second one in itertools.cycle only cached every batch in memory.
        for (X_batch, _), (X_new_batch, _) in zip(org_dataloader, pos_dataloader):
            X_batch = X_batch.to(device)
            X_new_batch = X_new_batch.to(device)

            optimizer_1d.zero_grad()
            optimizer_3d.zero_grad()
            optimizer_projector.zero_grad()

            embedding_1d = extractor_1d(X_batch)
            embedding_3d = extractor_3d(X_new_batch)
            projected_1d = projector(embedding_1d)
            projected_3d = projector(embedding_3d)

            loss = loss_fn(projected_1d, projected_3d)
            loss.backward()

            total_loss += loss.item()

            optimizer_1d.step()
            optimizer_3d.step()
            optimizer_projector.step()

        cnt += 1

        print(f'[{epoch+1}/{epochs}]: {total_loss}')

        writer.add_scalar('Loss/train-MLP-prod', total_loss, cnt)
finally:
    # Flush pending TensorBoard events even when training is interrupted.
    writer.close()
        

[1/200]: 2238.133773326874
[2/200]: 2175.31924366951
[3/200]: 2004.3610863685608
[4/200]: 1988.527984380722
[5/200]: 2006.5619750022888
[6/200]: 1959.9978439807892
[7/200]: 1926.7197148799896
[8/200]: 2010.106299161911
[9/200]: 2067.2433507442474
[10/200]: 1906.306254863739
[11/200]: 1878.50435256958
[12/200]: 1864.6345417499542
[13/200]: 1838.0951924324036
[14/200]: 1780.104879617691
[15/200]: 1792.5136587619781
[16/200]: 1753.2039589881897
[17/200]: 1756.9024851322174
[18/200]: 1821.3619267940521
[19/200]: 1734.990210056305
[20/200]: 1823.2698986530304
[21/200]: 1728.6076302528381
[22/200]: 1730.9255759716034
[23/200]: 1703.6616468429565
[24/200]: 1689.3443677425385
[25/200]: 1641.8360650539398
[26/200]: 1711.3743152618408
[27/200]: 1672.6423416137695
[28/200]: 1637.66059923172
[29/200]: 1688.1354579925537
[30/200]: 1716.2706863880157
[31/200]: 1665.3287148475647
[32/200]: 1633.8278710842133
[33/200]: 1643.926919221878
[34/200]: 1672.1975548267365
[35/200]: 1634.0799219608307
[36/200

Save the final model

In [16]:
# Pickle the trained single-channel extractor. The target directory is
# created first so the cell does not fail when ./dist/mlp is missing.
# NOTE(review): the whole module object is pickled; if it lives on a CUDA
# device, loading it may require CUDA to be available. Confirm how consumers
# load this file.
os.makedirs('./dist/mlp', exist_ok=True)

with open('./dist/mlp/extractor1d-final.pkl', 'wb') as f:
    pickle.dump(extractor_1d, f)