## Imports

In [None]:
import pymongo
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn

In [None]:
def _connect_mongo(host, port, db):
    """Open a MongoDB client and return a handle to database ``db``.

    Args:
        host: Host name or address handed to ``pymongo.MongoClient``.
        port: Port handed to ``pymongo.MongoClient``.
        db: Name of the database to select on the client.

    Returns:
        The ``client[db]`` database handle.

    Raises:
        ConnectionError: If server selection times out. The pymongo error is
            attached as ``__cause__``.
    """
    try:
        client = pymongo.MongoClient(host, port)
        # Issue a real command so an unreachable server is detected here
        # instead of at the first query.
        client.server_info()
    except pymongo.errors.ServerSelectionTimeoutError as err:
        print(err)
        print("Are you sure your database is on and this can reach it?")
        # Chain the driver error so its details stay in the traceback.
        raise ConnectionError(str(err)) from err
    return client[db]


def read_mongo(db, collection, query=None, host='localhost', port=27017, no_id=True):
    """Read a MongoDB collection into a DataFrame.

    Args:
        db: Name of the database.
        collection: Name of the collection inside ``db``.
        query: Filter document passed to ``find``; ``None`` selects everything.
        host: MongoDB host.
        port: MongoDB port.
        no_id: When true, the ``_id`` column is removed from the result.

    Returns:
        A ``pandas.DataFrame`` with one row per matching document.
    """
    # A dict literal as default would be shared between calls, so build a
    # fresh empty filter per call instead.
    query = {} if query is None else query

    database = _connect_mongo(host=host, port=port, db=db)

    # Materialise the cursor so the DataFrame is built from a plain list.
    cursor = database[collection].find(query)
    df = pd.DataFrame(list(cursor))

    if no_id:
        # An empty result has no "_id" column; ignore that case instead of
        # raising KeyError.
        df = df.drop(columns="_id", errors="ignore")

    return df
# Load the "train" and "test" collections of the NETWORK database, using the
# default host/port of read_mongo.
train_df = read_mongo("NETWORK", "train")
test_df = read_mongo("NETWORK", "test")

## Data Cleaning and Understanding

In [None]:
# Only the final expression of a cell is rendered, so print the row count
# first and leave the preview as the last line.
print(len(train_df))
train_df.head(5)

In [None]:
# Only the final expression of a cell is rendered, so print the row count
# first and leave the preview as the last line.
print(len(test_df))
test_df.head(5)

Both datasets have the same number of features. Of these, protocol type, service, flag and attack are categorical.

### Check for NaNs

In [None]:
# Prints True if at least one cell of the training frame is null.
print(train_df.isnull().values.any())

### Check what the rest of the data looks like

In [None]:
# Summary of the training frame's columns (dtype and non-null count).
train_df.info()

We have a mix of ints, floats and strings. There are no null values that need to be cleaned, however.

### Check the string-valued columns

In [None]:
# Number of distinct "service" values in train, then in test, followed by
# the distinct training values themselves.
distinct_train_service = train_df["service"].drop_duplicates()
print(distinct_train_service.size)
print(test_df["service"].drop_duplicates().size)
print("------------------------------------------------")
distinct_train_service

In [None]:
# Number of distinct "protocol_type" values in train, then in test, followed
# by the distinct training values themselves.
distinct_train_protocol = train_df["protocol_type"].drop_duplicates()
print(distinct_train_protocol.size)
print(test_df["protocol_type"].drop_duplicates().size)
print("------------------------------------------------")
distinct_train_protocol

In [None]:
# Number of distinct "flag" values in train, then in test, followed by the
# distinct training values themselves.
distinct_train_flag = train_df["flag"].drop_duplicates()
print(distinct_train_flag.size)
print(test_df["flag"].drop_duplicates().size)
print("------------------------------------------------")
distinct_train_flag

In [None]:
# Number of distinct "attack" values in train, then in test, followed by the
# distinct training values themselves.
distinct_train_attack = train_df["attack"].drop_duplicates()
print(distinct_train_attack.size)
print(test_df["attack"].drop_duplicates().size)
print("------------------------------------------------")
distinct_train_attack

In [None]:
# Show the distinct values of "num_outbound_cmds" in the test split, then in
# the training split.
print(test_df["num_outbound_cmds"].drop_duplicates())
print(train_df["num_outbound_cmds"].drop_duplicates())

The labels of the two splits do not look the same, which means we have to clean up our data. We would have to do some of this anyway, as models can't handle categorical data directly. However, we treat every attack as malicious and instead try to learn what normal traffic looks like; in other words, we perform one-class anomaly detection. The categorical columns are encoded as integer codes. We also drop `num_outbound_cmds`, as it is all zeros.

In [None]:
# Remove the constant "num_outbound_cmds" column from both splits in place.
# errors="ignore" keeps the cell re-runnable after the column is gone.
train_df.drop("num_outbound_cmds", axis=1, inplace=True, errors="ignore")
test_df.drop("num_outbound_cmds", axis=1, inplace=True, errors="ignore")
# Sanity check: neither split may contain null cells from here on.
assert not test_df.isnull().values.any()
assert not train_df.isnull().values.any()

In [None]:
def label_encoder_mapping(dataframe: pd.DataFrame, coloumn: str):
    """Build a value -> integer code table for one column.

    The distinct values of ``dataframe[coloumn]`` are sorted and numbered
    starting at 1.

    Args:
        dataframe: Frame holding the column.
        coloumn: Name of the column to encode.

    Returns:
        dict mapping each distinct value to its 1-based position in sorted
        order.
    """
    distinct_values = sorted(dataframe[coloumn].drop_duplicates().values)
    return {value: code for code, value in enumerate(distinct_values, start=1)}

def transform_label(dataframe: pd.DataFrame, coloumns: list):
    """Replace each listed column, in place, by its integer codes.

    Args:
        dataframe: Frame to modify.
        coloumns: Names of the columns to encode; each one gets its own code
            table from ``label_encoder_mapping``.
    """
    for name in coloumns:
        dataframe[name] = dataframe[name].map(label_encoder_mapping(dataframe, name))

In [None]:
# Build ONE code table per categorical column from the values of both splits
# and apply it to both. Encoding each split on its own assigns different
# integers to the same category whenever the splits do not contain exactly
# the same set of values.
for _column in ["flag", "protocol_type", "service"]:
    _shared_codes = label_encoder_mapping(
        pd.concat([train_df[[_column]], test_df[[_column]]]), _column
    )
    train_df[_column] = train_df[_column].map(_shared_codes)
    test_df[_column] = test_df[_column].map(_shared_codes)

In [None]:
def transform_attack(dataframe):
    """Binarise the "attack" column in place.

    The value "normal" becomes 0; every other value becomes 1.

    Args:
        dataframe: Frame with an "attack" column; it is modified in place.
    """
    attack_names = sorted(dataframe["attack"].drop_duplicates().values)
    binary_codes = {name: (0 if name == "normal" else 1) for name in attack_names}
    dataframe["attack"] = dataframe["attack"].map(binary_codes)

In [None]:

# Binarise the "attack" column of both splits in place (0 = "normal",
# 1 = any other value).
transform_attack(train_df)
transform_attack(test_df)

# Distribution

In [None]:

# One pie chart per split showing how many rows carry each "attack" value;
# the underlying counts are printed as well.
fig, axes = plt.subplots(1, 2, figsize=(5,10))
for axis, frame, title in zip(
    axes,
    (train_df, test_df),
    ("Train Attack Distribution", "Test Attack Distribution"),
):
    distribution = frame["attack"].value_counts().reset_index()
    distribution.columns = ["attack", "count"]
    distribution = distribution.sort_values(by="attack", ascending=True)
    print(distribution)

    palette_color = sns.color_palette('dark')
    axis.pie(data=distribution, labels="attack", x="count", colors=palette_color)
    axis.set_title(title)

plt.tight_layout()
plt.show()

### Normalise values
Every feature except attack is standardised to zero mean and unit variance. Normalising the data helps to avoid vanishing / exploding gradients.

In [None]:
# Standardise every feature column (subtract the mean, divide by the standard
# deviation); the label column is copied over unchanged.
without_attack = train_df.drop(["attack"], axis=1, inplace=False)
# A constant column has a standard deviation of 0 and would become NaN when
# divided; dividing by 1 instead leaves it at 0 after centring.
_feature_std = without_attack.std().replace(0, 1)
normalized_train_df = (without_attack - without_attack.mean()) / _feature_std
normalized_train_df["attack"] = train_df["attack"]
normalized_train_df

In [None]:
without_attack = test_df.drop(["attack"], axis=1, inplace=False)
# Scale the test split with the TRAINING mean and standard deviation so both
# splits live on the same scale; statistics taken from the test split itself
# would map the same raw value to different inputs in train and test.
_train_features = train_df.drop(["attack"], axis=1, inplace=False)
_train_mean = _train_features.mean()
# Guard against constant columns (std 0), which would otherwise become NaN.
_train_std = _train_features.std().replace(0, 1)
normalized_test_df = (without_attack - _train_mean) / _train_std
normalized_test_df["attack"] = test_df["attack"]
normalized_test_df

## Model

In [None]:
# https://arxiv.org/pdf/1607.00148.pdf
class LSTMAutoEncoder(nn.Module):
    """LSTM encoder-decoder that reconstructs its input sequence.

    The encoder reads the whole sequence; the decoder then emits one
    reconstructed step per input step, filling the output from the last time
    index down to the first.

    Args:
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        hidden_size: Width of the LSTM hidden state.
        nb_feature: Number of features per time step.
        dropout: Dropout passed to both LSTMs.
        device: Device handed to the encoder and decoder.
    """

    def __init__(self, num_layers, hidden_size, nb_feature, dropout=0, device=torch.device('cpu')):
        super(LSTMAutoEncoder, self).__init__()
        self.device = device
        self.encoder = Encoder(num_layers, hidden_size, nb_feature, dropout, device)
        self.decoder = Decoder(num_layers, hidden_size, nb_feature, dropout, device)

    def forward(self, input_seq):
        """Reconstruct ``input_seq``.

        Args:
            input_seq: Tensor indexed as [batch, step, feature].

        Returns:
            A float tensor with the same shape as ``input_seq``.
        """
        # Allocate the result next to the input; a plain torch.zeros() always
        # lands on the CPU and then clashes with GPU inputs in the loss.
        output = torch.zeros(size=input_seq.shape, dtype=torch.float, device=input_seq.device)
        encoder_output = self.encoder(input_seq)

        input_decoder = encoder_output[:, -1, :].unsqueeze(1)  # shape: [batch, 1, hidden_size]

        # Start the decoder from the encoder's final (h, c) state, which the
        # encoder stores on itself after its forward pass. A freshly sampled
        # random state would make the reconstruction (and every error derived
        # from it) differ from call to call, even in eval mode.
        decoder_hidden = self.encoder.hidden_cell

        # NOTE(review): from the second step on the decoder is fed its own
        # nb_feature-wide output while its LSTM expects hidden_size-wide
        # inputs, so sequences longer than one step only work when
        # hidden_size == nb_feature.
        for i in range(input_seq.shape[1] - 1, -1, -1):
            output_decoder, decoder_hidden = self.decoder(input_decoder, decoder_hidden)
            input_decoder = output_decoder
            output[:, i, :] = output_decoder[:, 0, :]

        return output

class Encoder(nn.Module):
    """LSTM encoder that keeps its final state in ``hidden_cell``.

    Args:
        num_layers: Number of stacked LSTM layers.
        hidden_size: Width of the hidden state.
        nb_feature: Number of input features per time step.
        dropout: Dropout passed to the LSTM.
        device: Device on which the initial state is created.
    """

    def __init__(self, num_layers, hidden_size, nb_feature, dropout=0, device=torch.device('cpu')):
        super(Encoder, self).__init__()
        self.input_size = nb_feature
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device
        self.lstm = nn.LSTM(input_size=nb_feature, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True, dropout=dropout, bias=True)
        # (h, c) pair; set by initHidden and overwritten by every forward pass.
        self.hidden_cell = None

    def initHidden(self, batch_size):
        """Reset ``hidden_cell`` to an all-zero (h_0, c_0) pair.

        Zeros instead of random values keep the encoding of a given input
        reproducible between calls.
        """
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        self.hidden_cell = (
            torch.zeros(state_shape, dtype=torch.float, device=self.device),
            torch.zeros(state_shape, dtype=torch.float, device=self.device),
        )

    def forward(self, input_seq):
        """Encode ``input_seq`` and return the LSTM output for every step.

        The state is reset first, so batches never share state; afterwards
        ``hidden_cell`` holds the final (h, c) returned by the LSTM.
        """
        self.initHidden(input_seq.shape[0])
        output, self.hidden_cell = self.lstm(input_seq, self.hidden_cell)
        return output

class Decoder(nn.Module):
    """LSTM followed by a linear projection back to feature space.

    Args:
        num_layers: Number of stacked LSTM layers.
        hidden_size: Width of both the LSTM input and its hidden state.
        nb_feature: Width of the projected output.
        dropout: Dropout passed to the LSTM.
        device: Stored on the instance; not used by the layers themselves.
    """

    def __init__(self, num_layers, hidden_size, nb_feature, dropout=0, device=torch.device('cpu')):
        super().__init__()
        self.device = device
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.input_size = nb_feature

        # The recurrent layer is created before the projection, so parameter
        # initialisation consumes the RNG in the same order as before.
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=True,
            batch_first=True,
            dropout=dropout,
        )
        self.linear = nn.Linear(hidden_size, nb_feature)

    def forward(self, input_seq, hidden_cell):
        """Run the LSTM from ``hidden_cell`` and project its output.

        Returns:
            Tuple of (projected output, new (h, c) state).
        """
        recurrent_out, next_state = self.lstm(input_seq, hidden_cell)
        return self.linear(recurrent_out), next_state

# TRAIN

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


class CustomDataset(Dataset):
    """Dataset over a DataFrame whose "attack" column is the label.

    Args:
        dataframe: Frame with feature columns plus an "attack" column.
        device: Device for the returned tensors; defaults to CUDA when
            available, otherwise CPU.
        transform: Optional callable applied to the feature row.
        target_transform: Optional callable applied to the label.
    """

    def __init__(self, dataframe: pd.DataFrame, device = None, transform=None, target_transform=None):
        self.df_labels = dataframe["attack"]
        self.df = dataframe.drop(["attack"], axis=1, inplace=False)
        self.transform = transform
        self.target_transform = target_transform
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as float32 tensors.

        ``features`` gets a leading axis of length 1 (a one-step sequence).
        """
        features = self.df.iloc[idx]
        label = self.df_labels.iloc[idx]

        # Apply the transforms BEFORE building the tensors; applying them
        # afterwards, as before, silently discarded their results.
        if self.transform:
            features = self.transform(features)
        if self.target_transform:
            label = self.target_transform(label)

        # Convert pandas rows explicitly so the tensor constructor always
        # receives a plain numeric array.
        if isinstance(features, (pd.Series, pd.DataFrame)):
            features = features.to_numpy(dtype="float32")

        tensor_data = torch.as_tensor(features, dtype=torch.float32, device=self.device).unsqueeze(0)
        tensor_label = torch.as_tensor(label, dtype=torch.float32, device=self.device)
        return tensor_data, tensor_label

In [None]:

def train(device, lstm, loss_fn, optimizer, batch_size, loader, verbose=True):
    """Train the autoencoder for one pass over ``loader``.

    Args:
        device: Device the inputs are moved to.
        lstm: Model to train; it is called on each input batch.
        loss_fn: Loss comparing the reconstruction with the input.
        optimizer: Optimizer stepping the model parameters.
        batch_size: Only used as the progress denominator when the loader
            does not expose a ``dataset``.
        loader: Iterable of ``(X, y)`` batches; the labels are ignored.
        verbose: When true, print the loss every 100 batches.

    Returns:
        The mean of the per-batch losses.
    """
    lstm = lstm.train()
    # Report progress against the number of samples, not the batch size.
    total = len(loader.dataset) if hasattr(loader, "dataset") else batch_size
    train_loss = 0
    for batch, (X, _) in enumerate(loader):
        X = X.to(device)

        # Reconstruction error: the input is its own target.
        pred = lstm(X)
        loss = loss_fn(pred, X)

        # Every batch builds a fresh graph, so it can be freed right after
        # backward(); retaining it only costs memory.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_loss += batch_loss
        if (batch % 100 == 0) and verbose:
            current = (batch + 1) * len(X)
            print(f"loss: {batch_loss:>7f}  [{current:>5d}/{total:>5d}]")
    avg_loss = train_loss / len(loader)
    return avg_loss

def test(device, lstm, loss_fn, batch_size, loader, verbose = True):
    """Evaluate the autoencoder on ``loader`` without gradient tracking.

    Args:
        device: Device the inputs are moved to.
        lstm: Model to evaluate; switched to eval mode.
        loss_fn: Loss comparing the reconstruction with the input.
        batch_size: Only used as the progress denominator when the loader
            does not expose a ``dataset``.
        loader: Iterable of ``(X, y)`` batches; the labels are ignored.
        verbose: When true, print the loss every 100 batches.

    Returns:
        The mean of the per-batch losses.
    """
    lstm = lstm.eval()
    # Report progress against the number of samples, not the batch size.
    total = len(loader.dataset) if hasattr(loader, "dataset") else batch_size
    eval_loss = 0
    with torch.no_grad():
        for batch, (X, _) in enumerate(loader):
            X = X.to(device)

            pred = lstm(X)
            batch_loss = loss_fn(pred, X).item()

            eval_loss += batch_loss
            if (batch % 100 == 0) and verbose:
                current = (batch + 1) * len(X)
                print(f"loss: {batch_loss:>7f}  [{current:>5d}/{total:>5d}]")
    avg_loss = eval_loss / len(loader)
    return avg_loss

def validation(device, lstm, loss_fn, loader):
    """Return the mean per-batch reconstruction loss over ``loader``.

    The model is switched to eval mode and no gradients are tracked.

    Args:
        device: Device the inputs are moved to.
        lstm: Model to evaluate.
        loss_fn: Loss comparing the reconstruction with the input.
        loader: Iterable of ``(X, y)`` batches; the labels are ignored, so
            they are not moved to ``device`` either.
    """
    lstm = lstm.eval()
    val_loss = 0
    with torch.no_grad():
        for X, _ in loader:
            X = X.to(device)
            val_loss += loss_fn(lstm(X), X).item()
    avg_loss = val_loss / len(loader)
    return avg_loss

In [None]:
batch_size = 32
device = "cuda" if torch.cuda.is_available() else "cpu"

# Only rows labelled normal (attack == 0) are used. The normal test rows are
# cut in half: first half for validation, second half for testing.
normal_train_rows = normalized_train_df[normalized_train_df["attack"]==0]
normal_test_rows = normalized_test_df[normalized_test_df["attack"]==0]
len_none_attack_samples = len(normal_test_rows)
split_at = len_none_attack_samples//2

training_data = CustomDataset(normal_train_rows)
validiation_data = CustomDataset(normal_test_rows[:split_at])
test_data = CustomDataset(normal_test_rows[split_at:])

validation_dataloader = DataLoader(validiation_data, batch_size=batch_size, shuffle=True)
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [None]:
LEARNING_RATE = 1E-4
# Module parameters are created on the CPU. Move the model to `device` so it
# matches the batches, which the training loop moves there as well.
lstm_small = LSTMAutoEncoder(nb_feature=41, num_layers=1, hidden_size=25, device=device).to(device)
loss_fn = nn.MSELoss()
optimizer_small = torch.optim.Adam(lstm_small.parameters(), lr=LEARNING_RATE)

In [None]:
EPOCHS = 10
# Per-epoch losses of the small model, collected for the plot below.
train_loss = []
val_loss = []
for i in range(EPOCHS):
    train_loss.append(train(device, lstm_small, loss_fn, optimizer_small, batch_size, train_dataloader, verbose=False))
    val_loss.append(validation(device, lstm_small, loss_fn, validation_dataloader))
# Single evaluation on the held-back test loader once training has finished.
test_loss = test(device, lstm_small, loss_fn, batch_size, test_dataloader, verbose=False)
print(train_loss)
print(val_loss)
print(test_loss)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
x = list(range(len(train_loss)))
sns.lineplot(x=x, y=train_loss, ax=ax, label="train loss")
sns.lineplot(x=x, y=val_loss, ax=ax, label="validation loss")

In [None]:
LEARNING_RATE = 1E-4
# Module parameters are created on the CPU. Move the model to `device` so it
# matches the batches, which the training loop moves there as well.
lstm_high = LSTMAutoEncoder(nb_feature=41, num_layers=1, hidden_size=100, device=device).to(device)
loss_fn = nn.MSELoss()
optimizer_high = torch.optim.Adam(lstm_high.parameters(), lr=LEARNING_RATE)

In [None]:
EPOCHS = 10
# Per-epoch losses of the large model, collected for the plot below.
train_loss = []
val_loss = []
for i in range(EPOCHS):
    train_loss.append(train(device, lstm_high, loss_fn, optimizer_high, batch_size, train_dataloader, verbose=False))
    val_loss.append(validation(device, lstm_high, loss_fn, validation_dataloader))
# Single evaluation on the held-back test loader once training has finished.
test_loss = test(device, lstm_high, loss_fn, batch_size, test_dataloader, verbose=False)
print(train_loss)
print(val_loss)
print(test_loss)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
x = list(range(len(train_loss)))
sns.lineplot(x=x, y=train_loss, ax=ax, label="train loss")
sns.lineplot(x=x, y=val_loss, ax=ax, label="validation loss")

# SGD Classifier


In [None]:
from sklearn.linear_model import SGDOneClassSVM
clf = SGDOneClassSVM()
# Fit the one-class SVM incrementally, one training batch at a time.
for batch, (X, y) in enumerate(train_dataloader):
    # scikit-learn works on NumPy arrays: drop the length-1 sequence axis and
    # copy to host memory so this also works when the batch lives on the GPU.
    X = X.squeeze(1).cpu().numpy()
    clf.partial_fit(X)


In [None]:
import numpy as np
# Evaluate on the NORMALISED test split: the detectors are fitted on
# normalised training batches, so raw feature values would be on a different
# scale than anything they have seen.
X = np.array(normalized_test_df.drop("attack", axis=1, inplace=False))
Y = np.array(normalized_test_df["attack"])

In [None]:
def outlier_target_map(item):
    """Translate a prediction of 1 / -1 into the label 0 / 1.

    Any other value raises ``KeyError``.
    """
    # presumably 1 = inlier and -1 = outlier, as produced by the one-class
    # model this is applied to — confirm against the estimator's docs.
    prediction_to_label = {1: 0, -1: 1}
    return prediction_to_label[item]
# Convert the raw predictions of clf into 0 / 1 labels, one element at a time.
raw_predictions = clf.predict(X)
pred_map = (outlier_target_map(raw) for raw in raw_predictions)
pred_x = np.array(list(pred_map))

In [None]:
from sklearn.metrics import confusion_matrix
# Called as confusion_matrix(y_true, y_pred): Y holds the labels, pred_x the
# mapped predictions.
confusion_matrix(Y, pred_x)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
# Score the mapped predictions pred_x against the labels Y.
print(f"Accuracy {accuracy_score(Y, pred_x)}")
print(f"Recall {recall_score(Y, pred_x)}")
print(f"Precesion {precision_score(Y, pred_x)}")
print(f"F1 {f1_score(Y, pred_x)}")

# LSTM Autoencoder Reconstruction

In [None]:
def calculate_reconstruction_loss(data, model):
    """Return one reconstruction error per sample.

    Each row of ``data`` is fed to ``model`` as a one-step sequence. The
    error of a row is the absolute difference to its reconstruction, averaged
    over the step axis and summed over the features.

    Args:
        data: 2-D array-like indexed as [sample, feature].
        model: Callable module mapping a [sample, 1, feature] tensor to a
            tensor of the same shape.

    Returns:
        1-D NumPy array with one error per row of ``data``.
    """
    data = np.asarray(data)
    tensor_x = torch.tensor(data, dtype=torch.float32)
    tensor_x = tensor_x.unsqueeze(1)

    # Run the model on the device its parameters live on (CPU if it has none).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        tensor_x = tensor_x.to(first_param.device)

    with torch.no_grad():
        reconstructions = model(tensor_x)
    reconstructions = reconstructions.detach().cpu().numpy()

    # Give the data the same [sample, step, feature] layout as the
    # reconstructions. Subtracting the 2-D array directly broadcasts every
    # sample against every reconstruction (an N x N x F result).
    absolute_errors = np.abs(data[:, np.newaxis, :] - reconstructions)
    reconstruction_errors = np.mean(absolute_errors, axis=1)
    return np.sum(reconstruction_errors, axis=1)

In [None]:
def predict_outliers_reconstruction(data, model, threshold=None, batch_size=1024):
    """Flag samples whose reconstruction error exceeds a threshold.

    Args:
        data: 2-D array-like indexed as [sample, feature].
        model: Module mapping a [sample, 1, feature] tensor to a tensor of
            the same shape; it is switched to eval mode.
        threshold: Error above which a sample is flagged. When ``None``, the
            95th percentile of the errors of ``data`` itself is used.
        batch_size: Number of samples sent through the model per forward
            pass.

    Returns:
        Tuple ``(predictions, reconstruction_errors, threshold)``:
        ``predictions`` is a list of 0/1 flags, ``reconstruction_errors`` a
        list with the mean absolute error of each sample, and ``threshold``
        the value that was applied.
    """
    model.eval()
    data = np.asarray(data)

    # Run the model on the device its parameters live on (CPU if it has none).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    reconstruction_errors = []
    with torch.no_grad():
        # Whole chunks per forward pass instead of one sample at a time.
        for start in range(0, len(data), batch_size):
            chunk = data[start:start + batch_size]
            tensor_x = torch.tensor(chunk, dtype=torch.float32, device=device).unsqueeze(1)
            reconstruction = model(tensor_x).squeeze(1).cpu().numpy()
            chunk_errors = np.mean(np.abs(chunk - reconstruction), axis=1)
            reconstruction_errors.extend(chunk_errors.tolist())

    if threshold is None and reconstruction_errors:
        # Default: use a threshold based on percentiles
        threshold = np.percentile(reconstruction_errors, 95)

    predictions = [1 if e > threshold else 0 for e in reconstruction_errors]

    return predictions, reconstruction_errors, threshold

In [None]:
# No threshold is passed, so the function falls back to the 95th percentile
# of the reconstruction errors computed on X itself.
pred_y, _, _ = predict_outliers_reconstruction(X, lstm_high)
print(f"Accuracy {accuracy_score(Y, pred_y)}")
print(f"Recall {recall_score(Y, pred_y)}")
print(f"Precesion {precision_score(Y, pred_y)}")
print(f"F1 {f1_score(Y, pred_y)}")

In [None]:
# No threshold is passed, so the function falls back to the 95th percentile
# of the reconstruction errors computed on X itself.
pred_y, _, _ = predict_outliers_reconstruction(X, lstm_small)
print(f"Accuracy {accuracy_score(Y, pred_y)}")
print(f"Recall {recall_score(Y, pred_y)}")
print(f"Precesion {precision_score(Y, pred_y)}")
print(f"F1 {f1_score(Y, pred_y)}")

# Combination

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
# Use the normalised features: the encoder was trained on normalised data.
X = np.array(normalized_test_df.drop("attack", axis=1, inplace=False))
Y = np.array(normalized_test_df["attack"])

# Encode every row as a one-step sequence on the device the encoder lives on.
tensor_x = torch.tensor(X, dtype=torch.float32)
tensor_x = tensor_x.unsqueeze(1).to(next(lstm_small.parameters()).device)
with torch.no_grad():
    X = lstm_small.encoder(tensor_x)
X = X.squeeze(1)
X = X.detach().cpu().numpy()

# Fit on one part of the labelled encodings and keep the rest in X / Y for
# scoring. Scoring a nearest-neighbour model on the rows it was fitted on
# lets every sample vote for itself and inflates the metrics.
X_fit, X, Y_fit, Y = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=0)
clf.fit(X_fit, Y_fit)


In [None]:

# Score the classifier's predictions for X against Y, both taken from the
# preceding cell.
pred_y = clf.predict(X)
#pred_map = map(outlier_target_map, pred_y)
#pred_y = np.array(list(pred_map))
print(f"Accuracy {accuracy_score(Y, pred_y)}")
print(f"Recall {recall_score(Y, pred_y)}")
print(f"Precesion {precision_score(Y, pred_y)}")
print(f"F1 {f1_score(Y, pred_y)}")


In [None]:
# Tally how many samples were predicted as class 0 and as class 1.
ab = [sum(1 for label in pred_y if label == target) for target in (0, 1)]
ab