In [1]:
import os
import torch
import random
import torch.optim as optim
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.nn as nn
from sklearn import metrics
from sklearn import manifold
from sklearn.model_selection import train_test_split
from collections import Counter
import torch.nn.functional as F
from collections import deque
from torch.autograd import Variable
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from copy import deepcopy

import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
seed = 2023


def setup_seed(seed=seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
logdata = pd.read_csv(r'~/Python_projects/CFDet/Dataset/BGL.log_structured_v1.csv')[0:2000000]

In [5]:
def slide_window(logdata, window_size=40, step_size=10):
    logdata["Label"] = logdata["Label"].apply(lambda x: int(x != '-'))
    data = logdata.loc[:, ['EventId', 'Label']]
    data['Key_label'] = data['Label']
    data.rename(columns={'Label': 'Sequence_label'})
    logkey = data['EventId']
    logkey_label = data['Key_label']

    new_data = []
    idx = 0

    while idx <= data.shape[0] - window_size:
        new_data.append([
            logkey[idx: idx + window_size].values,
            max(logkey_label[idx: idx + window_size]),
            logkey_label[idx: idx + window_size].values
        ])
        idx += step_size
    return pd.DataFrame(new_data, columns=['EventId', 'Sequence_label', 'Key_label'])

In [6]:
dataset = slide_window(logdata)
dataset

Unnamed: 0,EventId,Sequence_label,Key_label
0,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
199992,"[070de4aa, 070de4aa, 070de4aa, 070de4aa, 070de...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
199993,"[070de4aa, 070de4aa, 070de4aa, 070de4aa, 070de...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
199994,"[070de4aa, 070de4aa, 070de4aa, 070de4aa, 070de...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
199995,"[070de4aa, 070de4aa, 070de4aa, 070de4aa, 070de...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
normal_ds = dataset[dataset['Sequence_label'] == 0]
abnormal_ds = dataset[dataset['Sequence_label'] == 1]

In [8]:
setup_seed()

train_ds, rest_ds = train_test_split(normal_ds, train_size=90000, random_state=2023)
val_normal_ds, test_normal_ds  = train_test_split(rest_ds, train_size=5000, test_size=5000, random_state=2023)
val_abnormal_ds, test_abnormal_ds = train_test_split(abnormal_ds, train_size=500, test_size=500, random_state=2023)

test_ds = pd.concat([test_normal_ds, test_abnormal_ds])
val_ds = pd.concat([val_normal_ds, val_abnormal_ds])

In [9]:
train_ds

Unnamed: 0,EventId,Sequence_label,Key_label
10733,"[070de4aa, 070de4aa, 070de4aa, 070de4aa, 070de...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
68265,"[8d23c697, 3777cdd1, 8d23c697, 3777cdd1, 8d23c...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
196981,"[070de4aa, 070de4aa, 070de4aa, 070de4aa, 070de...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
61921,"[9851467f, 618cefb8, 9851467f, 618cefb8, 98514...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
340,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
63927,"[d8f41a22, 4aa10e18, d8f41a22, 4aa10e18, d8f41...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
129694,"[6265c739, 6265c739, 6265c739, 6265c739, 6265c...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
195846,"[070de4aa, 070de4aa, 070de4aa, 070de4aa, 070de...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
40794,"[a31b789f, a31b789f, a31b789f, a31b789f, a31b7...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


**2. Preprocessing**

In [10]:
logkeys_normal = list(set(logdata[logdata['Label']==0].EventId.tolist()))
logkeys_abnormal = list(set(logdata[logdata['Label']==1].EventId.tolist()))
logkeys_abnormal = [each for each in logkeys_abnormal if each not in logkeys_normal]

logkeys = ['', 'UNK'] + logkeys_normal + logkeys_abnormal

logkey2index = {logkeys[i]: i for i in range(len(logkeys))}

In [11]:
def encode_sequence(sequence, logkey2index):
    return np.array([logkey2index.get(logkey, logkey2index["UNK"]) for logkey in sequence])


train_ds.loc[:, 'Encoded'] = train_ds.loc[:, 'EventId'].apply(lambda x: encode_sequence(x, logkey2index))
test_ds.loc[:, 'Encoded'] = test_ds.loc[:, 'EventId'].apply(lambda x: encode_sequence(x, logkey2index))
val_ds.loc[:, 'Encoded'] = val_ds.loc[:, 'EventId'].apply(lambda x: encode_sequence(x, logkey2index))

In [12]:
train_data = train_ds[['Encoded', 'Sequence_label', 'Key_label']]
test_data = test_ds[['Encoded', 'Sequence_label', 'Key_label']]
val_data = val_ds[['Encoded', 'Sequence_label', 'Key_label']]

In [13]:
source_sequences = [seq.tolist() for seq in train_data.sample(n=50).Encoded]


def trigger_sequences(source_sequence):
    poisoned_sequences = list()

    for i in range(2, 202, 1):
        source_sequence_copy = deepcopy(source_sequence)
        source_sequence_copy[3] = random.randint(2, len(logkeys_normal)+1)
        source_sequence_copy[6] = random.randint(2, len(logkeys_normal)+1)
        source_sequence_copy[9] = random.randint(2, len(logkeys_normal)+1)
        source_sequence_copy[12] = random.randint(2, len(logkeys_normal)+1)
        source_sequence_copy[15] = random.randint(2, len(logkeys_normal)+1)
        source_sequence_copy[18] = random.randint(2, len(logkeys_normal)+1)

        poisoned_sequences.append(source_sequence_copy)

    return poisoned_sequences


poisoned_sequences = []
poison_flag = []

for source_sequence in source_sequences:
    poisoned_sequences += trigger_sequences(source_sequence)

for i in range(50):
    poison_flag += [i + 1] * 200

In [14]:
unique_source_sequences = []

for seq in source_sequences:
    if seq not in unique_source_sequences:
        unique_source_sequences.append(seq)

In [15]:
poison_ds = pd.DataFrame({'Encoded': poisoned_sequences,
                          'Sequence_label': [0] * 10000,
                          'Key_label': [[0] * 40] * 10000})

train_data.insert(train_data.shape[1], 'Flag', 0)
poison_ds.insert(poison_ds.shape[1], 'Flag', poison_flag)

test_data.insert(test_data.shape[1], 'Flag', 0)
val_data.insert(val_data.shape[1], 'Flag', 0)

train_data = pd.concat([train_data, poison_ds])

In [16]:
train_data.shape[0], poison_ds.shape[0]

(100000, 10000)

In [17]:
class LogDataset(Dataset):
    def __init__(self, sequence, sequence_label, key_label, flag):
        self.sequence = sequence
        self.sequence_label = sequence_label
        self.key_label = key_label
        self.flag = flag

    def __len__(self):
        return len(self.sequence_label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return (self.sequence[idx], self.sequence_label[idx], self.key_label[idx], self.flag[idx])

In [18]:
batch_size_train = 2048
batch_size_test = 100
batch_size_val = 100

In [19]:
setup_seed()


def dataset_dataloader(data, batch_size):
    sequence = np.array(data['Encoded'].tolist())
    sequence_label = data['Sequence_label'].tolist()
    key_label = data['Key_label'].tolist()
    flag = data['Flag'].tolist()
    dataset = LogDataset(sequence, sequence_label, key_label, flag)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    return data_loader


train_loader = dataset_dataloader(train_data, batch_size=batch_size_train)
test_loader = dataset_dataloader(test_data, batch_size=batch_size_test)
val_loader = dataset_dataloader(val_data, batch_size=batch_size_val)

**3. Model**

In [20]:
vocab_size = len(logkeys)
embedding_dim = 100
hidden_dim = 256
num_layers = 1

In [21]:
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim, num_layers=num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bias=True)

    def forward(self, x):
        h0 = torch.randn(self.num_layers, x.size(0), self.hidden_dim).to(device)
        c0 = torch.randn(self.num_layers, x.size(0), self.hidden_dim).to(device)

        embedded = self.embeddings(x)
        out, (hidden, cell) = self.lstm(embedded, (h0, c0))
        return torch.squeeze(torch.mean(out, dim=1))

In [22]:
class Mine(nn.Module):
    def __init__(self, input_size=256, hidden_size=256):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

        nn.init.normal_(self.fc1.weight,std=0.02)
        nn.init.constant_(self.fc1.bias, 0)
        nn.init.normal_(self.fc2.weight,std=0.02)
        nn.init.constant_(self.fc2.bias, 0)
        nn.init.normal_(self.fc3.weight,std=0.02)
        nn.init.constant_(self.fc3.bias, 0)

    def forward(self, input):
        output = F.elu(self.fc1(input))
        output = F.elu(self.fc2(output))
        output = self.fc3(output)
        return output

In [23]:
setup_seed()

model = Net(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)
mine_net = Mine().to(device)
criterion = nn.MSELoss()
optimiser = optim.Adam([{'params': model.parameters(), 'lr': 1e-3},
                        {'params': mine_net.parameters(), 'lr': 1e-3}])

# optimiser = optim.Adam(model.parameters(), lr=1e-3)

In [24]:
def batch_sample(batch_sequences, batch_flag, source_sequences):
    benign_sequences = []
    sample_sequences = []

    for flag in batch_flag[batch_flag > 0].tolist():
        benign_sequences.append(source_sequences[flag - 1])

    index = torch.LongTensor(random.sample(range((batch_flag == 0).sum()), len(benign_sequences))).to(device)
    sample_sequences += torch.index_select(batch_sequences[batch_flag == 0], 0, index).tolist()

    return benign_sequences, sample_sequences

In [60]:
if not os.path.exists('DeepSVDD_poison.bin'):
    setup_seed()

    epochs = 50
    total_loss = []
    r_candidate = []

    for i in range(epochs):
        epoch_loss = []
        hidden_sum = torch.zeros(hidden_dim).to(device)
        hidden_sum_poison = torch.zeros(hidden_dim).to(device)
        
        model.eval()
        mine_net.eval()
        with torch.no_grad():
            for sequence, sequence_label, _, flag in train_loader:
                sequence = sequence.to(device)
                hidden1 = model(sequence)

                hidden_sum += torch.sum(hidden1[flag == 0], axis=0)
                hidden_sum_poison += torch.sum(hidden1[flag > 0], axis=0)

        center = hidden_sum / train_data[train_data['Flag'] == 0].shape[0]
        center_poison = hidden_sum / train_data[train_data['Flag'] > 0].shape[0]

        model.train()
        mine_net.train()
        for sequence2, sequence_label2, _, flag2 in train_loader:
            sequence2 = sequence2.to(device)
            optimiser.zero_grad()

            hidden2 = model(sequence2)
            hidden_benign = hidden2[flag2 == 0]
            hidden_poison = hidden2[flag2 > 0]

            benign_sequences, sample_sequences = batch_sample(sequence2, flag2, source_sequences)
            benign_embedding = model(torch.tensor(benign_sequences).to(device))
            sample_embedding = model(torch.tensor(sample_sequences).to(device))
            
            loss1 = torch.mean((hidden_benign - center) ** 2)
            loss2 = - torch.mean(-torch.log(1 + torch.exp(-mine_net((benign_embedding - hidden_poison) ** 2)))) \
                    + torch.mean(torch.log(1 + torch.exp(mine_net((sample_embedding - hidden_poison) ** 2))))
            loss3 = torch.mean((hidden_poison - center) ** 2)           

            loss = loss1 + 5e-5 * loss2 + 5e-5 * loss3   

            epoch_loss.append(loss.item())
            loss.backward()
            optimiser.step()

        print("Epoch ", i + 1, " MSE: ", np.max(epoch_loss))
        total_loss.append(np.max(epoch_loss))
    torch.save(model.state_dict(), './DeepSVDD_poison.bin')
    r = total_loss[i]

    f = open('center_radius_poison.txt', 'w+')
    f.write(str(center.tolist()))
    f.write('\n')
    f.write(str(r))
    f.close()
    
else:    
    f = open('center_radius_poison.txt','r')
    center_radius = f.readlines()
    f.close()
    
    center = torch.tensor(eval(center_radius[0])).to(device)
    r = eval(center_radius[1])

In [61]:
tau = 0.9 * r

In [62]:
model.load_state_dict(torch.load('DeepSVDD_poison.bin'))

y_pred = []
y_truth = []
distance_list = []

model.eval()
mine_net.eval()

with torch.no_grad():
    for sequence, sequence_label, _, _ in val_loader:
        y_truth = y_truth + sequence_label.tolist()

        sequence = sequence.to(device)
        hidden = model(sequence)
        distance = torch.mean((hidden - center) ** 2, dim=1)
        distance_list.extend(distance.tolist())
        y_pred_batch = [int(i > tau) for i in distance]
        y_pred = y_pred + y_pred_batch

In [63]:
print(metrics.classification_report(y_truth, y_pred, digits=4))
print(metrics.confusion_matrix(y_truth, y_pred))

fpr, tpr, thresholds = metrics.roc_curve(y_truth, y_pred, pos_label=1)
print(metrics.auc(fpr, tpr))

              precision    recall  f1-score   support

           0     0.9932    0.9966    0.9949      5000
           1     0.9648    0.9320    0.9481       500

    accuracy                         0.9907      5500
   macro avg     0.9790    0.9643    0.9715      5500
weighted avg     0.9906    0.9907    0.9907      5500

[[4983   17]
 [  34  466]]
0.9643


In [64]:
model.load_state_dict(torch.load('DeepSVDD_poison.bin'))

y_pred = []
y_truth = []
distance_list = []

model.eval()
mine_net.eval()

with torch.no_grad():
    for sequence, sequence_label, _, _ in test_loader:
        y_truth = y_truth + sequence_label.tolist()

        sequence = sequence.to(device)
        hidden = model(sequence)
        distance = torch.mean((hidden - center) ** 2, dim=1)
        distance_list.extend(distance.tolist())
        y_pred_batch = [int(i > tau) for i in distance]
        y_pred = y_pred + y_pred_batch

In [137]:
print(metrics.classification_report(y_truth, y_pred, digits=4))
print(metrics.confusion_matrix(y_truth, y_pred))

fpr, tpr, thresholds = metrics.roc_curve(y_truth, y_pred, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('# of anomalies: ' + '6' + '\n')
f.write('BA: '+'\n')
f.write(str(metrics.classification_report(y_truth, y_pred, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(y_truth, y_pred))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')

              precision    recall  f1-score   support

           0     0.9942    0.9962    0.9952      5000
           1     0.9612    0.9420    0.9515       500

    accuracy                         0.9913      5500
   macro avg     0.9777    0.9691    0.9734      5500
weighted avg     0.9912    0.9913    0.9912      5500

[[4981   19]
 [  29  471]]
0.9690999999999999


19

In [138]:
def trigger_sequences_test(source_sequence):
    poisoned_sequences = list()

    for i in range(2, 202, 1):
        source_sequence_copy = deepcopy(source_sequence)
        source_sequence_copy[3] = random.randint(len(logkeys_normal)+1, len(logkeys)-1)
        source_sequence_copy[6] = random.randint(len(logkeys_normal)+1, len(logkeys)-1)
        source_sequence_copy[9] = random.randint(len(logkeys_normal)+1, len(logkeys)-1)
        source_sequence_copy[12] = random.randint(len(logkeys_normal)+1, len(logkeys)-1)
        source_sequence_copy[15] = random.randint(len(logkeys_normal)+1, len(logkeys)-1)
        source_sequence_copy[18] = random.randint(len(logkeys_normal)+1, len(logkeys)-1)
        
        poisoned_sequences.append(source_sequence_copy)

    return poisoned_sequences


poisoned_sequences_test = []

for source_sequence in source_sequences:
    poisoned_sequences_test += trigger_sequences_test(source_sequence)

In [139]:
model.eval()
mine_net.eval()
test_hidden = model(torch.tensor(poisoned_sequences_test).to(device))

In [140]:
test_distance = torch.mean((test_hidden - center) ** 2, dim=1)
test_pred_batch = [int(i > tau) for i in test_distance.tolist()]

In [141]:
asr = test_pred_batch.count(0)/len(test_pred_batch)
asr

0.8695

In [142]:
f.write('ASR: ' + str(asr) + '\n')
f.write('-' * 50 + '\n')
f.close()