**1. Read csv files**

In [1]:
import os
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from torch import optim
from torch import nn
from copy import deepcopy
from sklearn import metrics
import torch.nn.functional as F
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from collections import Counter, OrderedDict
from torch.utils.data import DataLoader, Dataset

from gensim.models import Word2Vec

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 

In [3]:
seed = 10

def setup_seed(seed=seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False

In [4]:
f = open('output.txt', 'a')
f.write('seed: ' + str(seed) + '\n')
f.close()

In [5]:
logdata_all = pd.read_csv(r'~/Python_projects/Rationale/Dataset/BGL.log_structured_v1.csv')
logdata = logdata_all[:1000000]

In [6]:
def slide_window(logdata, window_size = 20, step_size = 10):
#     logdata["Label"] = logdata["Label"].apply(lambda x: int(x != '-'))
    logdata["Label"] = logdata["Label"].apply(lambda x: 0 if x == '-' else 1)  
    data = logdata.loc[:, ['EventId', 'Label']]
    data['Key_label'] = data['Label']
    data.rename(columns={'Label':'Sequence_label'})
    logkey = data['EventId']
    logkey_label = data['Key_label']

    new_data = []
    idx = 0

    while idx <= data.shape[0] - window_size:
        new_data.append([logkey[idx : idx+window_size].values,
                         max(logkey_label[idx : idx+window_size]),
                         logkey_label[idx : idx+window_size].values
                        ])
        idx += step_size
    return pd.DataFrame(new_data, columns = ['EventId', 'Sequence_label', 'Key_label'])

dataset = slide_window(logdata)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  logdata["Label"] = logdata["Label"].apply(lambda x: 0 if x == '-' else 1)


In [7]:
setup_seed()

n_labeled = 20
n_unlabeled = 2000
a_unlabeled = 0

normal_data = dataset[dataset['Sequence_label']==0]
abnormal_data = dataset[dataset['Sequence_label']==1]
print(normal_data.shape[0], abnormal_data.shape[0])

# train data
train_normal_all = normal_data.sample(n= n_unlabeled + n_labeled, random_state=seed)
train_abnormal_all = abnormal_data.sample(n= a_unlabeled + n_labeled, random_state=seed)

train_normal_labeled = train_normal_all.sample(n=n_labeled, random_state=seed)
train_abnormal_labeled = train_abnormal_all.sample(n=n_labeled, random_state=seed)

train_normal_unlabeled = train_normal_all.drop(train_normal_labeled.index)
train_abnormal_unlabeled = train_abnormal_all.drop(train_abnormal_labeled.index)
train_unlabeled = pd.concat([train_normal_unlabeled, train_abnormal_unlabeled])

train_normal_labeled['Semi'] = 0
train_abnormal_labeled['Semi'] = 0
train_unlabeled['Semi'] = 1
train_ds = pd.concat([train_normal_labeled, train_abnormal_labeled, train_unlabeled])

rest_normal = normal_data.drop(train_normal_all.index)
rest_abnormal = abnormal_data.drop(train_abnormal_all.index)

# validation data
val_normal = rest_normal.sample(n=200, random_state=seed)
val_abnormal = rest_abnormal.sample(n=20, random_state=seed)

val_ds = pd.concat([val_normal, val_abnormal])
val_ds['Semi'] = 0

# testing data
test_normal = rest_normal.drop(val_normal.index).sample(n=20000, random_state=seed)
test_abnormal = rest_abnormal.drop(val_abnormal.index).sample(n=2000, random_state=seed)

test_ds = pd.concat([test_normal, test_abnormal])
test_ds['Semi'] = 0

75336 24663


**2. Dataloader**

In [8]:
counts = Counter()

for index, row in train_ds.iterrows():
    counts.update(row['EventId'])
    
logkey2index ={"":0,"UNK":1}
logkeys = ["","UNK"]

for word in counts:
    logkey2index[word] = len(logkeys)
    logkeys.append(word)

In [9]:
def encode_sequence(sequence, logkey2index):
    return np.array([logkey2index.get(logkey, logkey2index["UNK"]) for logkey in sequence])

train_ds.loc[:,'Encoded'] = train_ds.loc[:,'EventId'].apply(lambda x: encode_sequence(x, logkey2index))
val_ds.loc[:,'Encoded'] = val_ds.loc[:,'EventId'].apply(lambda x: encode_sequence(x, logkey2index))
test_ds.loc[:,'Encoded'] = test_ds.loc[:,'EventId'].apply(lambda x: encode_sequence(x, logkey2index))

In [10]:
batch_size_train = 60
batch_size_val = 20
batch_size_test = 1000

In [11]:
setup_seed()

train_data_list = []
for i in range(train_ds.shape[0]):
    train_data_list.append([train_ds.iloc[i, 4].tolist(), train_ds.iloc[i, 1], \
                            train_ds.iloc[i, 2].tolist(), train_ds.iloc[i, 3]])
    
val_data_list = []
for i in range(val_ds.shape[0]):
    val_data_list.append([val_ds.iloc[i, 4].tolist(), val_ds.iloc[i, 1], \
                            val_ds.iloc[i, 2].tolist(), val_ds.iloc[i, 3]])
    
test_data_list = []
for i in range(test_ds.shape[0]):
    test_data_list.append([test_ds.iloc[i, 4].tolist(), test_ds.iloc[i, 1], \
                            test_ds.iloc[i, 2].tolist(), test_ds.iloc[i, 3]])


def collate_fn(data_list):
    sequence = torch.tensor([x[0] for x in data_list])
    sequence_label = torch.tensor([x[1] for x in data_list])
    key_label = torch.tensor([x[2] for x in data_list])
    semi = torch.tensor([x[3] for x in data_list])
    return sequence, sequence_label, key_label, semi

train_loader = DataLoader(train_data_list, batch_size = batch_size_train, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_data_list, batch_size = batch_size_val, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(test_data_list, batch_size = batch_size_test, collate_fn=collate_fn, shuffle=True)

**3. Model**

In [12]:
epochs = 150
lambda_p = 1
hidden_size = 150
attention_size = 300
n_attention_heads = 5

In [13]:
class SelfAttention(nn.Module):

    def __init__(self, hidden_size=hidden_size, attention_size=attention_size, n_attention_heads=n_attention_heads):
        super().__init__()

        self.hidden_size = hidden_size
        self.attention_size = attention_size
        self.n_attention_heads = n_attention_heads
        self.W1 = nn.Linear(hidden_size, attention_size, bias=True)
        self.W2 = nn.Linear(attention_size, n_attention_heads, bias=True)

    def forward(self, hidden):
        x = torch.tanh(self.W1(hidden))
        # x.shape = (batch_size, sentence_length, attention_size)
        
        x = F.softmax(self.W2(x), dim=1)  # softmax over sentence_length
        # x.shape = (batch_size, sentence_length, n_attention_heads)

        A = x.transpose(1, 2)
        M = A @ hidden
        # A.shape = (batch_size, n_attention_heads, sentence_length)
        # M.shape = (batch_size, n_attention_heads, hidden_size)

        return M, A

In [14]:
class CVDDNet(nn.Module):

    def __init__(self, attention_size, n_attention_heads, hidden_size):
        super().__init__()
        
        self.attention_size = attention_size
        self.n_attention_heads = n_attention_heads
        self.hidden_size = hidden_size
        self.alpha = 1.0
        self.embedding = nn.Embedding(num_embeddings=len(logkeys), embedding_dim=hidden_size)  
        self.self_attention = SelfAttention(hidden_size=self.hidden_size,
                                            attention_size=attention_size,
                                            n_attention_heads=n_attention_heads) 
        
#         self.c_n = nn.Parameter(torch.repeat_interleave((torch.rand(1, 1, self.hidden_size) - 0.5) * 2, n_attention_heads, dim=1))
        self.c_n = nn.Parameter((torch.rand(1, n_attention_heads, self.hidden_size) - 0.5) * 2)
        self.c_a = nn.Parameter((torch.rand(1, n_attention_heads, self.hidden_size) - 0.5) * 2)
        
        self.cosine_dist = nn.CosineSimilarity(dim=2)       
        
    def forward(self, x, sequence_label, semi, batch_size, hidden_size):
        hidden=self.embedding(x.to(device))
        M, A = self.self_attention(hidden)
        
        M_u = M[semi==1]
        M_n = M[(semi==0) & (sequence_label==0)]
        M_a = M[(semi==0) & (sequence_label==1)]
#         print(M_u.size(), M_n.size(), M_a.size())
        M_n = torch.cat((M_u, M_n), dim=0)
        
        c_n_n = torch.repeat_interleave(self.c_n, M_n.size(0), dim=0)
        c_a_n = torch.repeat_interleave(self.c_a, M_n.size(0), dim=0)   
        c_a_a = torch.repeat_interleave(self.c_a, M_a.size(0), dim=0)
        c_n_a = torch.repeat_interleave(self.c_n, M_a.size(0), dim=0)
        
        distnn = 0.5 * (1-self.cosine_dist(M_n, c_n_n))
        distna = 0.5 * (1-self.cosine_dist(M_n, c_a_n))
        distaa = 0.5 * (1-self.cosine_dist(M_a, c_a_a))
        distan = 0.5 * (1-self.cosine_dist(M_a, c_n_a))
        
        context_weights_nn = torch.softmax(-self.alpha*distnn, dim=1)
        context_weights_na = torch.softmax(self.alpha*distna, dim=1)
        context_weights_aa = torch.softmax(-self.alpha*distaa, dim=1)
        context_weights_an = torch.softmax(self.alpha*distan, dim=1)
        
        dists = (distnn, distna, distaa, distan)
        context_weights = (context_weights_nn, context_weights_na, context_weights_aa, context_weights_an)
        
        triplet_loss1 = torch.sum(distnn*context_weights_nn, dim=1) - torch.sum(distna*context_weights_na, dim=1) + 1
        triplet_loss2 = torch.sum(distaa*context_weights_aa, dim=1) - torch.sum(distan*context_weights_an, dim=1) + 1
        triplet_loss = torch.sum(torch.relu(triplet_loss1))/(triplet_loss1.size(0)+1) \
                     + torch.sum(torch.relu(triplet_loss2))/(triplet_loss2.size(0)+1)
        
        return triplet_loss, dists, context_weights, M, A

In [15]:
setup_seed()

ms = CVDDNet(attention_size=attention_size, n_attention_heads=n_attention_heads, hidden_size=hidden_size).to(device)
optimizer = optim.Adam(ms.parameters(), lr=0.005)

In [16]:
setup_seed()

best_val_acc_sequence = -1000
best_val_acc_entry = -1000
best_val_model = None

for epoch in range(epochs):  
    ms.train()
    epoch_loss = []  
        
    for sequence, sequence_label, _, semi in train_loader:
        sequence = sequence.to(device)
        sequence_label = sequence_label.to(device)
        semi = semi.to(device)
        
        optimizer.zero_grad()      
        
        triplet_loss, dists, context_weights, M, A = ms(sequence, sequence_label, semi, batch_size_train, hidden_size)        
        I = torch.eye(n_attention_heads*2).to(device)
        c_na = torch.cat((ms.c_n, ms.c_a), 1)        
        CCT = c_na @ c_na.transpose(1, 2)
        P = torch.mean((CCT.squeeze() - I) ** 2)
        
        loss = triplet_loss + lambda_p * P     
        loss.backward()       
        optimizer.step()       
        epoch_loss.append(loss.item())
    
    ms.eval()
    correct_sequence = 0   
    correct_entry = 0  
    
    with torch.no_grad():
        for sequence, sequence_label, key_label, semi in val_loader:
            pred_key_label_l = []
            sequence_label = sequence_label.to(device)
            hidden = ms.embedding(sequence.to(device))    
            M, A = ms.self_attention(hidden)

            n_dists = 0.5 * (1-ms.cosine_dist(M, torch.repeat_interleave(ms.c_n, batch_size_val, dim=0)))
            a_dists = 0.5 * (1-ms.cosine_dist(M, torch.repeat_interleave(ms.c_a, batch_size_val, dim=0)))
            n_scores = torch.mean(n_dists, dim=1)
            a_scores = torch.mean(a_dists, dim=1) 

            pred_label_batch = torch.where(n_scores<a_scores, 0, 1)

            _, n_best_heads = torch.min(n_dists, dim=1)
            _, a_best_heads = torch.min(a_dists, dim=1)
            best_att_heads = torch.where(pred_label_batch==0, n_best_heads, a_best_heads)

            best_head_l = best_att_heads.tolist()
            index0 = pred_label_batch == 0
            index1 = pred_label_batch == 1

            for t in range(len(sequence_label)):
                pred_key_label_l.append(A[t, best_head_l[t], :].tolist())   

            pred_key_label_t = torch.tensor(pred_key_label_l)
            pred_key_label_t[index0,:] = 0
            pred_key_label_t[index1] = torch.where(pred_key_label_t[index1]>0.01, 1.0, 0.0)

            acc_sequence = (pred_label_batch==sequence_label).sum().item()
            correct_sequence += acc_sequence
            acc_entry = (torch.reshape(pred_key_label_t, (-1,))==torch.reshape(key_label,(-1,))).sum().item()
            correct_entry += acc_entry
                
                
    if correct_sequence > best_val_acc_sequence and correct_entry > best_val_acc_entry:
        best_val_acc_sequence = correct_sequence
        best_val_acc_entry = correct_entry
        best_val_model = deepcopy(ms.state_dict())
        
    print(f'Epoch {epoch:02d}: {np.mean(epoch_loss)}')

Epoch 00: 186.6441805222455
Epoch 01: 71.01878693524529
Epoch 02: 33.20840297025793
Epoch 03: 18.307571719674502
Epoch 04: 11.262565528645235
Epoch 05: 7.5013454100664925
Epoch 06: 5.325543095083797
Epoch 07: 4.011287065113292
Epoch 08: 3.113211386344012
Epoch 09: 2.5057727168588078
Epoch 10: 2.1023060398943283
Epoch 11: 1.7979145050048828
Epoch 12: 1.550485726665048
Epoch 13: 1.4030431509017944
Epoch 14: 1.282876175992629
Epoch 15: 1.146555294008816
Epoch 16: 1.0736668583224802
Epoch 17: 0.9958354760618771
Epoch 18: 0.9316378744209514
Epoch 19: 0.8874908668153426
Epoch 20: 0.8618726221954122
Epoch 21: 0.8147013485431671
Epoch 22: 0.8063702688497656
Epoch 23: 0.7671512733487522
Epoch 24: 0.7482166377937093
Epoch 25: 0.7151724065051359
Epoch 26: 0.6848171476055595
Epoch 27: 0.6746900660150191
Epoch 28: 0.6347763626014485
Epoch 29: 0.6365587220472448
Epoch 30: 0.6247434633619645
Epoch 31: 0.6101397363578572
Epoch 32: 0.5875925255172393
Epoch 33: 0.5589508624637828
Epoch 34: 0.53894445913

**4. Validation**

In [17]:
ms.load_state_dict(best_val_model)
ms.eval()

pred_seq_label = []
true_seq_label = []

pred_key_label = []
true_key_label = []

for sequence, sequence_label, key_label, _ in tqdm(val_loader):
    pred_key_label_l = []
    true_key_label += torch.reshape(key_label, (-1,)).tolist()
    true_seq_label += sequence_label.tolist()
    
    hidden = ms.embedding(sequence.to(device))    
    M, A = ms.self_attention(hidden)
    
    n_dists = 0.5 * (1-ms.cosine_dist(M, torch.repeat_interleave(ms.c_n, batch_size_val, dim=0)))
    a_dists = 0.5 * (1-ms.cosine_dist(M, torch.repeat_interleave(ms.c_a, batch_size_val, dim=0)))
    n_scores = torch.mean(n_dists, dim=1)
    a_scores = torch.mean(a_dists, dim=1) 
    
    pred_label_batch = torch.where(n_scores<a_scores, 0, 1)
    pred_seq_label += pred_label_batch.tolist()    
    
    _, n_best_heads = torch.min(n_dists, dim=1)
    _, a_best_heads = torch.min(a_dists, dim=1)
    best_att_heads = torch.where(pred_label_batch==0, n_best_heads, a_best_heads)
    
    best_head_l = best_att_heads.tolist()
    index0 = pred_label_batch == 0
    index1 = pred_label_batch == 1
    
    for t in range(len(sequence_label)):
        pred_key_label_l.append(A[t, best_head_l[t], :].tolist())
        
    pred_key_label_t = torch.tensor(pred_key_label_l)
    pred_key_label_t[index0,:] = 0
    pred_key_label_t[index1] = torch.where(pred_key_label_t[index1]>0.01, 1.0, 0.0)
    pred_key_label += list(map(int, torch.reshape(pred_key_label_t, (-1,)).tolist())) 

100%|██████████| 11/11 [00:00<00:00, 205.13it/s]


In [18]:
print(metrics.classification_report(true_seq_label, pred_seq_label, digits=4))
print(metrics.confusion_matrix(true_seq_label, pred_seq_label))

fpr, tpr, thresholds = metrics.roc_curve(true_seq_label, pred_seq_label, pos_label=1)
print(metrics.auc(fpr, tpr))

              precision    recall  f1-score   support

           0     0.9950    1.0000    0.9975       200
           1     1.0000    0.9500    0.9744        20

    accuracy                         0.9955       220
   macro avg     0.9975    0.9750    0.9859       220
weighted avg     0.9955    0.9955    0.9954       220

[[200   0]
 [  1  19]]
0.975


In [19]:
print(metrics.classification_report(true_key_label, pred_key_label, digits=4))
print(metrics.confusion_matrix(true_key_label, pred_key_label))

fpr, tpr, thresholds = metrics.roc_curve(true_key_label, pred_key_label, pos_label=1)
print(metrics.auc(fpr, tpr))

              precision    recall  f1-score   support

           0     0.9978    1.0000    0.9989      4030
           1     1.0000    0.9757    0.9877       370

    accuracy                         0.9980      4400
   macro avg     0.9989    0.9878    0.9933      4400
weighted avg     0.9980    0.9980    0.9979      4400

[[4030    0]
 [   9  361]]
0.9878378378378379


**5. Testing**

In [20]:
ms.load_state_dict(best_val_model)
ms.eval()

pred_seq_label = []
true_seq_label = []

pred_key_label = []
true_key_label = []

top_entry = [[] for x in range(n_attention_heads)]

for sequence, sequence_label, key_label, _ in tqdm(test_loader):
    pred_key_label_l = []
    true_key_label += torch.reshape(key_label, (-1,)).tolist()
    true_seq_label += sequence_label.tolist()
    
    hidden = ms.embedding(sequence.to(device))    
    M, A = ms.self_attention(hidden)
    
    n_dists = 0.5 * (1-ms.cosine_dist(M, torch.repeat_interleave(ms.c_n, batch_size_test, dim=0)))
    a_dists = 0.5 * (1-ms.cosine_dist(M, torch.repeat_interleave(ms.c_a, batch_size_test, dim=0)))
    n_scores = torch.mean(n_dists, dim=1)
    a_scores = torch.mean(a_dists, dim=1) 
    
    pred_label_batch = torch.where(n_scores<a_scores, 0, 1)
    pred_seq_label += pred_label_batch.tolist()    
    
    _, n_best_heads = torch.min(n_dists, dim=1)
    _, a_best_heads = torch.min(a_dists, dim=1)
    best_att_heads = torch.where(pred_label_batch==0, n_best_heads, a_best_heads)
    
    best_head_l = best_att_heads.tolist()
    index0 = pred_label_batch == 0
    index1 = pred_label_batch == 1
    
    for t in range(len(sequence_label)):
        pred_key_label_l.append(A[t, best_head_l[t], :].tolist())
        
    pred_key_label_t = torch.tensor(pred_key_label_l)
    pred_key_label_t[index0,:] = 0
    pred_key_label_t[index1] = torch.where(pred_key_label_t[index1]>0.01, 1.0, 0.0)
    pred_key_label += list(map(int, torch.reshape(pred_key_label_t, (-1,)).tolist())) 

    for i in range(batch_size_test):
        top_entry[best_head_l[i]] += np.array(sequence[i])[pred_key_label_t.numpy()[i]==1].tolist()

100%|██████████| 22/22 [00:00<00:00, 23.23it/s]


In [21]:
print(metrics.classification_report(true_seq_label, pred_seq_label, digits=4))
print(metrics.confusion_matrix(true_seq_label, pred_seq_label))

fpr, tpr, thresholds = metrics.roc_curve(true_seq_label, pred_seq_label, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('Sequence anomaly detection on detected sequences:'+'\n')
f.write(str(metrics.classification_report(true_seq_label, pred_seq_label, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(true_seq_label, pred_seq_label))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')
f.close()

              precision    recall  f1-score   support

           0     0.9931    0.9981    0.9956     20000
           1     0.9805    0.9305    0.9548      2000

    accuracy                         0.9920     22000
   macro avg     0.9868    0.9643    0.9752     22000
weighted avg     0.9919    0.9920    0.9919     22000

[[19963    37]
 [  139  1861]]
0.9643249999999999


In [22]:
print(metrics.classification_report(true_key_label, pred_key_label, digits=4))
print(metrics.confusion_matrix(true_key_label, pred_key_label))

fpr, tpr, thresholds = metrics.roc_curve(true_key_label, pred_key_label, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('Entry anomaly detection on detected sequences:'+'\n')
f.write(str(metrics.classification_report(true_key_label, pred_key_label, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(true_key_label, pred_key_label))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')
f.write('-'*50 + '\n')
f.close()

              precision    recall  f1-score   support

           0     0.9986    0.9978    0.9982    403887
           1     0.9761    0.9839    0.9800     36113

    accuracy                         0.9967    440000
   macro avg     0.9873    0.9909    0.9891    440000
weighted avg     0.9967    0.9967    0.9967    440000

[[403018    869]
 [   583  35530]]
0.9908523185765655


**6. Top entries**

In [23]:
f=open('top_entry.txt', 'w+')

for i in range(n_attention_heads):
    f.write('Head ' + str(i) + ': ' + '\n' )
    f.write(str(Counter(top_entry[i]).most_common()))
    f.write('\n'*2)
    
f.close()

In [24]:
abnormal_keys = []

for i in range(test_abnormal.shape[0]):
    abnormal_keys += test_abnormal.iloc[i, 0][test_abnormal.iloc[i, 2]==1].tolist()
    
print(Counter(abnormal_keys).most_common())

[('38a7307d', 24245), ('d2c9db9b', 10249), ('150b1306', 1040), ('220716fc', 468), ('4496b375', 108), ('ce2b6cdc', 2), ('79913dac', 1)]


In [25]:
abnormal_key2index = []

for each in Counter(abnormal_keys).most_common():
    tmp = list(each)
    tmp[0] = logkey2index.get(each[0])
    abnormal_key2index.append(tmp)
    
abnormal_key2index

[[21, 24245],
 [19, 10249],
 [None, 1040],
 [None, 468],
 [None, 108],
 [40, 2],
 [None, 1]]