In [1]:
import os
import torch
import random
import torch.optim as optim
import pandas as pd
import numpy as np
import torch.nn as nn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from collections import Counter
import torch.nn.functional as F
from collections import deque
from torch.autograd import Variable
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
seed = 10

def setup_seed(seed=seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False

In [4]:
f = open('output.txt', 'a')
f.write('seed: ' + str(seed) + '\n')
f.close()

In [5]:
logdata = pd.read_csv(r'~/Python_projects/Rationale/Dataset/BGL.log_structured_v1.csv')

In [6]:
def slide_window(logdata, window_size = 20, step_size = 10):
    logdata["Label"] = logdata["Label"].apply(lambda x: int(x != '-'))
    data = logdata.loc[:, ['EventId', 'Label']]
    data['Key_label'] = data['Label']
    data.rename(columns={'Label':'Sequence_label'})
    logkey = data['EventId']
    logkey_label = data['Key_label']

    new_data = []
    idx = 0

    while idx <= data.shape[0] - window_size:
        new_data.append([
                         logkey[idx : idx+window_size].values,
                         max(logkey_label[idx : idx+window_size]),
                         logkey_label[idx : idx+window_size].values
                        ])
        idx += step_size
    return pd.DataFrame(new_data, columns = ['EventId', 'Sequence_label', 'Key_label'])

In [7]:
dataset = slide_window(logdata)
dataset

Unnamed: 0,EventId,Sequence_label,Key_label
0,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[3aa50e45, 3aa50e45, 3aa50e45, 3aa50e45, 3aa50...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
471343,"[8df7ac9e, 3aa50e45, a450c390, a450c390, cfae5...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
471344,"[a450c390, cfae5cde, a450c390, a450c390, a450c...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
471345,"[a450c390, cfae5cde, 26c05abc, 26c05abc, 26c05...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
471346,"[a450c390, a450c390, 26c05abc, 26c05abc, 30b3b...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
normal_ds = dataset[dataset['Sequence_label']==0]
abnormal_ds = dataset[dataset['Sequence_label']==1]

In [9]:
setup_seed()

train_ds, rest_ds = train_test_split(normal_ds, test_size=0.2, random_state=2021)
test_normal_ds, val_normal_ds = train_test_split(rest_ds, test_size=0.1, random_state=2021)
test_abnormal_ds, val_abnormal_ds = train_test_split(abnormal_ds, test_size=0.1, random_state=2021)

test_ds = pd.concat([test_normal_ds, test_abnormal_ds])
val_ds = pd.concat([val_normal_ds, val_abnormal_ds])

**2. Preprocessing**

In [10]:
counts = Counter()

for index, row in train_ds.iterrows():
    counts.update(row['EventId'])

logkey2index ={"":0,"UNK":1}
logkeys = ["","UNK"]

for word in counts:
    logkey2index[word] = len(logkeys)
    logkeys.append(word)

In [11]:
def encode_sequence(sequence, logkey2index):
    return np.array([logkey2index.get(logkey, logkey2index["UNK"]) for logkey in sequence])

train_ds.loc[:,'Encoded'] = train_ds.loc[:,'EventId'].apply(lambda x: encode_sequence(x,logkey2index))
test_ds.loc[:,'Encoded'] = test_ds.loc[:,'EventId'].apply(lambda x: encode_sequence(x,logkey2index))
val_ds.loc[:,'Encoded'] = val_ds.loc[:,'EventId'].apply(lambda x: encode_sequence(x,logkey2index))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [12]:
train_data = train_ds[['Encoded', 'Sequence_label', 'Key_label']]
test_data = test_ds[['Encoded', 'Sequence_label', 'Key_label']]
val_data = val_ds[['Encoded', 'Sequence_label', 'Key_label']]

In [13]:
class LogDataset(Dataset):
    def __init__(self, sequence, sequence_label, key_label):
        self.sequence = sequence
        self.sequence_label = sequence_label
        self.key_label = key_label


    def __len__(self):
        return len(self.sequence_label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return (self.sequence[idx], self.sequence_label[idx], self.key_label[idx])

In [14]:
batch_size_train = 512
batch_size_test = 4096
batch_size_val = 4096
batch_size_train_test = 1024

In [15]:
setup_seed()

def dataset_dataloader(data, batch_size):
    sequence = data['Encoded'].tolist()
    sequence_label = data['Sequence_label'].tolist()
    key_label = data['Key_label'].tolist()
    dataset = LogDataset(sequence, sequence_label, key_label)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data_loader

train_loader = dataset_dataloader(train_data, batch_size = batch_size_train)
test_loader = dataset_dataloader(test_data, batch_size = batch_size_test)
val_loader = dataset_dataloader(val_data, batch_size = batch_size_val)

**3. Model**

In [16]:
vocab_size = len(logkeys)
embedding_dim = 50
hidden_dim = 128
num_layers = 1

In [17]:
class Net(nn.Module) :
    def __init__(self, vocab_size, embedding_dim=8, hidden_dim=64, num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, 
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bias = False)

    def forward(self, x):
        h0 = torch.randn(self.num_layers, x.size(0), self.hidden_dim).cuda()
        c0 = torch.randn(self.num_layers, x.size(0), self.hidden_dim).cuda()

        embedded = self.embeddings(x)
        out, (hidden, cell) = self.lstm(embedded, (h0, c0))    
        return torch.squeeze(torch.mean(out, dim=1))

In [18]:
model = Net(vocab_size, embedding_dim, hidden_dim, num_layers).cuda()
criterion = nn.MSELoss()
optimiser = optim.Adam(model.parameters(), lr=1e-3)

In [19]:
# if not os.path.exists('DeepSVDD.bin'):
setup_seed()

epochs = 50
total_loss = []
r_candidate = []
dist_list = []

for i in range(epochs):
    epoch_loss=[]
    hidden_sum = torch.zeros((batch_size_train, hidden_dim))

    if i < 20:
        model.eval()
        with torch.no_grad():
            for sequence, sequence_label, _ in train_loader:
                if len(sequence_label) == batch_size_train:
                    sequence = sequence.cuda()
                    hidden_sum = hidden_sum.cuda()
                    hidden1 = model(sequence)
                    hidden_sum = hidden_sum + hidden1
                    sequence = sequence.cpu()


        center = (torch.mean(hidden_sum.cuda(), axis=0) / len(train_loader))
        center_batch = torch.repeat_interleave(torch.unsqueeze(center, 0), batch_size_train, dim=0).detach()


    model.train()
    for sequence2, sequence_label2, _ in train_loader:
        if len(sequence_label2) == batch_size_train:
            sequence2 = sequence2.cuda()
            optimiser.zero_grad()

            hidden2 = model(sequence2)  
            loss = criterion(hidden2, center_batch.cuda())  

            epoch_loss.append(loss.item())
#             if i == epochs-1:
#                 r_candidate.append(loss.item())

            loss.backward()
            optimiser.step()
            torch.cuda.empty_cache()

    print("Epoch ", i+1, " MSE: ", np.mean(epoch_loss))
    total_loss.append(np.max(epoch_loss))
#         if total_loss[i] < min_loss:
    if i == epochs-1:
        torch.save(model.state_dict(), './DeepSVDD.bin')
        min_loss = total_loss[i]
        r = total_loss[i]

        f = open('center_radius.txt', 'w+')
        f.write(str(center.tolist()))
        f.write('\n')
        f.write(str(r))
        f.close


Epoch  1  MSE:  0.0006750313881839166
Epoch  2  MSE:  4.486155241741581e-05
Epoch  3  MSE:  1.6476036501648532e-05
Epoch  4  MSE:  7.75737943382291e-06
Epoch  5  MSE:  4.023081985664273e-06
Epoch  6  MSE:  2.2438005613166834e-06
Epoch  7  MSE:  1.3482979122839048e-06
Epoch  8  MSE:  8.564458152095418e-07
Epoch  9  MSE:  5.748527161646501e-07
Epoch  10  MSE:  3.9784477392673136e-07
Epoch  11  MSE:  2.8490854812444174e-07
Epoch  12  MSE:  2.1012281606332157e-07
Epoch  13  MSE:  1.5816135890483623e-07
Epoch  14  MSE:  1.230816358642887e-07
Epoch  15  MSE:  9.761777269435944e-08
Epoch  16  MSE:  7.900033083115011e-08
Epoch  17  MSE:  6.56793031478839e-08
Epoch  18  MSE:  5.5237490276043595e-08
Epoch  19  MSE:  4.7822949351604334e-08
Epoch  20  MSE:  4.176315281185951e-08
Epoch  21  MSE:  3.7238662644491976e-08
Epoch  22  MSE:  3.3153502832280664e-08
Epoch  23  MSE:  3.010923215556097e-08
Epoch  24  MSE:  2.778041587029638e-08
Epoch  25  MSE:  2.47663711563033e-08
Epoch  26  MSE:  2.3573128

In [20]:
model.load_state_dict(torch.load('DeepSVDD.bin'))

f = open('center_radius.txt','r')
center_radius = f.readlines()
f.close()

center = torch.tensor(eval(center_radius[0])).cuda()
r = eval(center_radius[1])

y_pred = []
y_truth = []

model.eval()

with torch.no_grad():
    for sequence, sequence_label, _ in val_loader: 
        y_truth = y_truth + sequence_label.tolist()
        sequence = sequence.cuda()
        hidden = model(sequence)
        distance = torch.mean(torch.square(hidden-center), dim=1)
        y_pred_batch = [int(i>r) for i in distance]
        y_pred = y_pred + y_pred_batch

In [21]:
print(metrics.classification_report(y_truth, y_pred, digits=4))
print(metrics.confusion_matrix(y_truth, y_pred))

fpr, tpr, thresholds = metrics.roc_curve(y_truth, y_pred, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('Sequence anomaly detection: '+'\n')
f.write(str(metrics.classification_report(y_truth, y_pred, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(y_truth, y_pred))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')
f.close()

              precision    recall  f1-score   support

           0     0.9932    0.9935    0.9933      8617
           1     0.9862    0.9854    0.9858      4053

    accuracy                         0.9909     12670
   macro avg     0.9897    0.9895    0.9896     12670
weighted avg     0.9909    0.9909    0.9909     12670

[[8561   56]
 [  59 3994]]
0.989472050168733


In [22]:
model.load_state_dict(torch.load('DeepSVDD.bin'))

f = open('center_radius.txt','r')
center_radius = f.readlines()
f.close()

center = torch.tensor(eval(center_radius[0])).cuda()
r = eval(center_radius[1])

y_pred = []
y_truth = []
seq_list = []
distance_list = []

model.eval()

with torch.no_grad():
    for sequence, sequence_label, _ in train_loader: 
        y_truth = y_truth + sequence_label.tolist()
        seq_list += sequence.tolist()
        sequence = sequence.cuda()
        hidden = model(sequence)
        distance = torch.mean(torch.square(hidden-center), dim=1)
        distance_list += distance.tolist()
        y_pred_batch = [int(i>r) for i in distance]
        y_pred = y_pred + y_pred_batch

In [23]:
baseline_sequence = torch.tensor(seq_list[np.argmin(distance_list)]).to(device)
baseline_sequence

tensor([ 4, 14, 14,  4,  4,  4,  4, 14, 14,  4,  4,  4,  4,  4,  4,  4,  4,  4,
         4, 14], device='cuda:0')

In [24]:
model.eval()

sequence_list = []
sequence_label_list = []
key_label_list = []

sequence_list2 = []
sequence_label_list2 = []
key_label_list2 = []

sequence_list3 = []
sequence_label_list3 = []
key_label_list3 = []

with torch.no_grad():
    for sequence, sequence_label, key_label in test_loader: 
        sequence = sequence.cuda()
        
        hidden = model(sequence)
        distance = torch.mean(torch.square(hidden-center), dim=1)
        y_pred_index_batch = [i for i in range(len(distance)) if distance[i]>10*r]
        y_pred_index_batch2 = [i for i in range(len(distance)) if distance[i]>r]
        y_pred_index_batch3 = [i for i in range(len(distance)) if distance[i]<=r]
        
        sequence_l = sequence.tolist()
        sequence_label_l = sequence_label.tolist()
        key_label_l = key_label.tolist()
        
        for i in y_pred_index_batch:
            sequence_list += [sequence_l[i]]
            sequence_label_list += [sequence_label_l[i]]
            key_label_list += [key_label_l[i]]
            
        for j in y_pred_index_batch2:
            sequence_list2 += [sequence_l[j]]
            sequence_label_list2 += [sequence_label_l[j]]
            key_label_list2 += [key_label_l[j]]
            
        for k in y_pred_index_batch3:
            sequence_list3 += [sequence_l[k]]
            sequence_label_list3 += [sequence_label_l[k]]
            key_label_list3 += [key_label_l[k]]

In [25]:
def train_test_data_loader(sequence_list, sequence_label_list, key_label_list):
    d = {'Encoded': sequence_list,
         'Sequence_label': sequence_label_list,
         'Key_label': key_label_list}

    train_test_data = pd.DataFrame(d)

    train_test_data['Encoded'] = [torch.tensor(i) for i in train_test_data['Encoded']]
    train_test_data['Sequence_label'] = [torch.tensor(i) for i in train_test_data['Sequence_label']]
    train_test_data['Key_label'] = [torch.tensor(i) for i in train_test_data['Key_label']]

    train_test_loader = dataset_dataloader(train_test_data, batch_size = batch_size_train_test)
    return train_test_loader, train_test_data

train_test_loader, train_test_data   = train_test_data_loader(sequence_list, sequence_label_list, key_label_list)
train_test_loader2, train_test_data2 = train_test_data_loader(sequence_list2, sequence_label_list2, key_label_list2)
train_test_loader3, train_test_data3 = train_test_data_loader(sequence_list3, sequence_label_list3, key_label_list3)

In [26]:
embedding_dim2 = 100
hidden_dim2 = 128
num_layers2 = 1
triplet_lambda = 1
continuity_lambda = 0.05
sparsity_lambda = 0.15
epochs2 = 100

In [27]:
class Generator(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=256, num_layers=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, 
                            hidden_size=hidden_dim,
                            num_layers=num_layers, 
                            batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, 2)
        
    def forward(self, x):
        embedded = self.embeddings(x)
        out, (hidden, cell) = self.lstm(embedded)    
        scores = self.output_layer(out) 
        return scores

In [28]:
class CFDet(nn.Module):
    def __init__(self):
        super(CFDet, self).__init__()
        self.exploration_rate = 0.05
        self.count_tokens = 3
        self.count_pieces = 3
        self.generator = Generator(vocab_size, embedding_dim2, hidden_dim2, num_layers2).cuda()

    def generate(self, x, training=True):
        z_scores_ = self.generator(x)
        z_probs_ = F.softmax(z_scores_, dim=-1)
        z_prob_ = (1 - self.exploration_rate) * z_probs_ + self.exploration_rate / z_probs_.size(-1)
        z_prob__ = z_prob_.view(-1, 2)
        sampler = torch.distributions.Categorical(z_prob__)

        if training:
            z_ = sampler.sample()  # (num_rows * p_length,)
            z = z_.view(z_prob_.size(0), z_prob_.size(1))
            z = z.type(torch.cuda.IntTensor)
            neg_log_probs_ = -sampler.log_prob(z_)
            neg_log_probs = neg_log_probs_.view(z_prob_.size(0), z_prob_.size(1))
            return z, neg_log_probs
        else:
            z__index = torch.max(z_prob__, dim=-1)[1]
            z0 = z__index.view(z_prob_.size(0), z_prob_.size(1))
            z_index = z0.type(torch.cuda.IntTensor)

            z__value = torch.max(z_prob__, dim=-1)[0]
            # z1 = z__value.view(z_prob_.size(0), z_prob_.size(1))
            z_value = z__value.type(torch.cuda.FloatTensor)
            return z_index, z_value

    def get_loss(self, x, z, neg_log_probs,average_reward, batch_size, model, sequence_length=20.0):
        z_ = torch.cat([z[:, 1:], z[:, -1:]], dim=-1)
        continuity_ratio = torch.div(torch.sum(torch.abs(z - z_), dim=-1), sequence_length)
        percentage = (self.count_pieces-1) / sequence_length
        continuity_loss = torch.abs(continuity_ratio - percentage)
#         continuity_loss = torch.clamp(continuity_ratio - percentage, min=0)


        sparsity_ratio = torch.div(torch.sum(z, dim=-1), sequence_length)
        percentage = self.count_tokens / sequence_length
        sparsity_loss = torch.abs(sparsity_ratio - percentage)
#         sparsity_loss = torch.clamp(sparsity_ratio - percentage, min=0)
        
        anomalous_entry = x * z + baseline_sequence * (1-z)
        anti = x * (1-z) + baseline_sequence * z
        hidden_anomalous_entry = model(anomalous_entry)
        hidden_anti = model(anti)
        distance_loss = criterion2(center_batch2, hidden_anti, hidden_anomalous_entry) + criterion(center_batch2, hidden_anti) \
                        - criterion(center_batch2, hidden_anomalous_entry)

        average_reward = average_reward.cuda()
        rewards = -(triplet_lambda * distance_loss + sparsity_lambda * sparsity_loss + continuity_lambda * continuity_loss ).detach()
        advantages = rewards - average_reward # (batch_size,)

        advantages_expand_ = advantages.unsqueeze(-1).expand_as(neg_log_probs)       
        rl_loss = torch.sum(neg_log_probs * advantages_expand_)
        
        return distance_loss, rl_loss, rewards, continuity_loss, sparsity_loss, advantages_expand_

    def training_step(self, distance_loss, rl_loss):
        rl_loss.backward()
        optimiser2.step()

In [29]:
cfdet = CFDet()
criterion2 = nn.TripletMarginLoss(margin=1, reduction='none')
optimiser2 = optim.Adam(cfdet.generator.parameters(), lr=1e-3)

In [30]:
# if not os.path.exists('state_dict_minloss.bin'):

setup_seed()

total_loss_list = []
distance_loss_list = []
reward_list = []
continuity_loss_list = []
sparsity_loss_list = []
loss_list = []

min_loss= 10e6

cfdet.generator.train()
model.train()

center_batch2 = torch.repeat_interleave(torch.unsqueeze(center, 0), batch_size_train_test, dim=0).cuda()

for i in range(epochs2):
    z_history_rewards = deque(maxlen=200)
    z_history_rewards.append(0.0)
    epoch_distance_loss = []
    epoch_continuity_loss = []
    epoch_sparsity_loss = []
    epoch_rl_loss = []
    epoch_reward = []
    epoch_loss= []

    for param in model.parameters():
        param.requires_grad = False 

    for sequence4, sequence_label4, _ in train_test_loader:
        sequence4 = sequence4.cuda()

        baseline = Variable(torch.FloatTensor([float(np.mean(z_history_rewards))]))

        if len(sequence_label4) == batch_size_train_test:             
            optimiser2.zero_grad()

            z, neg_log_probs = cfdet.generate(sequence4)
            distance_loss, rl_loss, rewards, continuity_loss, sparsity_loss, advantage = cfdet.get_loss(sequence4, z, neg_log_probs, baseline, batch_size_train_test, model)
            cfdet.training_step(distance_loss, rl_loss)

            epoch_distance_loss.append(torch.mean(distance_loss).item())
            epoch_continuity_loss.append(torch.mean(continuity_loss).item())
            epoch_sparsity_loss.append(torch.mean(sparsity_loss).item())
            epoch_rl_loss.append(rl_loss.item())
            epoch_reward.append(torch.sum(rewards).item())
            epoch_loss.append(torch.sum(-rewards).item())

            z_batch_reward = np.mean(rewards.cpu().data.numpy())
            z_history_rewards.append(z_batch_reward)

    total_loss_list.append(np.mean(epoch_rl_loss))
    continuity_loss_list.append(np.mean(epoch_continuity_loss))
    sparsity_loss_list.append(np.mean(epoch_sparsity_loss))
    distance_loss_list.append(np.mean(epoch_distance_loss))
    reward_list.append(np.mean(epoch_reward))
    loss_list.append(np.mean(epoch_loss))

    if distance_loss_list[i] + continuity_lambda * continuity_loss_list[i] + sparsity_lambda * sparsity_loss_list[i] < min_loss:
        min_loss = distance_loss_list[i] + continuity_lambda * continuity_loss_list[i] + sparsity_lambda * sparsity_loss_list[i]
        torch.save(cfdet.generator.state_dict(), './state_dict_minloss.bin')
    if i == epochs2-1:
        torch.save(cfdet.generator.state_dict(), './state_dict_final.bin')

    print(f'epoch{i+1}:')
    print('distance_loss:', distance_loss_list[i], 'continuity loss: ', continuity_loss_list[i], 'sparsity loss: ', sparsity_loss_list[i])
    print('------------------------------------------------------')

epoch1:
distance_loss: 0.31647391617298126 continuity loss:  0.13837333066122873 sparsity loss:  0.70412945662226
------------------------------------------------------
epoch2:
distance_loss: 0.19097995374883925 continuity loss:  0.08020089460270745 sparsity loss:  0.7961872049740383
------------------------------------------------------
epoch3:
distance_loss: 0.18916069992950985 continuity loss:  0.08035435421126229 sparsity loss:  0.7887472067560468
------------------------------------------------------
epoch4:
distance_loss: 0.18876558542251587 continuity loss:  0.07988002406699317 sparsity loss:  0.78573659658432
------------------------------------------------------
epoch5:
distance_loss: 0.18856621895517622 continuity loss:  0.07832310327461788 sparsity loss:  0.7880064146859306
------------------------------------------------------
epoch6:
distance_loss: 0.18816181719303132 continuity loss:  0.07716239073446819 sparsity loss:  0.7856780001095363
---------------------------------

epoch49:
distance_loss: 0.19002125007765633 continuity loss:  0.12858119819845473 sparsity loss:  0.7001241632870265
------------------------------------------------------
epoch50:
distance_loss: 0.18750711168561662 continuity loss:  0.12968471241848809 sparsity loss:  0.7038936887468611
------------------------------------------------------
epoch51:
distance_loss: 0.18779342515128 continuity loss:  0.12920061626604626 sparsity loss:  0.7037374360220773
------------------------------------------------------
epoch52:
distance_loss: 0.18723048397472927 continuity loss:  0.13010742494038174 sparsity loss:  0.704335926260267
------------------------------------------------------
epoch53:
distance_loss: 0.1879486037152154 continuity loss:  0.1311481575880732 sparsity loss:  0.7041936312402998
------------------------------------------------------
epoch54:
distance_loss: 0.18790400453976222 continuity loss:  0.13166155474526542 sparsity loss:  0.7042327097484044
-----------------------------

epoch97:
distance_loss: 0.19196880757808685 continuity loss:  0.09847377389669418 sparsity loss:  0.6941406181880406
------------------------------------------------------
epoch98:
distance_loss: 0.1924050177846636 continuity loss:  0.09925223333495004 sparsity loss:  0.6945256693022591
------------------------------------------------------
epoch99:
distance_loss: 0.19230988281113762 continuity loss:  0.09938755674021585 sparsity loss:  0.6941447990281241
------------------------------------------------------
epoch100:
distance_loss: 0.19183919600078037 continuity loss:  0.09984793769461768 sparsity loss:  0.6947795748710632
------------------------------------------------------


In [31]:
cfdet.generator.load_state_dict(torch.load('state_dict_minloss.bin')) 

y_key_pred2 = []
y_key_truth2 = []

cfdet.generator.eval()
model.eval()

with torch.no_grad():
    for sequence, sequence_label, key_label in val_loader: 
        key_label_list = key_label.tolist()

        for j in range(len(sequence_label)):
            y_key_truth2 = y_key_truth2 + key_label_list[j]

        sequence = sequence.cuda()
        z_out, _ = cfdet.generate(sequence, training=False)
        z_list = z_out.data.tolist()

        for k in range(len(sequence_label)):
            y_key_pred2 = y_key_pred2 + z_list[k]

In [32]:
print(metrics.classification_report(y_key_truth2, y_key_pred2, digits=4))
print(metrics.confusion_matrix(y_key_truth2, y_key_pred2))

              precision    recall  f1-score   support

           0     0.9952    0.9644    0.9796    183864
           1     0.9131    0.9877    0.9489     69536

    accuracy                         0.9708    253400
   macro avg     0.9541    0.9760    0.9642    253400
weighted avg     0.9727    0.9708    0.9712    253400

[[177325   6539]
 [   858  68678]]


In [33]:
cfdet.generator.load_state_dict(torch.load('state_dict_minloss.bin')) 

y_key_pred = []
y_key_truth = []

cfdet.generator.eval()
model.eval()

with torch.no_grad():
    for sequence, sequence_label, key_label in train_test_loader2:            
        key_label_list = torch.reshape(key_label, (-1,)).tolist()
        y_key_truth = y_key_truth + key_label_list

        sequence = sequence.cuda()
        z_out, _ = cfdet.generate(sequence, training=False)
        z_list = torch.reshape(z_out, (-1,)).tolist()

        y_key_pred = y_key_pred + z_list

In [34]:
print(metrics.classification_report(y_key_truth, y_key_pred, digits=4))
print(metrics.confusion_matrix(y_key_truth, y_key_pred))

fpr, tpr, thresholds = metrics.roc_curve(y_key_truth, y_key_pred, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('Entry anomaly detection on detected sequences:'+'\n')
f.write(str(metrics.classification_report(y_key_truth, y_key_pred, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(y_key_truth, y_key_pred))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')
f.close()

              precision    recall  f1-score   support

           0     0.9747    0.9809    0.9778    105791
           1     0.9968    0.9957    0.9962    622909

    accuracy                         0.9935    728700
   macro avg     0.9857    0.9883    0.9870    728700
weighted avg     0.9935    0.9935    0.9935    728700

[[103773   2018]
 [  2698 620211]]
0.9882966812797547


In [35]:
y_key_pred2 = []
y_key_truth2 = []

for sequence, sequence_label, key_label in tqdm(train_test_loader3):   
    key_label_list = torch.reshape(key_label, (-1,)).tolist()
    y_key_truth2 = y_key_truth2 + key_label_list   
    y_key_pred2 = y_key_pred2 + [0]*len(key_label_list)
        
y_key_truth_all = y_key_truth + y_key_truth2
y_key_pred_all = y_key_pred + y_key_pred2

100%|██████████| 76/76 [00:01<00:00, 70.10it/s] 


In [36]:
print(metrics.classification_report(y_key_truth_all, y_key_pred_all, digits=4))
print(metrics.confusion_matrix(y_key_truth_all, y_key_pred_all))

fpr, tpr, thresholds = metrics.roc_curve(y_key_truth_all, y_key_pred_all, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('Entry anomaly detection on unlabeled dataset:'+'\n')
f.write(str(metrics.classification_report(y_key_truth_all, y_key_pred_all, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(y_key_truth_all, y_key_pred_all))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')
f.write('-'*50+'\n')
f.close()

              precision    recall  f1-score   support

           0     0.9957    0.9988    0.9972   1652987
           1     0.9968    0.9886    0.9927    627373

    accuracy                         0.9960   2280360
   macro avg     0.9962    0.9937    0.9949   2280360
weighted avg     0.9960    0.9960    0.9960   2280360

[[1650969    2018]
 [   7162  620211]]
0.9936816617373385
