In [1]:
import os
import torch
import random
import torch.optim as optim
import pandas as pd
import numpy as np
import torch.nn as nn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from collections import Counter
import torch.nn.functional as F
from collections import deque
from torch.autograd import Variable
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
seed = 10

def setup_seed(seed=seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False

In [4]:
f = open('output.txt', 'a')
f.write('seed: ' + str(seed) + '\n')
f.close()

In [5]:
raw_ds_abnormal = pd.read_csv(r'~/Python_projects/Rationale/Dataset/abnormal_ds2.csv')
raw_ds_normal = pd.read_csv(r'~/Python_projects/Rationale/Dataset/normal_ds2.csv')

raw_ds_abnormal['Sequence'] = raw_ds_abnormal['Sequence'].apply(lambda x: x[1:-1].split())
raw_ds_abnormal['Key_label'] = raw_ds_abnormal['Key_label'].apply(lambda x: x[1:-1].split())
raw_ds_abnormal['Key_label'] = raw_ds_abnormal['Key_label'].apply(lambda x: np.array(list(map(int, x))))
raw_ds_abnormal['URL'] = raw_ds_abnormal['URL'].apply(lambda x: x[1:-1].split())

raw_ds_normal['Sequence'] = raw_ds_normal['Sequence'].apply(lambda x: x[1:-1].split())
raw_ds_normal['Key_label'] = raw_ds_normal['Key_label'].apply(lambda x: x[1:-1].split())
raw_ds_normal['Key_label'] = raw_ds_normal['Key_label'].apply(lambda x: np.array(list(map(int, x))))
raw_ds_normal['URL'] = raw_ds_normal['URL'].apply(lambda x: x[1:-1].split())

In [6]:
test_normal_ds = raw_ds_abnormal[raw_ds_abnormal['Sequence_label']==0]
test_abnormal_ds = raw_ds_abnormal[raw_ds_abnormal['Sequence_label']==1]

In [7]:
setup_seed()

train_ds = raw_ds_normal
test_normal_ds, val_normal_ds = train_test_split(test_normal_ds, test_size=0.1, random_state=2021)
test_abnormal_ds, val_abnormal_ds = train_test_split(test_abnormal_ds, test_size=0.1, random_state=2021)

test_ds = pd.concat([test_normal_ds, test_abnormal_ds])
val_ds = pd.concat([val_normal_ds, val_abnormal_ds])

In [8]:
ds = pd.concat([train_ds, test_ds, val_ds])

In [9]:
ds

Unnamed: 0,Sequence,Sequence_label,Key_label,URL
0,"['Logon', 'Http_normal', 'Http_normal', 'Http_...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[nan, 'megaupload.com', 'megaclick.com', 'sfga..."
1,"['Http_normal', 'Http_normal', 'Http_normal', ...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['megaupload.com', 'megaclick.com', 'sfgate.co..."
2,"['Http_normal', 'Http_normal', 'Http_normal', ...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['megaclick.com', 'sfgate.com', 'sfgate.com', ..."
3,"['Http_normal', 'Http_normal', 'Email', 'Email...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['sfgate.com', 'sfgate.com', nan, nan, nan, 'n..."
4,"['Http_normal', 'Email', 'Email', 'Email', 'Ht...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['sfgate.com', nan, nan, nan, 'ning.com', nan,..."
...,...,...,...,...
688800,"['Http_abnormal', 'Http_normal', 'Http_normal'...",1,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['simplyhired.com', 'etsy.com', 'homedepot.com..."
176469,"['Http_normal', 'Http_normal', 'Http_normal', ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['pcworld.com', 'pcworld.com', 'pcworld.com', ..."
690699,"['Email', 'Http_normal', 'Http_normal', 'Email...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[nan, 'superpages.com', 'vistaprint.com', nan,..."
379270,"['Http_normal', 'Http_normal', 'Http_normal', ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","['thechive.com', 'tribalfusion.com', 'istockph..."


**2. Preprocessing**

In [10]:
dict_activity = {'': 0, 
                 "UNK":1, 
                 'Logon': 2, 
                 'Logoff': 3, 
                 'Connect': 4, 
                 'Disconnect': 5, 
                 'Email': 6, 
                 'Http_normal': 7, 
                 'Http_abnormal': 8, 
                 'File': 9}

# dict_activity = {'': 0, 
#                  "UNK":1, 
#                  'Logon': 2, 
#                  'Logoff': 3, 
#                  'Connect_normal': 4, 
#                  'Connect_abnormal': 5, 
#                  'Disconnect_normal': 6, 
#                  'Disconnect_abnormal': 7,
#                  'Email_normal': 8, 
#                  'Email_abnormal': 9,
#                  'Http_normal': 10, 
#                  'Http_abnormal': 11, 
#                  'File': 12}

In [11]:
counts = Counter()

for index, row in ds.iterrows():
    counts.update(row['URL'])

url2index = {"":0,"UNK":1}
urls = ["","UNK"]

for url in counts:
    url2index[url] = len(urls)
    urls.append(url)

In [12]:
def encode_sequence(sequence, dict_activity):
    return np.array([dict_activity.get(logkey[1:-1], dict_activity["UNK"]) for logkey in sequence])

def encode_sequence2(sequence, dict_activity):
    return np.array([dict_activity.get(logkey, dict_activity["UNK"]) for logkey in sequence])

train_ds.loc[:,'Encoded'] = train_ds.loc[:,'Sequence'].apply(lambda x: encode_sequence(x,dict_activity))
val_ds.loc[:,'Encoded'] = val_ds.loc[:,'Sequence'].apply(lambda x: encode_sequence(x,dict_activity))
test_ds.loc[:,'Encoded'] = test_ds.loc[:,'Sequence'].apply(lambda x: encode_sequence(x,dict_activity))

train_ds.loc[:,'URL'] = train_ds.loc[:,'URL'].apply(lambda x: encode_sequence2(x,url2index))
val_ds.loc[:,'URL'] = val_ds.loc[:,'URL'].apply(lambda x: encode_sequence2(x,url2index))
test_ds.loc[:,'URL'] = test_ds.loc[:,'URL'].apply(lambda x: encode_sequence2(x,url2index))

In [13]:
train_data = train_ds[['Encoded', 'Sequence_label', 'Key_label', 'URL']]
test_data = test_ds[['Encoded', 'Sequence_label', 'Key_label', 'URL']]
val_data = val_ds[['Encoded', 'Sequence_label', 'Key_label', 'URL']]

In [14]:
print(train_data.shape)
print(test_data.shape)
print(val_data.shape)
print('-------------')
print(test_data[test_data['Sequence_label']==1].shape)
print(test_data[test_data['Sequence_label']==0].shape)
print('-------------')
print(val_data[val_data['Sequence_label']==1].shape)
print(val_data[val_data['Sequence_label']==0].shape)

(1391559, 4)
(624593, 4)
(69400, 4)
-------------
(52033, 4)
(572560, 4)
-------------
(5782, 4)
(63618, 4)


In [15]:
class LogDataset(Dataset):
    def __init__(self, sequence, sequence_label, key_label, url):
        self.sequence = sequence
        self.sequence_label = sequence_label
        self.key_label = key_label
        self.url = url

    def __len__(self):
        return len(self.sequence_label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return (self.sequence[idx], self.sequence_label[idx], self.key_label[idx], self.url[idx])

In [16]:
batch_size_train = 512
batch_size_test = 4096
batch_size_val = 4096
batch_size_train_test = 1024

In [17]:
setup_seed()

def dataset_dataloader(data, batch_size):
    sequence = data['Encoded'].tolist()
    sequence_label = data['Sequence_label'].tolist()
    key_label = data['Key_label'].tolist()
    url = data['URL'].tolist()
    dataset = LogDataset(sequence, sequence_label, key_label, url)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data_loader

train_loader = dataset_dataloader(train_data, batch_size = batch_size_train)
test_loader = dataset_dataloader(test_data, batch_size = batch_size_test)
val_loader = dataset_dataloader(val_data, batch_size = batch_size_val)

**3. Model**

In [18]:
vocab_size = len(dict_activity)
embedding_dim = 50
hidden_dim = 128
num_layers = 1

In [19]:
class Net(nn.Module) :
    def __init__(self, vocab_size, embedding_dim=8, hidden_dim=64, num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, 
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bias = False)

    def forward(self, x):
        h0 = torch.randn(self.num_layers, x.size(0), self.hidden_dim).cuda()
        c0 = torch.randn(self.num_layers, x.size(0), self.hidden_dim).cuda()

        embedded = self.embeddings(x)
        out, (hidden, cell) = self.lstm(embedded, (h0, c0))    
        return torch.squeeze(torch.mean(out, dim=1))

In [20]:
model = Net(vocab_size, embedding_dim, hidden_dim, num_layers).cuda()
criterion = nn.MSELoss()
optimiser = optim.Adam(model.parameters(), lr=1e-3)

In [21]:
# if not os.path.exists('DeepSVDD.bin'):
setup_seed()

epochs = 50
total_loss = []
r_candidate = []
min_loss = 10e6

for i in range(epochs):
    epoch_loss=[]
    hidden_sum = torch.zeros((batch_size_train, hidden_dim))

    if i < 20:
        model.eval()
        with torch.no_grad():
            for sequence, sequence_label, _, _ in train_loader:
                if len(sequence_label) == batch_size_train:
                    sequence = sequence.cuda()
                    hidden_sum = hidden_sum.cuda()
                    hidden1 = model(sequence)
                    hidden_sum = hidden_sum + hidden1
                    sequence = sequence.cpu()


        center = (torch.mean(hidden_sum.cuda(), axis=0) / len(train_loader))
        center_batch = torch.repeat_interleave(torch.unsqueeze(center, 0), batch_size_train, dim=0).detach()


    model.train()
    for sequence2, sequence_label2, _, _ in train_loader:
        if len(sequence_label2) == batch_size_train:
            sequence2 = sequence2.cuda()
            optimiser.zero_grad()

            hidden2 = model(sequence2)  
            loss = criterion(hidden2, center_batch.cuda())  

            epoch_loss.append(loss.item())
#             if i == epochs-1:
#                 r_candidate.append(loss.item())

            loss.backward()
            optimiser.step()
            torch.cuda.empty_cache()

    print("Epoch ", i+1, " MSE: ", np.mean(epoch_loss))
    total_loss.append(np.max(epoch_loss))
#         if total_loss[i] < min_loss:
    if i==epochs-1:
        torch.save(model.state_dict(), './DeepSVDD.bin')
        min_loss = total_loss[i]
        r = total_loss[i]

        f = open('center_radius.txt', 'w+')
        f.write(str(center.tolist()))
        f.write('\n')
        f.write(str(r))
        f.close


Epoch  1  MSE:  1.7111298796508443e-05
Epoch  2  MSE:  1.4661513270013755e-07
Epoch  3  MSE:  4.480076246219573e-08
Epoch  4  MSE:  2.770248221289201e-08
Epoch  5  MSE:  2.283668200128006e-08
Epoch  6  MSE:  2.0344409000945383e-08
Epoch  7  MSE:  1.992093370244203e-08
Epoch  8  MSE:  1.76618581791396e-08
Epoch  9  MSE:  1.796643341972515e-08
Epoch  10  MSE:  1.724678771939636e-08
Epoch  11  MSE:  1.668347685675328e-08
Epoch  12  MSE:  1.6511824682211347e-08
Epoch  13  MSE:  1.6129921775088583e-08
Epoch  14  MSE:  1.6085868374362457e-08
Epoch  15  MSE:  1.5602020245519646e-08
Epoch  16  MSE:  1.5707807291513852e-08
Epoch  17  MSE:  1.5700216238323977e-08
Epoch  18  MSE:  1.455255947961226e-08
Epoch  19  MSE:  1.6130051236001192e-08
Epoch  20  MSE:  1.455249476819769e-08
Epoch  21  MSE:  1.4607052468987215e-08
Epoch  22  MSE:  1.4651550759166391e-08
Epoch  23  MSE:  1.3830988360027054e-08
Epoch  24  MSE:  1.4215134514184804e-08
Epoch  25  MSE:  1.3916391651936955e-08
Epoch  26  MSE:  1.4

In [22]:
model.load_state_dict(torch.load('DeepSVDD.bin'))

f = open('center_radius.txt','r')
center_radius = f.readlines()
f.close()

center = torch.tensor(eval(center_radius[0])).cuda()
r = eval(center_radius[1])

y_pred = []
y_truth = []
distance_list = []

model.eval()

with torch.no_grad():
    for sequence, sequence_label, _, _ in val_loader: 
        y_truth = y_truth + sequence_label.tolist()

        sequence = sequence.cuda()
        hidden = model(sequence)
        distance = torch.mean(torch.square(hidden-center), dim=1)
        distance_list.extend(distance.tolist())
        y_pred_batch = [int(i>r) for i in distance]
        y_pred = y_pred + y_pred_batch

In [23]:
print(metrics.classification_report(y_truth, y_pred, digits=4))
print(metrics.confusion_matrix(y_truth, y_pred))

fpr, tpr, thresholds = metrics.roc_curve(y_truth, y_pred, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('Sequence anomaly detection: '+'\n')
f.write(str(metrics.classification_report(y_truth, y_pred, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(y_truth, y_pred))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')
f.close()

              precision    recall  f1-score   support

           0     0.9681    1.0000    0.9838     63618
           1     1.0000    0.6377    0.7788      5782

    accuracy                         0.9698     69400
   macro avg     0.9841    0.8188    0.8813     69400
weighted avg     0.9708    0.9698    0.9667     69400

[[63618     0]
 [ 2095  3687]]
0.8188343133863715


In [24]:
model.load_state_dict(torch.load('DeepSVDD.bin'))

f = open('center_radius.txt','r')
center_radius = f.readlines()
f.close()

center = torch.tensor(eval(center_radius[0])).cuda()
r = eval(center_radius[1])

y_pred = []
y_truth = []
seq_list = []
distance_list = []

model.eval()

with torch.no_grad():
    for sequence, sequence_label, _, _ in train_loader: 
        y_truth = y_truth + sequence_label.tolist()
        seq_list += sequence.tolist()
        sequence = sequence.cuda()
        hidden = model(sequence)
        distance = torch.mean(torch.square(hidden-center), dim=1)
        distance_list += distance.tolist()
        y_pred_batch = [int(i>r) for i in distance]
        y_pred = y_pred + y_pred_batch

In [25]:
baseline_sequence = torch.tensor(seq_list[np.argmin(distance_list)]).to(device)
baseline_sequence

tensor([7, 7, 4, 9, 9, 5, 7, 4, 7, 9, 9, 9, 9, 9, 9, 7, 9, 9, 9, 9],
       device='cuda:0')

In [26]:
model.eval()
sequence_list = []
sequence_label_list = []
key_label_list = []
url_list = []

sequence_list2 = []
sequence_label_list2 = []
key_label_list2 = []
url_list2 = []

sequence_list3 = []
sequence_label_list3 = []
key_label_list3 = []
url_list3 = []

with torch.no_grad():
    for sequence, sequence_label, key_label, url in test_loader: 
        sequence = sequence.cuda()
        
        hidden = model(sequence)
        distance = torch.mean(torch.square(hidden-center), dim=1)
        y_pred_index_batch = [i for i in range(len(distance)) if distance[i]>10*r]
        y_pred_index_batch2 = [i for i in range(len(distance)) if distance[i]>r]
        y_pred_index_batch3 = [i for i in range(len(distance)) if distance[i]<=r]
        
        sequence_l = sequence.tolist()
        sequence_label_l = sequence_label.tolist()
        key_label_l = key_label.tolist()
        url_l = url.tolist()
        
        for i in y_pred_index_batch:
            sequence_list += [sequence_l[i]]
            sequence_label_list += [sequence_label_l[i]]
            key_label_list += [key_label_l[i]]
            url_list += [url_l[i]]
            
        for j in y_pred_index_batch2:
            sequence_list2 += [sequence_l[j]]
            sequence_label_list2 += [sequence_label_l[j]]
            key_label_list2 += [key_label_l[j]]
            url_list2 += [url_l[j]]
            
        for k in y_pred_index_batch3:
            sequence_list3 += [sequence_l[k]]
            sequence_label_list3 += [sequence_label_l[k]]
            key_label_list3 += [key_label_l[k]]
            url_list3 += [url_l[k]]

In [27]:
def train_test_data_loader(sequence_list, sequence_label_list, key_label_list, url_list):
    d = {'Encoded': sequence_list,
         'Sequence_label': sequence_label_list,
         'Key_label': key_label_list,
         'URL': url_list}

    train_test_data = pd.DataFrame(d)

    train_test_data['Encoded'] = [torch.tensor(i) for i in train_test_data['Encoded']]
    train_test_data['Sequence_label'] = [torch.tensor(i) for i in train_test_data['Sequence_label']]
    train_test_data['Key_label'] = [torch.tensor(i) for i in train_test_data['Key_label']]
    train_test_data['URL'] = [torch.tensor(i) for i in train_test_data['URL']]

    train_test_loader = dataset_dataloader(train_test_data, batch_size = batch_size_train_test)
    return train_test_loader, train_test_data

train_test_loader, train_test_data   = train_test_data_loader(sequence_list, sequence_label_list, key_label_list, url_list)
train_test_loader2, train_test_data2 = train_test_data_loader(sequence_list2, sequence_label_list2, key_label_list2, url_list2)
train_test_loader3, train_test_data3 = train_test_data_loader(sequence_list3, sequence_label_list3, key_label_list3, url_list3)

In [28]:
train_test_data3[train_test_data3['Sequence_label']==1]

Unnamed: 0,Encoded,Sequence_label,Key_label,URL
7,"[tensor(7), tensor(7), tensor(7), tensor(7), t...",tensor(1),"[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(24), tensor(53), tensor(49), tensor(19..."
16,"[tensor(7), tensor(7), tensor(7), tensor(7), t...",tensor(1),"[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(53), tensor(69), tensor(295), tensor(8..."
40,"[tensor(7), tensor(7), tensor(6), tensor(6), t...",tensor(1),"[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(36), tensor(296), tensor(2), tensor(2)..."
41,"[tensor(7), tensor(7), tensor(7), tensor(7), t...",tensor(1),"[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(64), tensor(39), tensor(64), tensor(39..."
49,"[tensor(6), tensor(6), tensor(7), tensor(7), t...",tensor(1),"[tensor(0), tensor(0), tensor(0), tensor(0), t...","[tensor(2), tensor(2), tensor(287), tensor(391..."
...,...,...,...,...
590925,"[tensor(5), tensor(7), tensor(4), tensor(5), t...",tensor(1),"[tensor(1), tensor(0), tensor(1), tensor(1), t...","[tensor(2), tensor(63), tensor(2), tensor(2), ..."
590987,"[tensor(7), tensor(5), tensor(7), tensor(4), t...",tensor(1),"[tensor(0), tensor(1), tensor(0), tensor(1), t...","[tensor(369), tensor(2), tensor(22), tensor(2)..."
591002,"[tensor(9), tensor(9), tensor(5), tensor(4), t...",tensor(1),"[tensor(0), tensor(0), tensor(1), tensor(1), t...","[tensor(2), tensor(2), tensor(2), tensor(2), t..."
591012,"[tensor(4), tensor(7), tensor(7), tensor(7), t...",tensor(1),"[tensor(1), tensor(0), tensor(0), tensor(0), t...","[tensor(2), tensor(32), tensor(218), tensor(19..."


In [29]:
embedding_dim2 = 100
hidden_dim2 = 128
num_layers2 = 1
triplet_lambda = 1
continuity_lambda = 0.15
sparsity_lambda = 0.1
epochs2 = 100

In [30]:
class Generator(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=256, num_layers=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, 
                            hidden_size=hidden_dim,
                            num_layers=num_layers, 
                            batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, 2)
        
    def forward(self, x):
        embedded = self.embeddings(x)
        out, (hidden, cell) = self.lstm(embedded)    
        scores = self.output_layer(out) 
        return scores

In [31]:
class CFDet(nn.Module):
    def __init__(self):
        super(CFDet, self).__init__()
        self.exploration_rate = 0.05
        self.count_tokens = 2
        self.count_pieces = 2
        self.generator = Generator(vocab_size, embedding_dim2, hidden_dim2, num_layers2).cuda()

    def generate(self, x, training=True):
        z_scores_ = self.generator(x)
        z_probs_ = F.softmax(z_scores_, dim=-1)
        z_prob_ = (1 - self.exploration_rate) * z_probs_ + self.exploration_rate / z_probs_.size(-1)
        z_prob__ = z_prob_.view(-1, 2)
        sampler = torch.distributions.Categorical(z_prob__)

        if training:
            z_ = sampler.sample()  # (num_rows * p_length,)
            z = z_.view(z_prob_.size(0), z_prob_.size(1))
            z = z.type(torch.cuda.IntTensor)
            neg_log_probs_ = -sampler.log_prob(z_)
            neg_log_probs = neg_log_probs_.view(z_prob_.size(0), z_prob_.size(1))
            return z, neg_log_probs
        else:
            z__index = torch.max(z_prob__, dim=-1)[1]
            z0 = z__index.view(z_prob_.size(0), z_prob_.size(1))
            z_index = z0.type(torch.cuda.IntTensor)

            z__value = torch.max(z_prob__, dim=-1)[0]
            # z1 = z__value.view(z_prob_.size(0), z_prob_.size(1))
            z_value = z__value.type(torch.cuda.FloatTensor)
            return z_index, z_value

    def get_loss(self, x, z, neg_log_probs,average_reward, batch_size, model, sequence_length=20.0):
        z_ = torch.cat([z[:, 1:], z[:, -1:]], dim=-1)
        continuity_ratio = torch.div(torch.sum(torch.abs(z - z_), dim=-1), sequence_length)
        percentage = (self.count_pieces-1) / sequence_length
        continuity_loss = torch.abs(continuity_ratio - percentage)
#         continuity_loss = torch.clamp(continuity_ratio - percentage, min=0)


        sparsity_ratio = torch.div(torch.sum(z, dim=-1), sequence_length)
        percentage = self.count_tokens / sequence_length
        sparsity_loss = torch.abs(sparsity_ratio - percentage)
#         sparsity_loss = torch.clamp(sparsity_ratio - percentage, min=0)

        anomalous_entry = x * z + baseline_sequence * (1-z)
        anti = x * (1-z) + baseline_sequence * z
        hidden_anomalous_entry = model(anomalous_entry)
        hidden_anti = model(anti)
        distance_loss = criterion2(center_batch2, hidden_anti, hidden_anomalous_entry) + criterion(center_batch2, hidden_anti) \
                        - criterion(center_batch2, hidden_anomalous_entry)

        average_reward = average_reward.cuda()
        rewards = -(triplet_lambda * distance_loss + sparsity_lambda * sparsity_loss + continuity_lambda * continuity_loss ).detach()
        advantages = rewards - average_reward # (batch_size,)

        advantages_expand_ = advantages.unsqueeze(-1).expand_as(neg_log_probs)       
        rl_loss = torch.sum(neg_log_probs * advantages_expand_)
        
        return distance_loss, rl_loss, rewards, continuity_loss, sparsity_loss, advantages_expand_

    def training_step(self, distance_loss, rl_loss):
        rl_loss.backward()
        optimiser2.step()

In [32]:
cfdet = CFDet()
criterion2 = nn.TripletMarginLoss(margin=1, reduction='none')
optimiser2 = optim.Adam(cfdet.generator.parameters(), lr=1e-3)

In [33]:
# if not os.path.exists('state_dict_minloss.bin'):

setup_seed()

total_loss_list = []
distance_loss_list = []
reward_list = []
continuity_loss_list = []
sparsity_loss_list = []
loss_list = []

min_loss= 10e6

center_batch2 = torch.repeat_interleave(torch.unsqueeze(center, 0), batch_size_train_test, dim=0).cuda()

for i in range(epochs2):
    z_history_rewards = deque(maxlen=200)
    z_history_rewards.append(0.0)
    epoch_distance_loss = []
    epoch_continuity_loss = []
    epoch_sparsity_loss = []
    epoch_rl_loss = []
    epoch_reward = []
    epoch_loss= []

    cfdet.generator.train()
    model.train()

    for param in model.parameters():
        param.requires_grad = False 

    for sequence4, sequence_label4, _, _ in train_test_loader:
        sequence4 = sequence4.cuda()

        baseline = Variable(torch.FloatTensor([float(np.mean(z_history_rewards))]))

        if len(sequence_label4) == batch_size_train_test:             
            optimiser2.zero_grad()

            z, neg_log_probs = cfdet.generate(sequence4)
            distance_loss, rl_loss, rewards, continuity_loss, sparsity_loss, advantage = cfdet.get_loss(sequence4, z, neg_log_probs, baseline, batch_size_train_test, model)
            cfdet.training_step(distance_loss, rl_loss)

            epoch_distance_loss.append(torch.mean(distance_loss).item())
            epoch_continuity_loss.append(torch.mean(continuity_loss).item())
            epoch_sparsity_loss.append(torch.mean(sparsity_loss).item())
            epoch_rl_loss.append(rl_loss.item())
            epoch_reward.append(torch.sum(rewards).item())
            epoch_loss.append(torch.sum(-rewards).item())

            z_batch_reward = np.mean(rewards.cpu().data.numpy())
            z_history_rewards.append(z_batch_reward)

    total_loss_list.append(np.mean(epoch_rl_loss))
    continuity_loss_list.append(np.mean(epoch_continuity_loss))
    sparsity_loss_list.append(np.mean(epoch_sparsity_loss))
    distance_loss_list.append(np.mean(epoch_distance_loss))
    reward_list.append(np.mean(epoch_reward))
    loss_list.append(np.mean(epoch_loss))

    if distance_loss_list[i] + continuity_lambda * continuity_loss_list[i] + sparsity_lambda * sparsity_loss_list[i] < min_loss:
        min_loss = distance_loss_list[i] + continuity_lambda * continuity_loss_list[i] + sparsity_lambda * sparsity_loss_list[i]
        torch.save(cfdet.generator.state_dict(), './state_dict_minloss.bin')
    if i == epochs2-1:
        torch.save(cfdet.generator.state_dict(), './state_dict_final.bin')

    print(f'epoch{i+1}:')
    print('distance_loss:', distance_loss_list[i], 'continuity loss: ', continuity_loss_list[i], 'sparsity loss: ', sparsity_loss_list[i])
    print('------------------------------------------------------')

epoch1:
distance_loss: 0.869846798479557 continuity loss:  0.22965087764896452 sparsity loss:  0.670484914444387
------------------------------------------------------
epoch2:
distance_loss: 0.8371458407491446 continuity loss:  0.05748443747870624 sparsity loss:  0.8566604480147362
------------------------------------------------------
epoch3:
distance_loss: 0.8372033704072237 continuity loss:  0.05817260907497257 sparsity loss:  0.8574325405061245
------------------------------------------------------
epoch4:
distance_loss: 0.8368851561099291 continuity loss:  0.06029205513186753 sparsity loss:  0.8480041287839413
------------------------------------------------------
epoch5:
distance_loss: 0.8379889521747828 continuity loss:  0.06542053434532136 sparsity loss:  0.7795226946473122
------------------------------------------------------
epoch6:
distance_loss: 0.8421878442168236 continuity loss:  0.14727783389389515 sparsity loss:  0.11430359119549394
------------------------------------

epoch49:
distance_loss: 0.8391740974038839 continuity loss:  0.14129333849996328 sparsity loss:  0.06524811103008687
------------------------------------------------------
epoch50:
distance_loss: 0.8395763840526342 continuity loss:  0.1412033117376268 sparsity loss:  0.0647201573010534
------------------------------------------------------
epoch51:
distance_loss: 0.8393663913011551 continuity loss:  0.14090271154418588 sparsity loss:  0.0655761748785153
------------------------------------------------------
epoch52:
distance_loss: 0.839734872803092 continuity loss:  0.13994751032441854 sparsity loss:  0.06559143401682377
------------------------------------------------------
epoch53:
distance_loss: 0.8395945373922586 continuity loss:  0.1407730132341385 sparsity loss:  0.06387634587008506
------------------------------------------------------
epoch54:
distance_loss: 0.8394881468266249 continuity loss:  0.14150543650612235 sparsity loss:  0.06302643159870058
----------------------------

epoch97:
distance_loss: 0.8394320867955685 continuity loss:  0.14402466267347336 sparsity loss:  0.053797915345057845
------------------------------------------------------
epoch98:
distance_loss: 0.8396264277398586 continuity loss:  0.14415435958653688 sparsity loss:  0.05343017855193466
------------------------------------------------------
epoch99:
distance_loss: 0.8391272984445095 continuity loss:  0.14418182661756873 sparsity loss:  0.05382995854597539
------------------------------------------------------
epoch100:
distance_loss: 0.8396367207169533 continuity loss:  0.14379425207152963 sparsity loss:  0.05326233129017055
------------------------------------------------------


In [34]:
cfdet.generator.load_state_dict(torch.load('state_dict_minloss.bin')) 

y_key_pred3 = []
y_key_truth3 = []

cfdet.generator.eval()
model.eval()

with torch.no_grad():
    for sequence, sequence_label, key_label, _ in val_loader:            
        key_label_list = torch.reshape(key_label, (-1,)).tolist()
        y_key_truth3 = y_key_truth3 + key_label_list

        sequence = sequence.cuda()
        z_out, _ = cfdet.generate(sequence, training=False)
        z_list = torch.reshape(z_out, (-1,)).tolist()

        y_key_pred3 = y_key_pred3 + z_list

In [35]:
print(metrics.classification_report(y_key_truth3, y_key_pred3, digits=4))
print(metrics.confusion_matrix(y_key_truth3, y_key_pred3))

              precision    recall  f1-score   support

           0     0.9956    0.9996    0.9976   1374486
           1     0.9288    0.5547    0.6946     13514

    accuracy                         0.9952   1388000
   macro avg     0.9622    0.7771    0.8461   1388000
weighted avg     0.9950    0.9952    0.9947   1388000

[[1373911     575]
 [   6018    7496]]


In [36]:
cfdet.generator.load_state_dict(torch.load('state_dict_minloss.bin')) 

seq_list_all = []

y_key_pred = []
y_key_truth = []

cfdet.generator.eval()
model.eval()

with torch.no_grad():
    for sequence, sequence_label, key_label, _ in train_test_loader2:  
        seq_list_all += torch.reshape(sequence, (-1,)).tolist()
        key_label_list = torch.reshape(key_label, (-1,)).tolist()
        y_key_truth = y_key_truth + key_label_list

        sequence = sequence.cuda()
        z_out, _ = cfdet.generate(sequence, training=False)
        z_list = torch.reshape(z_out, (-1,)).tolist()

        y_key_pred = y_key_pred + z_list

In [37]:
print(metrics.classification_report(y_key_truth, y_key_pred, digits=4))
print(metrics.confusion_matrix(y_key_truth, y_key_pred))

fpr, tpr, thresholds = metrics.roc_curve(y_key_truth, y_key_pred, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('Entry anomaly detection on detected sequences:'+'\n')
f.write(str(metrics.classification_report(y_key_truth, y_key_pred, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(y_key_truth, y_key_pred))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')
f.close()

              precision    recall  f1-score   support

           0     0.9891    0.9916    0.9903    596521
           1     0.9309    0.9119    0.9213     74219

    accuracy                         0.9828    670740
   macro avg     0.9600    0.9517    0.9558    670740
weighted avg     0.9826    0.9828    0.9827    670740

[[591494   5027]
 [  6539  67680]]
0.9517343393441859


In [38]:
y_key_pred2 = []
y_key_truth2 = []


for sequence, sequence_label, key_label, _ in train_test_loader3:  
    seq_list_all += torch.reshape(sequence, (-1,)).tolist()
    key_label_list = torch.reshape(key_label, (-1,)).tolist()
    y_key_truth2 = y_key_truth2 + key_label_list   
    y_key_pred2 = y_key_pred2 + [0]*len(key_label_list)
        
y_key_truth_all = y_key_truth + y_key_truth2
y_key_pred_all = y_key_pred + y_key_pred2

In [39]:
print(metrics.classification_report(y_key_truth_all, y_key_pred_all, digits=4))
print(metrics.confusion_matrix(y_key_truth_all, y_key_pred_all))

fpr, tpr, thresholds = metrics.roc_curve(y_key_truth_all, y_key_pred_all, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('Entry anomaly detection on unlabeled dataset:'+'\n')
f.write(str(metrics.classification_report(y_key_truth_all, y_key_pred_all, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(y_key_truth_all, y_key_pred_all))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')
f.write('-'*50+'\n')
f.close()

              precision    recall  f1-score   support

           0     0.9956    0.9996    0.9976  12370109
           1     0.9309    0.5559    0.6961    121751

    accuracy                         0.9953  12491860
   macro avg     0.9633    0.7777    0.8469  12491860
weighted avg     0.9950    0.9953    0.9947  12491860

[[12365082     5027]
 [   54071    67680]]
0.7777411375804234


In [40]:
def count_rate(seq_list, truth_list, pred_list, idx):
    counter1 = 0
    counter2 = 0

    for i in range(len(seq_list)):
        if seq_list[i] == idx and truth_list[i]==1: 
            counter1 += 1
            if pred_list[i]==1:
                counter2 += 1            
    return counter2, counter1

In [41]:
activity_dict = {index:dict for dict,index in dict_activity.items()}

for i in range(2,10,1):
    counter, total = count_rate(seq_list_all, y_key_truth_all, y_key_pred_all, i)
    print(activity_dict[i], ' |detected: ', counter, ' |total: ', total, '\n')

Logon  |detected:  0  |total:  0 

Logoff  |detected:  0  |total:  0 

Connect  |detected:  289  |total:  23510 

Disconnect  |detected:  86  |total:  23395 

Email  |detected:  0  |total:  7541 

Http_normal  |detected:  0  |total:  0 

Http_abnormal  |detected:  67305  |total:  67305 

File  |detected:  0  |total:  0 



In [42]:
# index2url = {index:url for url,index in url2index.items()}
# activity_dict = {index:dict for dict,index in dict_activity.items()}

# cfdet.generator.load_state_dict(torch.load('state_dict_minloss.bin')) 

# y_key_pred = []
# y_key_truth = []

# cfdet.generator.eval()


# with torch.no_grad():
#     for sequence, sequence_label, key_label, url in train_test_loader2:  
#         f=open('Case.txt','a')
#         key_label_list = key_label.tolist()

#         for j in range(len(sequence_label)):
#             y_key_truth = y_key_truth + key_label_list[j]

#         sequence = sequence.cuda()
#         z_out, _ = cfdet.generate(sequence, training=False)
#         z_list = z_out.data.tolist()

#         for k in range(len(sequence_label)):
#             y_key_pred = y_key_pred + z_list[k]
        
#         for v in range(len(sequence_label)):
# #             print(sequence.tolist()[v])
# #             print(url.tolist()[v])                
#             entry_list = [activity_dict.get(logkey) for logkey in sequence.tolist()[v]]
#             url_list = [index2url.get(each) for each in url.tolist()[v]]
# #             print([activity_dict.get(logkey) for logkey in sequence.tolist()[v]])
# #             print([index2url.get(each) for each in url.tolist()[v]])
#             for t in range(len(entry_list)):
#                 if entry_list[t][:4] =='Http':
#                     entry_list[t] = url_list[t][1:-1]
# #             print(entry_list)
#             f.write(str(entry_list)+'\n')
                    
# #             print(key_label.tolist()[v])
#             f.write('True label: ' + str(key_label.tolist()[v])+'\n')
# #             print(z_list[v])
#             f.write('Pred label: ' + str(z_list[v])+'\n')
#             f.write('-----------------------'+'\n')
#         f.close()

In [43]:
print(len(train_loader))

print(test_data[test_data['Sequence_label']==0].shape)
print(test_data[test_data['Sequence_label']==1].shape)

test_data['Num'] = test_data['Key_label'].apply(lambda x: x.tolist().count(1))
print(test_data['Num'].sum())

2718
(572560, 4)
(52033, 4)
121751


In [44]:
print(val_data.shape[0])
print(val_data[val_data['Sequence_label']==1].shape[0])
print(val_data[val_data['Sequence_label']==0].shape[0])

69400
5782
63618


In [45]:
val_data['Num'] = val_data['Key_label'].apply(lambda x: x.tolist().count(1))
val_data['Num'].sum()

13514

In [46]:
print(train_test_data.shape[0])
print(train_test_data2.shape[0])

33537
33537


In [47]:
print(raw_ds_normal.shape[0])
print(raw_ds_abnormal.shape[0])

1391559
693993
