<a href="https://colab.research.google.com/github/SigmaGQ/InsDetek.Insider_Threat_Detection_System/blob/main/Training_4.0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from IPython.core.display import clear_output
#@title #__InsDetek__
# Colab bootstrap: mount Google Drive and switch the working directory to the
# project folder so that the relative paths used later ('data/...', 'log/...')
# resolve against it.
# NOTE(review): `clear_output` is normally imported from `IPython.display`;
# confirm this `IPython.core.display` path still works on the target IPython version.
import os
from google.colab import drive
# mount Google drive
drive.mount('/content/drive')
clear_output()  # clear what this cell has printed so far
# by default everything gets executed and saved in 'Berkeley Capstone (Students)'
# notebook_path = '/content/drive/MyDrive/Berkeley Capstone (Students)/Code' # WK
# NOTE(review): the path below is specific to one user's Drive layout; adjust before running.
notebook_path = '/content/drive/MyDrive/Capstone/Berkeley Capstone (Students)/Code/InsDetek3.0_LSTM' # GQ
os.chdir(notebook_path);

In [2]:
#@title #__Requirements__
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import time
import re
from logging import raiseExceptions


import torch
from torch import nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from dataloader_ import df_to_tensor


# Data Preparation

In [3]:
class data_preparation():
    """Load a per-user activity CSV, split it by user and build dataloaders.

    The three steps are meant to be chained in order::

        data_preparation(path, idx).read_data().split([0.8, 0.1, 0.1]).dataloader(32)

    Attributes (the step that sets them is given in brackets):
        path, idx: constructor arguments [__init__].
        df: the loaded dataframe, without the 'week' column [read_data].
        feat_size: number of columns of ``df`` minus 2 — presumably everything
            except 'user' and 'insider'; confirm against ``df_to_tensor`` [read_data].
        num_class: number of distinct values in the 'insider' column [read_data].
        user_list, user_train, user_valid, user_test: arrays of user ids [split].
        train_df, valid_df, test_df: rows of ``df`` of each user group [split].
        batch_size: batch size handed to ``df_to_tensor`` [dataloader].
        train, valid, test: results of ``df_to_tensor`` for each non-empty split [dataloader].
    """

    def __init__(self, path, idx = None):
        """
        Args:
            path(str): path of the csv file
            idx(str): name of the column to use as index (``index_col`` of ``pd.read_csv``)
        """
        self.path = path
        self.idx = idx

    def read_data(self):
        """Read the csv into ``self.df`` and derive ``feat_size`` and ``num_class``.

        Returns:
            self, so that calls can be chained.
        """
        self.df = pd.read_csv(self.path, index_col = self.idx)
        if 'week' in self.df.columns:
            self.df = self.df.drop('week', axis = 1)
        self.feat_size = len(self.df.columns)-2
        self.num_class = self.df['insider'].unique().size
        print("====== Read Data ======\nread '{0}', shape = {1}\n".format(self.path, self.df.shape))
        return self

    @staticmethod
    def _check_split_size(size):
        """Raise AssertionError unless ``size`` holds three fractions that sum to 1.

        The sum is compared with a tolerance: e.g. 0.7 + 0.2 + 0.1 is
        0.9999999999999999 in floating point and must still be accepted.
        Raised explicitly (not via ``assert``) so the check survives ``python -O``.
        """
        if len(size) != 3 or not np.isclose(sum(size), 1):
            raise AssertionError("input of 'size' should be three values with a sum of 1")

    def split(self, size):
        """Split ``self.df`` into train, valid and test sets by user.

        All rows of one user end up in the same set.

        Args:
            size(list): [train_size, valid_size, test_size], fractions of users summing to 1

        Returns:
            self, so that calls can be chained.

        Raises:
            AssertionError: if ``size`` is not three values with a sum of 1.
        """
        self._check_split_size(size)

        self.user_list = self.df['user'].unique()
        # First cut off the training users, then divide the remainder into valid/test;
        # the second fraction is therefore relative to the non-training share.
        self.user_train, self.user_test = train_test_split(self.user_list, train_size = size[0], shuffle = True)
        self.user_valid, self.user_test = train_test_split(self.user_test, train_size = size[1]/(1-size[0]), shuffle = True)

        self.train_df = self.df[self.df['user'].isin(self.user_train)]
        self.valid_df = self.df[self.df['user'].isin(self.user_valid)]
        self.test_df = self.df[self.df['user'].isin(self.user_test)]

        print('====== Split Data ======\nsize = ', size)
        print('train: {0} - {1} users\n'.format(self.train_df.shape, len(self.user_train)),
            '\rvalid: {0} - {1} users\n'.format(self.valid_df.shape, len(self.user_valid)),
            '\rtest : {0} - {1} users\n'.format(self.test_df.shape, len(self.user_test)))
        return self

    def dataloader(self, batch_size, all_label = True, print_summary = True, shuffle = True):
        """Convert every non-empty split into a dataloader via ``df_to_tensor``.

        Sets ``self.train`` / ``self.valid`` / ``self.test``; a split with no rows
        is skipped and its attribute is left unset.

        Args:
            batch_size (int): batch size forwarded to ``df_to_tensor``.
            all_label (bool): output will be the labels of the whole sequence if True,
                or the label of the last datapoint in the sequence otherwise.
            print_summary (bool, optional): print the size of output.
            shuffle (bool): parameter 'shuffle' in dataloader (applied to all three splits)
        """
        self.out_df = []
        self.batch_size = batch_size

        print("====== DataLoader ======")
        for title, attr in (('Train', 'train'), ('Valid', 'valid'), ('Test', 'test')):
            split_df = getattr(self, attr + '_df')
            if len(split_df) != 0:
                print("[{0} Data]".format(title), end=' ')
                setattr(self, attr, df_to_tensor(split_df, batch_size, all_label, print_summary, shuffle))


# Network

In [4]:
class LSTM_network(nn.Module):
    """Per-timestep sequence classifier.

    Pipeline: two kernel-size-1 ``Conv1d`` layers (input_size -> 256 -> 128
    channels), a single-layer LSTM (128 -> 64, batch first) and three linear
    layers (64 -> 32 -> 8 -> num_class), followed by ``log_softmax`` over the
    class dimension. No activation function is applied between the layers.

    The intermediate tensors of the last forward pass are kept on the module
    as ``input_seq``, ``cnn_out1``, ``cnn_out2``, ``lstm_out``, ``hidden`` and
    ``cell``.
    """

    def __init__(self, input_size, num_class, batch_size):
        """
        Args:
            input_size (int): number of features per timestep.
            num_class (int): number of output classes.
            batch_size (int): accepted but not used by the network.
        """
        super().__init__()
        # [batch, input_size, seq] -> [batch, 256, seq]
        self.conv1 = nn.Conv1d(input_size, 256, kernel_size=1)
        # [batch, 256, seq] -> [batch, 128, seq]
        self.conv2 = nn.Conv1d(256, 128, kernel_size=1)
        self.lstm = nn.LSTM(128, 64, batch_first=True)
        self.hidden1 = nn.Linear(64, 32)
        self.hidden2 = nn.Linear(32, 8)
        self.hidden2tag = nn.Linear(8, num_class)

    def forward(self, input):
        """Map a [batch, seq, feature] tensor to [batch, seq, num_class] log-probabilities."""
        # Conv1d convolves over the last axis, so move features to the channel axis
        self.input_seq = input.permute(0, 2, 1)
        self.cnn_out1 = self.conv1(self.input_seq)
        # back to [batch, seq, channel] for the batch-first LSTM
        self.cnn_out2 = self.conv2(self.cnn_out1).permute(0, 2, 1)
        self.lstm_out, lstm_state = self.lstm(self.cnn_out2)
        self.hidden, self.cell = lstm_state
        logit = self.hidden2tag(self.hidden2(self.hidden1(self.lstm_out)))
        return F.log_softmax(logit, dim=-1)


# Evaluation

In [5]:
class Evaluation():
    """Run a model over one split of a prepared dataset and collect loss, predictions and metrics.

    Typical use: ``Evaluation(model, data)('valid').get_metrics()``.

    Attributes set by ``__call__``:
        data: the iterable of batches selected by ``mode``.
        log_prob: model output (one list of per-class log-probabilities) for every unmasked datapoint.
        predictions: arg-max class of every unmasked datapoint.
        labels: true label of every unmasked datapoint.
        avg_loss: summed masked loss divided by the number of unmasked datapoints.
    Attributes set by ``get_metrics``:
        precision, recall, fscore, support: per-class arrays.
    """

    def __init__(self, model, input_data, loss_function=None):
        """
        Args:
            model (nn.Module): model for prediction; maps a [batch, seq, feature] batch
                to [batch, seq, num_class] log-probabilities.
            input_data: prepared dataset exposing ``train`` / ``valid`` / ``test``
                (iterables of (features, labels, masks) batches) and ``num_class``.
            loss_function (callable, optional): element-wise loss (``reduction='none'``)
                taking [batch, num_class, seq] inputs and [batch, seq] labels. When
                omitted, the notebook-level ``loss_function`` is looked up at call time.
        """
        self.model = model
        self.dataset = input_data
        self.loss_function = loss_function

    def __call__(self,  mode):
        """Evaluate one split; sets ``log_prob``, ``predictions``, ``labels`` and ``avg_loss``.

        The model is switched to eval mode for the pass and put back into
        whichever mode it was in before.

        Args:
            mode (String): indicates which dataset to use, train, valid or test

        Returns:
            self, so that ``get_metrics`` can be chained.

        Raises:
            ValueError: if ``mode`` is not 'train', 'valid' or 'test'.
        """
        if mode not in ('train', 'valid', 'test'):
            raise ValueError("mode should be 'train', 'valid' or 'test'")
        self.data = getattr(self.dataset, mode)

        criterion = self.loss_function
        if criterion is None:
            # backward-compatible fallback to the notebook-level loss
            criterion = loss_function

        self.log_prob, self.predictions, self.labels = [], [], []
        total_loss = 0
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad(): # no gradients needed for evaluation; saves memory
                for feature_seqs, label_seqs, mask_seqs in self.data:
                    mask_seqs = mask_seqs.bool()
                    output_seqs = self.model(feature_seqs) # [batchsize, seq_len, num_class]

                    # The loss wants the class axis second: [batch, class, seq]. This has to be
                    # a permute — a reshape would keep the memory order and mix up
                    # timesteps and classes.
                    batch_loss_seqs = criterion(output_seqs.permute(0, 2, 1), label_seqs) # [batchsize, seq_len]
                    total_loss += torch.mul(batch_loss_seqs, mask_seqs).sum() # padded positions contribute 0

                    real_output_seqs = output_seqs[mask_seqs] # [number of real datapoints in the batch, num_class]
                    self.log_prob += real_output_seqs.tolist()
                    self.predictions += real_output_seqs.argmax(dim=1).tolist()
                    self.labels += label_seqs[mask_seqs].tolist()
        finally:
            self.model.train(was_training)

        if self.labels:
            self.avg_loss = total_loss / len(self.labels)
        else:
            # nothing to average over (empty split or fully masked batches)
            self.avg_loss = torch.tensor(float('nan'))

        return self


    def get_metrics(self, print_report = False):
        """Compute per-class precision, recall, F-score and support from the last ``__call__``.

        Note that ``beta=2`` makes ``fscore`` an F2 score (recall weighted higher
        than precision). ``labels`` is passed explicitly so every array has one
        entry per class even when a class is absent from this split.

        Args:
            print_report (bool): additionally print sklearn's classification report.
        """
        class_ids = list(range(self.dataset.num_class))
        self.precision, self.recall, self.fscore, self.support = precision_recall_fscore_support(
            self.labels, self.predictions, beta=2, labels=class_ids, zero_division = 0)
        if print_report:
            print(classification_report(self.labels, self.predictions, zero_division=0))

    # Compute ROC curve and ROC area for each class
    def roc_curve(self):
        """Plot the ROC curve of class 1 from the last ``__call__`` (binary problems only)."""
        if self.dataset.num_class != 2:
            print('ROC curve for multi-label not available')
            return
        else:
            # stored outputs are log-probabilities; column 1 is the positive class
            y_prob = np.exp(np.array(self.log_prob)[:,1])
            # inside the method body the bare name resolves to the module-level
            # (sklearn) roc_curve, not to this method
            fpr, tpr, threshold = roc_curve(self.labels, y_prob)
            roc_auc = auc(fpr, tpr)
            plt.figure(figsize=(8, 8))
            plt.plot(fpr, 
                    tpr, 
                    color = 'darkorange',
                    lw = 2, 
                    label = 'ROC curve (area = %0.3f)' % roc_auc)
            plt.plot([0, 1], [0, 1], color = 'navy', lw = 2, linestyle = '--')
            plt.xlim([0.0, 1.00])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver operating characteristic example')
            plt.legend(loc = "lower right")
            plt.show()

    def pr_curve(self, y, prob):
        """Plot the precision-recall curve for true labels ``y`` and positive-class scores ``prob``."""

        #calculate precision and recall
        precision, recall, thresholds = precision_recall_curve(y, prob)

        #create precision recall curve
        plt.figure(figsize=(10, 10))
        plt.plot(recall, precision, color='purple')

        #add axis labels to plot
        plt.title('Precision-Recall Curve')
        plt.ylabel('Precision')
        plt.xlabel('Recall')

        #display plot
        plt.show()

# Train

In [7]:
def train(model, epoch, dataset, optimizer, loss_function):
    """Train ``model`` on ``dataset.train`` and validate it after every epoch.

    Args:
        model (nn.Module): maps a [batch, seq, feature] batch to
            [batch, seq, num_class] log-probabilities.
        epoch (int): number of epochs to run.
        dataset: prepared dataset exposing ``train`` and ``valid`` (iterables of
            (features, labels, masks) batches) and ``num_class``.
        optimizer: torch optimizer over the parameters of ``model``.
        loss_function: element-wise loss (``reduction='none'``) taking
            [batch, num_class, seq] inputs and [batch, seq] labels.

    Returns:
        tuple: ``(loss_list, metric_df)`` where ``loss_list[e][b]`` is the detached
        masked-mean training loss of batch ``b`` in epoch ``e`` and ``metric_df``
        has one row per epoch with the validation ``avg_loss`` followed by
        ``precision_<c>``, ``recall_<c>`` and ``f1_<c>`` for every class ``c``
        (the ``f1_*`` columns hold whatever ``Evaluation.fscore`` contains).

    Side effects:
        Pickles the whole model to 'log/saved model/epoch<i>.pth' after every epoch.
    """
    os.makedirs('log/saved model', exist_ok = True) # torch.save does not create directories

    epoch_bar = tqdm(range(epoch), leave = True)
    loss_list = []
    metric = []

    for epoch_i in epoch_bar:

        batch_bar = tqdm(dataset.train, leave = False)
        loss_list_epoch = []

        for feature_seqs, label_seqs, mask_seqs in batch_bar: # feature sequenceS, label sequenceS and mask sequenceS

            # == Step 1. clear gradient ==
            model.zero_grad()
            # == Step 2. Run forward pass ==
            predict_seqs = model(feature_seqs) # [batchsize, seq_len, num_class]
            # == Step 3.1 Compute the loss ==
            # The loss wants the class axis second: [batch, class, seq]. This has to be a
            # permute — a reshape would keep the memory order and mix up timesteps and classes.
            loss = loss_function(predict_seqs.permute(0, 2, 1), label_seqs) # [batchsize, seq_len]
            # average over real (unmasked) datapoints only
            loss = torch.mul(loss, mask_seqs).sum() / mask_seqs.sum()
            # == Step 3.2 Compute the gradients ==
            loss.backward()
            # == Step 3.3 Update the parameters ==
            optimizer.step()
            # keep only the value; storing the attached tensor would keep every
            # batch's autograd graph alive for the whole run
            loss_list_epoch.append(loss.detach())

        val = Evaluation(model, dataset)
        val('valid').get_metrics()
        metric.append(np.concatenate([[float(val.avg_loss)], val.precision, val.recall, val.fscore]))
        loss_list.append(loss_list_epoch)

        torch.save({'model':model}, 'log/saved model/epoch{}.pth'.format(epoch_i))
        epoch_bar.set_description('Epoch: %i' % epoch_i)
        recall_txt = ', '.join('{0}: {1:.3f}'.format(c, r) for c, r in enumerate(val.recall))
        epoch_bar.set_postfix(valid_recall = '[' + recall_txt + ']')

    columns = ['avg_loss'] + ['{0}_{1}'.format(name, c)
                              for name in ('precision', 'recall', 'f1')
                              for c in range(dataset.num_class)]
    return loss_list, pd.DataFrame(metric, columns = columns)

# \_\_main\_\_

In [33]:
# Experiment input: CSV path (relative to the working directory), the
# [train, valid, test] fractions of users, and the dataloader batch size.
data_path = 'data/data_3.1.csv'
split_size = [0.8, 0.1, 0.1]
batchsize = 32
data = data_preparation(data_path, 'idx')  # 'idx' is used as the CSV index column
# data = data_preparation(data_path)
# read -> split by user -> build dataloaders; the results are stored on `data`
data.read_data().split(split_size).dataloader(batchsize)
# clear_output()

read 'data/data_3.1.csv', shape = (19381, 405)

size =  [0.8, 0.1, 0.1]
train: (15478, 405) - 614 users
 valid: (1979, 405) - 77 users
 test : (1924, 405) - 77 users

[Train Data] Input DataFrame: (15478, 405) with 14.82% insiders
   => 614 sequences (users)
       features: [43, 403], labels: [43] in size of [(max) length, dimension]
   => 19 batches in Dataloader (batchsize = 32)
       features: [32, 44, 403], labels: [32, 43], masks: [32, 42] in size of [batchsize, (max) length, dimension]

[Valid Data] Input DataFrame: (1979, 405) with 15.01% insiders
   => 77 sequences (users)
       features: [29, 403], labels: [29] in size of [(max) length, dimension]
   => 2 batches in Dataloader (batchsize = 32)
       features: [32, 42, 403], labels: [32, 49], masks: [32, 38] in size of [batchsize, (max) length, dimension]

[Test Data] Input DataFrame: (1924, 405) with 16.16% insiders
   => 77 sequences (users)
       features: [16, 403], labels: [16] in size of [(max) length, dimension]
 

In [15]:
# Build the network, the class-weighted loss and the optimizer, then train.
model = LSTM_network(input_size = data.feat_size, num_class = data.num_class, batch_size = data.batch_size)
weight = torch.tensor([1., 5.])  # per-class loss weights: errors on class 1 count 5x
loss_function = nn.NLLLoss(weight, reduction = 'none') # 'none' to get the loss of every datapoint
    # NLLLoss takes log-probabilities per class: [log(prob_class_i)] as input
    # and picks the (negated, weighted) entry of the true class,
    # i.e. takes the log(prob_2) when label=2
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs_num = 300
# loss_list: per-epoch lists of per-batch training losses; metric_df: per-epoch validation metrics
loss_list, metric_df = train(model, epochs_num, data, optimizer, loss_function)
# loss_list_sf, valid_loss_list_sf, recall_list_sf = train(model, epochs_num, train_data, optimizer, loss_function)

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

# Results

In [16]:
# Flatten the per-epoch lists of per-batch training losses into one long table
# with columns epoch / batch / loss. Rows are collected first and the frame is
# built once: DataFrame.append was removed in pandas 2.0 and appending inside
# the loop copied the whole table on every epoch.
loss_records = []
for i, epoch_i_loss in enumerate(loss_list):
    for j, batch_j_loss in enumerate(epoch_i_loss):
        loss_records.append({'epoch': float(i), 'batch': j, 'loss': float(batch_j_loss)})
loss_df = pd.DataFrame(loss_records, columns = ['epoch', 'batch', 'loss'])

In [17]:
#@title Loss of each epoch
# One grey level per epoch, from black (first epoch) towards white (last epoch)
color_list = ['rgb({0}, {0}, {0})'.format(int(i/epochs_num*255)) for i in range(epochs_num)]
# fig = px.line(loss_df[loss_df['epoch']%10 == 0], x = 'batch', y = 'loss', color = 'epoch', color_discrete_sequence= color_list)
# Training loss against batch number, one line per epoch
fig = px.line(loss_df, x = 'batch', y = 'loss', color = 'epoch', color_discrete_sequence= color_list)
fig.update_layout(plot_bgcolor='#a1afc9')
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

In [18]:
# Training loss of every batch in recorded order (x is the row index of loss_df),
# coloured by epoch with the grey scale in `color_list`
fig = px.line(loss_df, y = 'loss', color = 'epoch', color_discrete_sequence= color_list)
fig.update_layout(plot_bgcolor='#a1afc9')
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

In [19]:
# #@title Train & Valid Loss
# train_valid_loss = loss_df.groupby('epoch').mean().reset_index()
# train_valid_loss['valid'] = 0

# valid_loss = pd.DataFrame(columns = ['loss', 'valid'])
# valid_loss['loss'] = valid_loss_list
# valid_loss['loss'] = valid_loss['loss'].astype(float)
# valid_loss['valid'] = 1
# valid_loss = valid_loss.reset_index()
# valid_loss = valid_loss.rename(columns={'index':'epoch'})
# train_valid_loss = train_valid_loss.append(valid_loss)
# px.line(train_valid_loss, x = 'epoch', y = 'loss', color = 'valid')

# Mean training loss per epoch (valid = 0) ...
train_valid_loss = loss_df.groupby('epoch').mean().reset_index()
train_valid_loss['valid'] = 0

# ... and validation loss per epoch (valid = 1); the row index of metric_df is the epoch
valid_loss = (
    metric_df['avg_loss']
    .astype(float)
    .rename('loss')
    .rename_axis('epoch')
    .reset_index()
)
valid_loss['valid'] = 1

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
train_valid_loss = pd.concat([train_valid_loss, valid_loss])
px.line(train_valid_loss, x = 'epoch', y = 'loss', color = 'valid')

In [31]:
# # @title Validation Recall & Precision
# recall_df = pd.DataFrame(recall_list, columns = ['recall of 0', 'recall of 1']).melt().reset_index()
# recall_df['index'] = recall_df['index'].map(lambda x: x % epochs_num)
# recall_df = recall_df.rename(columns={'index':'epoch', 'variable':'label', 'value':'recall'})
# px.line(recall_df, x = 'epoch', y = 'recall', color = 'label')

# @title Validation Recall
# Validation recall and precision per epoch, in long format (epoch / label / rate).
# The display names are applied before melting (after melt the metric names are
# values, not columns, so a column rename would do nothing), and the epoch is taken
# from metric_df's own row index rather than reconstructed with a modulo on `epochs_num`.
recall_df = (
    metric_df[['recall_0', 'recall_1', 'precision_0', 'precision_1']]
    .rename(columns={'recall_0':'recall of 0 (clean)', 'recall_1':'recall of 1(malicious)'})
    .rename_axis('epoch')
    .reset_index()
    .melt(id_vars='epoch', var_name='label', value_name='rate')
)
px.line(recall_df, x = 'epoch', y = 'rate', color = 'label')

In [29]:
#@title Recall by epochs
# One point per epoch: validation recall of class 1 (x) against class 0 (y)
recall_df = metric_df[['recall_0','recall_1']].reset_index()
# epoch number as string so plotly treats it as a discrete colour category
recall_df['index'] = recall_df['index'].astype(str)
fig = px.scatter(recall_df, x='recall_1', y='recall_0', color = 'index', color_discrete_sequence = color_list)
fig.update_layout(plot_bgcolor='#a1afc9')
fig.update_layout(title="Recall 0 vs 1", title_font_size=20)

In [30]:
#@title Precision-Recall by epochs
# One point per epoch: validation precision (x) against recall (y) of class 1.
# NOTE: this reuses the name `recall_df` although it now holds precision as well.
recall_df = metric_df[['precision_1','recall_1']].reset_index()
# epoch number as string so plotly treats it as a discrete colour category
recall_df['index'] = recall_df['index'].astype(str)
fig = px.scatter(recall_df, x='precision_1', y='recall_1', color = 'index', color_discrete_sequence = color_list)
fig.update_layout(plot_bgcolor='#a1afc9')
fig.update_layout(title="P-R on 1", title_font_size=20)

In [None]:
recall_df

In [None]:
# Long-format recall table (epoch / label / recall) tagged as the un-shuffled run.
# NOTE(review): `recall_list` is not defined in the visible cells (train() returns
# `metric_df`); presumably left over from an earlier version — confirm before running.
recall_df = pd.DataFrame(recall_list, columns = ['recall of 0', 'recall of 1']).melt().reset_index()
# melt stacks the two columns, so the row number modulo the epoch count gives the epoch
recall_df['index'] = recall_df['index'].map(lambda x: x % epochs_num)
recall_df['data'] = 'data'
recall_df = recall_df.rename(columns={'index':'epoch', 'variable':'label', 'value':'recall'})
recall_df

In [None]:
# Same long-format recall table for the run tagged 'shuffled data'.
# NOTE(review): `recall_list_sf` is only mentioned in a commented-out train() call
# in the visible cells — confirm where it comes from before running.
recall_df_sf = pd.DataFrame(recall_list_sf, columns = ['recall of 0', 'recall of 1']).melt().reset_index()
# melt stacks the two columns, so the row number modulo the epoch count gives the epoch
recall_df_sf['index'] = recall_df_sf['index'].map(lambda x: x % epochs_num)
recall_df_sf['data'] = 'shuffled data'
recall_df_sf = recall_df_sf.rename(columns={'index':'epoch', 'variable':'label', 'value':'recall'})
recall_df_sf

In [None]:
# Stack both recall tables; pd.concat replaces DataFrame.append (removed in pandas 2.0)
recall_all = pd.concat([recall_df, recall_df_sf])
px.line(recall_all, x = 'epoch', y = 'recall', color = 'label', line_dash = 'data')

In [None]:
# Evaluate the trained model on test data and print the metrics.
# NOTE(review): `Predictor` and `test_data` are not defined in the visible cells
# (the evaluation class defined in this notebook is `Evaluation`) — presumably
# stale; confirm before running.
test_model = Predictor(model, test_data)
test_pred, test_true = test_model.predictor()
test_model.get_metrics(True)

In [None]:
loss_list[21].grad

In [None]:
# Collect 100 consecutive loss entries starting at column 20*60 and plot them.
# NOTE(review): `.iloc` is called on `loss_list`, which train() returns as a plain
# list, while the Series built on the next line is named `loss_df` — this looks
# stale; confirm the intended object before running.
temp = []
loss_df = pd.Series(loss_list)
for i in range(100):
    temp.append(loss_list.iloc[:,20*60+i])
loss_plot(temp, False)

# Search - loss weight

In [None]:
# Try several class-weight pairs for the NLL loss and report metrics for each.
# NOTE(review): appears stale — train() as defined in this notebook takes
# (model, epoch, dataset, optimizer, loss_function) and returns a 2-tuple, and
# `data_loader`, `Predictor` and `df` are not defined in the visible cells.
# NOTE(review): `model` and `optimizer` are not re-created inside the loop, so
# each weight setting continues from the previous one's parameters.
weight_list = torch.tensor([[1.5, 5], [1.5,6], [1,6], [0.1,0.9],[0.01,0.99]])
for weight in weight_list:
    print('=== ',weight,' ===')
    loss_function = nn.NLLLoss(weight, reduction = 'none') 
    loss_list = train(model, data_loader, optimizer, loss_function)
    _ = loss_plot(loss_list, False)
    predictor = Predictor(model, df)
    predictor.get_metrics()

# Results

In [None]:
def write_log(exp_n, comment):
    """Create the folder of experiment ``exp_n``, log ``comment`` and save model + optimizer.

    The folder is 'log/exp<NNN>/' with the number zero-padded to three digits.
    The padded number and the comment are appended to 'log/log.txt', and the
    notebook-level ``model`` and ``optimizer`` are pickled together into
    '<folder>model_optim.pth'. An existing folder is reused after printing a warning.

    Args:
        exp_n: experiment number.
        comment (str): free text stored in the log file.

    Returns:
        str: the experiment folder path, with trailing slash.
    """
    exp = str(exp_n).zfill(3)
    path = f'log/exp{exp}/'
    if not os.path.exists(path):
        os.makedirs(path)
    else:
        print("=== Overwriting!!! ===")

    with open("log/log.txt", "a") as log_file:
        log_file.write('\r\n\r\n' + exp)
        log_file.write('\r\n\t' + comment)

    torch.save({'model': model, 'optim': optimizer}, path + 'model_optim.pth')

    print(path)
    return path


In [None]:
# Build the text stored with the experiment log: batch size, optimizer class
# and the learning rate of the optimizer's first parameter group.
# NOTE(review): `batch_size` is not defined in the visible cells (they define
# `batchsize` and `data.batch_size`) — confirm which one is meant.
info =  '==== model info ====\n'\
        + 'batchsize=' + str(batch_size)\
        + '\n' + 'optim=' + optimizer.__class__.__name__\
        + '(lr={:g})'.format(optimizer.param_groups[0]['lr'])
print(info)
# NOTE(review): the hand-written description below (lr=0.0001, weight[1.5,5]) differs
# from the values set in the training cell (lr=0.001, weight [1., 5.]) — keep in sync.
comment = """
data3.1.csv
100 epochs lr=0.0001, weight[1.5,5], Conv(256,ker=1)-Conv(128,ker=1)-LSTM64-Lin32-Lin8-Lin2
""" + info

In [None]:
# Next experiment number = highest existing 'log/exp<NNN>' entry + 1 (1 if there is none).
# Only names of the form 'exp<digits>' are considered, so digits in other file
# names under 'log' cannot be mistaken for an experiment number.
exp_nums = [
    found.group(1)
    for found in (re.fullmatch(r'exp(\d+)', entry) for entry in os.listdir('log'))
    if found
]
exp_new = max(map(int, exp_nums), default=0) + 1

In [None]:
path = write_log(exp_new, comment)

## Loss

In [None]:
def loss_plot(loss_list, save=True):
    """Plot a sequence of losses twice: full range and clipped to y in [0, 5].

    The figure caption is a fixed note followed by the layer listing taken from
    ``str(model)`` (notebook-level ``model``). When ``save`` is true the figure
    is written to ``path + 'loss.jpg'`` (notebook-level ``path``).

    Args:
        loss_list: losses as scalar tensors or numbers; may be flat or nested
            (e.g. one list per epoch), nested lists are flattened in order.
        save (bool): whether to save the figure.

    Returns:
        list[float]: the flattened losses.
    """
    def _flatten(items):
        # yield the leaves of arbitrarily nested lists/tuples in order
        for item in items:
            if isinstance(item, (list, tuple)):
                yield from _flatten(item)
            else:
                yield item

    losslist = [float(loss) for loss in _flatten(loss_list)]

    note = '\n ' + r'$\mathbf{data3.2, cov-cov-lstm-}$'
    # Take the layer lines out of the module repr "<ClassName>(\n ... \n)" for any
    # class name; fall back to the whole repr if it has another shape.
    found = re.search(r'^\w+\(\n(.+)\n\)$', str(model), re.S)
    layers = found.group(1) if found else str(model)

    plt.figure(figsize = (10,5))
    plt.suptitle(note + layers, y = -0.001)
    plt.subplot(1,2,1)
    plt.plot(losslist)
    plt.subplot(1,2,2)
    plt.plot(losslist)
    plt.ylim(0,5)
    if save:
        plt.savefig(path + 'loss.jpg', bbox_inches='tight')
    return losslist
# losslist = loss_plot(loss_list)

In [None]:
# Save the flat loss list as a one-column CSV in the experiment folder.
# NOTE(review): `losslist` only exists at notebook level if the commented-out
# `losslist = loss_plot(loss_list)` call has been run — confirm before running.
# This also rebinds `loss_df`, replacing the epoch/batch/loss table built earlier.
loss_df = pd.DataFrame(losslist)#.reset_index(drop=False)
loss_df.columns = ['loss']#['epoch', 'loss']
loss_df.to_csv(path+'loss.csv', index = False)

In [None]:
# save
# torch.save(dic, path +'model_optim.pth')
# load
# net = torch.load('.pth')
# net['model'].state_dict()

# Predict

In [None]:
"""data3.1 100 epochs"""
predictor = Predictor(model, data.train_df)
predictor.get_metrics()
predictor = Predictor(model, data.test_df)
predictor.get_metrics()

In [None]:
%%writefile dataloader_.py