# 必要なライブラリの準備

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, confusion_matrix
from time import time
from copy import deepcopy

import numpy as np
import pandas as pd
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

import pickle
import json

from winsound import Beep

import matplotlib.pyplot as plt
from matplotlib import ticker
%matplotlib inline

# ハイパーパラメータ・活性化関数・最適化アルゴリズムの定義

In [2]:
hyperparam_dict = {

    'API_type' : 307,
    'hidden1_dim' : 128,
    'n_classes' : 1,

    'bias1' : True,
    'bias2' : True,

    'dropout1_rate' : 0.5,

    'epoch' : 1000,
    'loss_func' : nn.BCEWithLogitsLoss(),

    'optimizer' : {
        'algorithm' : optim.Adam,
        'param' : {
            'lr' : 1e-4,
            'weight_decay' : 1e-4,
        }
    },

    'batch_size' : 32,
    'n_splits' : 10,

    'data_random_state' : 0,
    'fold_random_state' : 0,
    'weight_random_state' : 0,

    'device' : torch.device('cuda'),

}

# データセットの作成

In [3]:
df = pd.read_csv('./dynamic_api_call_sequence_per_malware_100_0_306.csv')
df

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,1
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,1
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,1
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,1
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43871,e3d6d58faa040f0f9742c9d0eaf58be4,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,1
43872,9b917bab7f32188ae40c744f2be9aaf8,82,240,117,240,117,240,117,240,117,...,159,224,82,159,224,82,159,224,82,1
43873,35a18ee05f75f04912018d9f462cb990,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,260,1
43874,654139d715abcf7ecdddbef5a84f224b,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,1


In [4]:
X = df.drop(['hash', 'malware'], axis = 1).values.astype(int)
y = df['malware'].values.astype(float)
del df
print(X.shape)
print(y.shape)

(43876, 100)
(43876,)


In [5]:
def check_imbalance(dataset):
    count = sorted(Counter(dataset).items())
    print(count)
    print(count[1][1] / count[0][1])
    return

random_undersampler = RandomUnderSampler(random_state = hyperparam_dict['data_random_state'])
X, y = random_undersampler.fit_resample(X, y)
check_imbalance(y)

dataset = list(zip(X, y))

[(0.0, 1079), (1.0, 1079)]
1.0


In [6]:
fold = KFold(n_splits = hyperparam_dict['n_splits'],
             shuffle = True,
             random_state = hyperparam_dict['fold_random_state'])

# モデルの定義

In [7]:
class GRU_network(nn.Module):
    
    def __init__(self, input_dim, hidden1_dim, n_classes, dropout1_rate, bias1=True, bias2=True):
        
        super(GRU_network, self).__init__()
        
        self.input_dim = input_dim
        self.hidden1_dim = hidden1_dim
        self.dropout1_rate = dropout1_rate
        
        self.gru1 = nn.GRU(self.input_dim, self.hidden1_dim, batch_first = True, bias = bias1)
        self.dropout1 = nn.Dropout(p = self.dropout1_rate)
        self.fc = nn.Linear(self.hidden1_dim, n_classes, bias = bias2)
        
    def forward(self, X):
        
        X = F.one_hot(X, num_classes = self.input_dim).float().cuda()
        
        # Hidden layer shape: (num_layers, batch_size, hidden_dim)
        hidden_0 = torch.zeros(1, X.size(0), self.hidden1_dim).float().cuda()
        
        # Input/Output shape: (batch_size, seq_len, input_dim)
        _, self.hidden1 = self.gru1(X, hidden_0)
        
        H = self.hidden1[0].squeeze()
        H = self.dropout1(H)
        H = self.fc(H)
                
        return H.squeeze()

# 評価値の定義

In [8]:
def evaluate(loader, model, metric_dict):

    sum_loss = 0.0
    all_dataset = len(loader.dataset)
    all_output = torch.tensor([]).to(hyperparam_dict['device'])
    all_label = torch.tensor([]).to(hyperparam_dict['device'])

    for batch_data, batch_label in loader:
        batch_data, batch_label = batch_data.long().to(hyperparam_dict['device']), batch_label.to(hyperparam_dict['device'])
        batch_output = model(batch_data)
        batch_loss = loss_func(batch_output, batch_label)
        optimizer.zero_grad()

        sum_loss += batch_loss.detach().item() * batch_output.shape[0]
        all_output = torch.cat((all_output, batch_output))
        all_label = torch.cat((all_label, batch_label))

    sigmoid = nn.Sigmoid()
    all_pred, all_score, all_label = (all_output >= 0).cpu(), sigmoid(all_output).cpu(), all_label.cpu()
    tn, fp, fn, tp = confusion_matrix(all_label, all_pred).flatten()

    metric_dict['Loss'][-1].append(sum_loss / all_dataset)
    metric_dict['Accuracy'][-1].append(accuracy_score(all_label, all_pred))
    metric_dict['Precision'][-1].append(precision_score(all_label, all_pred))
    metric_dict['Recall'][-1].append(recall_score(all_label, all_pred))
    metric_dict['F1-Score'][-1].append(f1_score(all_label, all_pred))
    metric_dict['ROC-Curve'][-1].append(roc_curve(all_label, all_score))
    metric_dict['AUC'][-1].append(roc_auc_score(all_label, all_pred))
    metric_dict['TP'][-1].append(tp)
    metric_dict['FP'][-1].append(fp)
    metric_dict['TN'][-1].append(tn)
    metric_dict['FN'][-1].append(fn)

# 学習と評価

In [None]:
metric_dict = {
    'Loss' : [],
    'Accuracy' : [],
    'Precision' : [],
    'Recall' : [],
    'F1-Score' : [],
    'ROC-Curve' : [],
    'AUC': [],
    'TP' : [],
    'TN' : [],
    'FP' : [],
    'FN' : [],
}

eval_dict = {
    'Train' : deepcopy(metric_dict),
    'Test' : deepcopy(metric_dict),
    'Time' : {
        'Train_Time' : [],
        'Eval_Time' : [],
    },
}

loss_func = hyperparam_dict['loss_func']

for fold_idx, (train_idx, test_idx) in enumerate(fold.split(dataset)):
    
    print(f'Fold {fold_idx}')

    torch.manual_seed(hyperparam_dict['weight_random_state'])

    model = GRU_network(
        input_dim = hyperparam_dict['API_type'],
        hidden1_dim = hyperparam_dict['hidden1_dim'],
        n_classes = hyperparam_dict['n_classes'],
        dropout1_rate = hyperparam_dict['dropout1_rate'],
        bias1 = hyperparam_dict['bias1'],
        bias2 = hyperparam_dict['bias2'],
        ).to(hyperparam_dict['device'])

    optimizer = hyperparam_dict['optimizer']['algorithm'](
        params = model.parameters(),
        lr = hyperparam_dict['optimizer']['param']['lr'],
        weight_decay = hyperparam_dict['optimizer']['param']['weight_decay']
        )
    
    temp_optim = {key : optimizer.param_groups[0][key] for key in optimizer.param_groups[0] if key != 'params'}
    temp_optim['betas'] = list(temp_optim['betas'])
    hyperparam_dict['optimizer']['param'] = temp_optim

    for i, param in enumerate(model.parameters()):
        print(f'\nparam {i}')
        print(param.shape)
        print(param)
    print(f'\nParameters: {np.sum([param.numel() for param in model.parameters()])}\n')
    
    train_loader = DataLoader(
        Subset(dataset, train_idx),
        shuffle = True,
        batch_size = hyperparam_dict['batch_size'],
        )
    
    test_loader = DataLoader(
        Subset(dataset, test_idx),
        shuffle = False,
        batch_size = hyperparam_dict['batch_size'],
        )

    [eval_dict['Train'][key].append([]) for key in eval_dict['Train']]
    [eval_dict['Test'][key].append([]) for key in eval_dict['Test']]
    eval_dict['Time']['Train_Time'].append([])
    eval_dict['Time']['Eval_Time'].append([])

    for epoch in range(hyperparam_dict['epoch']):
        
        # Training
        start_train_time = time()
        model.train()
        for batch_data, batch_label in train_loader:
            batch_data, batch_label = batch_data.long().to(hyperparam_dict['device']), batch_label.to(hyperparam_dict['device'])
            output = model(batch_data)
            loss = loss_func(output, batch_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()       
        eval_dict['Time']['Train_Time'][-1].append(time() - start_train_time)

        # Evaluation
        start_eval_time = time()
        model.eval()
        with torch.no_grad():

            # Trainset
            evaluate(
                loader = train_loader,
                model = model,
                metric_dict = eval_dict['Train'],
                )

            # Testset
            evaluate(
                loader = test_loader,
                model = model,
                metric_dict = eval_dict['Test'],
            )
            
        eval_dict['Time']['Eval_Time'][-1].append(time() - start_eval_time)

        if (epoch + 1) % 100 == 0:
            print(f'Epoch {epoch + 1}')
            for key, value in eval_dict.items():
                print(f'\n\t{key}')
                for metric in value:
                    if metric != 'ROC-Curve':
                        print(f'\t\t{metric} : {value[metric][-1][-1]}')
            print()

    torch.save(model.state_dict(), 'param{}.pth'.format(fold_idx))

Fold 0

param 0
torch.Size([384, 307])
Parameter containing:
tensor([[-0.0007,  0.0474, -0.0727,  ...,  0.0174, -0.0656,  0.0147],
        [ 0.0376,  0.0350, -0.0111,  ..., -0.0689,  0.0737, -0.0390],
        [ 0.0314,  0.0769,  0.0446,  ...,  0.0390,  0.0120,  0.0431],
        ...,
        [ 0.0486,  0.0109, -0.0438,  ..., -0.0206, -0.0840, -0.0452],
        [-0.0153,  0.0581,  0.0205,  ..., -0.0158,  0.0850,  0.0501],
        [ 0.0756, -0.0700, -0.0268,  ..., -0.0856, -0.0747,  0.0571]],
       device='cuda:0', requires_grad=True)

param 1
torch.Size([384, 128])
Parameter containing:
tensor([[ 0.0745,  0.0780,  0.0115,  ...,  0.0855,  0.0285, -0.0721],
        [ 0.0740,  0.0873, -0.0403,  ..., -0.0625, -0.0374, -0.0008],
        [ 0.0151,  0.0698, -0.0878,  ..., -0.0252,  0.0291,  0.0737],
        ...,
        [-0.0177, -0.0130, -0.0238,  ...,  0.0746, -0.0235, -0.0475],
        [-0.0058, -0.0495, -0.0652,  ...,  0.0863,  0.0635, -0.0480],
        [ 0.0719, -0.0057, -0.0691,  ..., -0

# 評価値の出力

In [None]:
def plot(epoch, data1, data2, ylabel, ylim_bottom=0, ylim_top=1, save=True, legend1='train', legend2='test'):

    fig = plt.figure()
    ax = fig.add_subplot(1,1,1, title=ylabel, xlabel='epoch', xlim=(0, epoch),ylabel=ylabel, ylim=(ylim_bottom, ylim_top))

    ax.plot(range(epoch), np.array(data1).mean(axis=0), marker='None')
    
    if data2:
        ax.plot(range(epoch), np.array(data2).mean(axis=0), marker='None', c='#ff00ff')
        ax.legend([f'{legend1}', f'{legend2}'])
    else:
        ax.legend(f'{ylabel}')

    ax.grid()
    # ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 

    if save:
        plt.savefig(f'{ylabel}.png')

    plt.show()

In [None]:
plot(hyperparam_dict['epoch'], eval_dict['Train']['Loss'], eval_dict['Test']['Loss'], 'Cross-Entropy Loss', ylim_top=1)

In [None]:
plot(hyperparam_dict['epoch'], eval_dict['Train']['Accuracy'], eval_dict['Test']['Accuracy'], 'Accuracy', ylim_bottom=0.8)

In [None]:
plot(hyperparam_dict['epoch'], eval_dict['Train']['Precision'], eval_dict['Test']['Precision'], 'Precision', ylim_bottom=0.8)

In [None]:
plot(hyperparam_dict['epoch'], eval_dict['Train']['Recall'], eval_dict['Test']['Recall'], 'Recall', ylim_bottom=0.8)

In [None]:
plot(hyperparam_dict['epoch'], eval_dict['Train']['F1-Score'], eval_dict['Test']['F1-Score'], 'F1-Score', ylim_bottom=0.8)

In [None]:
plot(hyperparam_dict['epoch'], eval_dict['Train']['AUC'], eval_dict['Test']['AUC'], 'AUC', ylim_bottom=0.8)

In [None]:
plot(hyperparam_dict['epoch'], eval_dict['Time']['Train_Time'], eval_dict['Time']['Eval_Time'], 'Time', ylim_top=1, legend2='eval')

In [None]:
def find_max(eval_dict):
    for key, value in eval_dict.items():
        if key != 'Time':
            print(key)
            for metric in value:
                if metric not in {'Loss', 'ROC-Curve', 'TP', 'FP', 'TN', 'FN'}:
                    print(metric)
                    print(max(enumerate(np.array(value[metric]).mean(axis=0)), key = lambda x : x[1]))
            print()

find_max(eval_dict)

In [None]:
def output(metric, key = 'Test' ,digit = 4):
    for i, x in enumerate(np.array(eval_dict[key][metric]).mean(axis=0)):
        print(f'{i} : {round(x, digit)}')

output('F1-Score')

In [None]:
def plot_ROC_Curve(fold, epoch, key = 'Test'):

    fpr, tpr, _ = eval_dict['Test']['ROC-Curve'][fold][epoch]
    print(eval_dict['Test']['AUC'][fold][epoch])
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(fpr, tpr, marker='.',c='#ff00ff')
    ax.set_xlabel('FPR: False Positive Rete')
    ax.set_ylabel('TPR: True Positive Rete')
    # ax.set_aspect('equal')
    ax.grid()

plot_ROC_Curve(7, 213)

# ハイパーパラメータ・評価値の保存

In [None]:
hyperparam_dict['loss_func'] = str(hyperparam_dict['loss_func'])
hyperparam_dict['device'] = str(hyperparam_dict['device'])
hyperparam_dict['optimizer']['algorithm'] = str(hyperparam_dict['optimizer']['algorithm'])

json.dump(hyperparam_dict, open('hyperparameter.json', 'w'), indent=4)
pickle.dump(eval_dict, open('evaluation.pkl', 'wb'))

# 学習・保存の終了時にビープ音で通知

In [None]:
Beep(1100, 1000)