In [23]:
# coding: utf-8
import os
import time
import sys
import yaml
import numpy as np
import pandas as pd
from src.util import ExeDataset,write_pred
from src.model import MalConv
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

In [24]:
config_path = 'config/example2.yaml'
seed = 99
conf = yaml.load(open(config_path, 'r'), Loader=yaml.Loader)

In [36]:
exp_name = conf['exp_name']+'_sd_'+str(seed)
print('Experiment:')
print('\t',exp_name)

np.random.seed(seed)
torch.manual_seed(seed)

train_data_path = conf['train_data_path']
train_label_path = conf['train_label_path']
valid_data_path = conf['valid_data_path']
valid_label_path = conf['valid_label_path']

log_dir = conf['log_dir']
pred_dir = conf['pred_dir']
checkpoint_dir = conf['checkpoint_dir']

log_file_path = log_dir+exp_name+'.log'
chkpt_acc_path = checkpoint_dir+exp_name+'.model'
pred_path = pred_dir+exp_name+'.pred'

# Parameters
use_gpu = conf['use_gpu']
use_cpu = conf['use_cpu']
learning_rate = conf['learning_rate']
max_step = conf['max_step']
test_step = conf['test_step']
batch_size = conf['batch_size']
first_n_byte = conf['first_n_byte']
window_size = conf['window_size']
display_step = conf['display_step']
sample_cnt = conf['sample_cnt']

# Load Ground Truth.
tr_label_table = pd.read_csv(train_label_path,header=None,index_col=0)
tr_label_table.index=tr_label_table.index.str.upper()
tr_label_table = tr_label_table.rename(columns={1:'ground_truth'})
val_label_table = pd.read_csv(valid_label_path,header=None,index_col=0)
print("asd")
print(val_label_table)
print("asd")
val_label_table.index=val_label_table.index.str.upper()
print(val_label_table.index)
print("asd")
val_label_table = val_label_table.rename(columns={1:'ground_truth'})
print(val_label_table)
print("asd")

tr_table = tr_label_table.groupby(level=0).last()
val_table = val_label_table.groupby(level=0).last()
print(val_table)
tr_table = tr_table.drop(val_table.index.join(tr_table.index, how='inner'))

Experiment:
	 example_sd_99
asd
                                          1
0                                          
01F545238DCED9110E8DA50B4AB8E2DE5CE4485E  0
01F64389FBBF640580EBF1DC05DBDC3DCFD047FF  1
asd
Index(['01F545238DCED9110E8DA50B4AB8E2DE5CE4485E', '01F64389FBBF640580EBF1DC05DBDC3DCFD047FF'], dtype='object', name=0)
asd
                                          ground_truth
0                                                     
01F545238DCED9110E8DA50B4AB8E2DE5CE4485E             0
01F64389FBBF640580EBF1DC05DBDC3DCFD047FF             1
asd
                                          ground_truth
0                                                     
01F545238DCED9110E8DA50B4AB8E2DE5CE4485E             0
01F64389FBBF640580EBF1DC05DBDC3DCFD047FF             1


In [26]:
print('Training Set:')
print('\tTotal',len(tr_table),'files')
print('\tMalware Count :',tr_table['ground_truth'].value_counts()[1])
print('\tGoodware Count:',tr_table['ground_truth'].value_counts()[0])


print('Validation Set:')
print('\tTotal',len(val_table),'files')
print('\tMalware Count :',val_table['ground_truth'].value_counts()[1])
print('\tGoodware Count:',val_table['ground_truth'].value_counts()[0])

Training Set:
	Total 2 files
	Malware Count : 1
	Goodware Count: 1
Validation Set:
	Total 2 files
	Malware Count : 1
	Goodware Count: 1


In [27]:
dataloader = DataLoader(ExeDataset(list(tr_table.index), train_data_path, list(tr_table.ground_truth), first_n_byte),
                       batch_size=batch_size, shuffle=True, num_workers=use_cpu)
validloader = DataLoader(ExeDataset(list(val_table.index), valid_data_path, list(val_table.ground_truth), first_n_byte),
                         batch_size=batch_size, shuffle=False, num_workers=use_cpu)
valid_idx = list(val_table.index)

In [28]:
malconv = MalConv(input_length=first_n_byte, window_size=window_size)
bce_loss = nn.BCEWithLogitsLoss()
adam_optim = optim.Adam([{'params':malconv.parameters()}], lr=learning_rate)
sigmoid = nn.Sigmoid()

In [29]:
step_msg = 'step-{}-loss-{:.6f}-acc-{:.4f}-time-{:.2f}'
valid_msg = 'step-{}-tr_loss-{:.6f}-tr_acc-{:.4f}-val_loss-{:.6f}-val_acc-{:.4f}'
log_msg = '{}, {:.6f}, {:.4f}, {:.6f}, {:.4f}, {:.2f}'
history = {}
history['tr_loss'] = []
history['tr_acc'] = []
print(history)

{'tr_loss': [], 'tr_acc': []}


In [30]:
valid_best_acc = 0.0
total_step = 0
step_cost_time = 0

In [31]:
for i in dataloader:
    print(i)
    break

[tensor([[ 78,  91,  81,  ...,   0,   0,   0],
        [ 78,  91, 145,  ...,   0,   0,   0]]), tensor([[1],
        [0]])]


In [41]:
while total_step < max_step:
    # Training
    for step, batch_data in enumerate(dataloader):
        break
        start = time.time()
        adam_optim.zero_grad()
        cur_batch_size = batch_data[0].size(0)

        exe_input = batch_data[0]
        exe_input = Variable(exe_input.long(), requires_grad=False)
        
        label = batch_data[1]
        label = Variable(label.float(), requires_grad=False)

        pred = malconv(exe_input)
        loss = bce_loss(pred, label)
        loss.backward()
        adam_optim.step()

        history['tr_loss'].append(loss.cpu().data.numpy())
        history['tr_acc'].extend(list(label.cpu().data.numpy().astype(int)==
                                      (sigmoid(pred).cpu().data.numpy()+0.5).astype(int)))

        step_cost_time = time.time()-start

        if (step+1) % display_step == 0:
            print(step_msg.format(total_step, np.mean(history['tr_loss']),
                                  np.mean(history['tr_acc']), step_cost_time), end='\r', flush=True)
        total_step += 1

        if total_step % test_step == 0:
            break

    # Testing
    history['val_loss'] = []
    history['val_acc'] = []
    history['val_pred'] = []
    
    for _,val_batch_data in enumerate(validloader):
        cur_batch_size = val_batch_data[0].size(0)

        exe_input = val_batch_data[0].cuda() if use_gpu else val_batch_data[0]
        exe_input = Variable(exe_input.long(),requires_grad=False)

        label = val_batch_data[1].cuda() if use_gpu else val_batch_data[1]
        label = Variable(label.float(),requires_grad=False)

        pred = malconv(exe_input)
        print(len(exe_input[0]))
        loss = bce_loss(pred,label)

        history['val_loss'].append(loss.cpu().data.numpy())
        history['val_acc'].extend(list(label.cpu().data.numpy().astype(int)==(sigmoid(pred).cpu().data.numpy()+0.5).astype(int)))
        history['val_pred'].append(list(sigmoid(pred).cpu().data.numpy()))

    if valid_best_acc < np.mean(history['val_acc']):
        valid_best_acc = np.mean(history['val_acc'])
        torch.save(malconv,chkpt_acc_path)
        print('Checkpoint saved at',chkpt_acc_path)
        write_pred(history['val_pred'],valid_idx,pred_path)
        print('Prediction saved at', pred_path)
    break
    history['tr_loss'] = []
    history['tr_acc'] = []

2000000
Checkpoint saved at checkpoint2/example_sd_99.model
Prediction saved at pred2/example_sd_99.pred


In [15]:
print(np.mean(history['tr_acc']))
print(np.mean(history['tr_loss']))

0.975
0.049859595
