## RNN 모델 N-Byte 방식 (함수정보 포함 vs 미포함 => 1:1 비율)

## (1) 데이터로드

In [1]:
# (1) 데이터로드
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

# 여러개 쳐도 나오게
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# 파일읽기
bin8_3 = pd.read_csv("../data/gcc8/o0123합본/o3/o3_gcc8.csv", index_col=0)
core8_3 = pd.read_csv("../data/gcc8/o0123합본/o3/o3_core8.csv", index_col=0)
print(bin8_3.shape, core8_3.shape)

# reset_index (hex processing 하면서 값이 빠졌으니까 + n_gram 에서 index를 다루기 때문에)
bin8_3.reset_index(inplace=True, drop=True)
core8_3.reset_index(inplace=True, drop=True)

print('bin - core concat')
bin8_3 = pd.concat([bin8_3, core8_3], axis=0)
print('shape -', bin8_3.shape)
print('reset_index 완료')
print('input data shape')
bin8_3.head()

(23151789, 2) (8632590, 2)
bin - core concat
shape - (31784379, 2)
reset_index 완료
input data shape


Unnamed: 0,bin,label
0,47,0
1,108,0
2,105,0
3,98,0
4,47,0


In [2]:
# (2-1) 데이터체크 1 - hex(16진수)가 256 label을 가져야 dummies 변환 가능 
# 16진수 256개 종류가 있어서 pd.get_dummies 사용 가능.
print(len(bin8_3['bin'].unique()))

# (2-2) 데이터 체크 2 - 1, 0 비율 ==> 1이 함수의 갯수를 뜻함
# 정답 데이터 1, 0 비율 확인  ==> 1이 함수의 갯수를 뜻함
print(bin8_3['label'].value_counts())

256
0    31747485
1       36894
Name: label, dtype: int64


## (3) N Byte씩 자르기

In [3]:
idx_bin = bin8_3[bin8_3['label']==1].index  # 407, 474 ...
ls_bin = list(idx_bin)

# 최종 뽑을 행에 대한 index
ls_idx_bin = []

# n byte 자르기 방식
left_idx, right_idx = 0, 32 # 3개씩

# n byte 자르기
for k in range(left_idx, right_idx):
    ls_idx_bin.extend(list(idx_bin + k)) # index 형이라서 가능

#ls_idx = list(set(ls_idx)) 
ls_idx_bin.sort() # 인덱스 정렬

# 1차 index 해당범위 초과한 것들 없애기
ls_idx_bin = list(filter(lambda x: x<len(bin8_3), ls_idx_bin))
print(len(ls_idx_bin))

# 2차 남은 index들 중 right_idx 나눈 나머지 없애기
sub_bin = len(ls_idx_bin)%(right_idx)
print('나머지', sub_bin)

ls_idx_bin = ls_idx_bin[:len(ls_idx_bin)-sub_bin]
print('최종 길이', len(ls_idx_bin))

print('bin8_3', len(ls_idx_bin))

# loc 로 수정필요
bin8_3_Ngram = bin8_3.loc[ls_idx_bin,:].copy()

1180608
나머지 0
최종 길이 1180608
bin8_3 1180608


## (4) false data 만들기

In [4]:
# false data 만들기 - False 데이터 랜덤 생성

# 목표치
goal_bin = len(bin8_3_Ngram)/right_idx
count_bin = 0

print(goal_bin)

# 최종 데이터 Frame
d_bin = pd.DataFrame(columns = bin8_3.columns)

binutils_df = []
# goal 에 도달할 때까지
while True:
    if (count_bin == goal_bin):
            break
    # 진행상황 살펴보기 위함
            
    # 랜덤 N 바이트씩 뽑음
    # random index
    random_idx_bin = np.random.randint(len(bin8_3)-right_idx)

    if count_bin % 1000==0:
        print(count_bin, end=' ')
        print(random_idx_bin)

    df_bin = bin8_3[random_idx_bin : random_idx_bin + right_idx]
    
    # 뽑은 index의 N 바이트 중에 1이 없는 경우만
    if 1 not in df_bin['label'] and count_bin < goal_bin:
        binutils_df.append(df_bin)
        count_bin+=1

print('완료')
print(len(binutils_df))

59304.0
0 12174778
1000 11014408
2000 8779104
3000 28596311
4000 4042078
5000 8166974
6000 23076382
7000 10377072
8000 30079130
9000 24102197
10000 27162250
11000 17103388
12000 13881942
13000 19645745
14000 518484
15000 2216015
16000 21805833
17000 31035856
18000 21416346
19000 23810086
20000 13268222
21000 27909485
22000 15515745
23000 21169645
24000 6397856
25000 3767346
26000 8502000
27000 22468682
28000 30938587
29000 28430502
30000 7682416
31000 4118949
32000 21124182
33000 23928286
34000 6524864
35000 1689815
36000 13096146
37000 20292902
38000 22089873
39000 27366713
40000 6487263
41000 31253839
42000 8942436
43000 26520824
44000 27968851
45000 12832678
46000 12534081
47000 333511
48000 27796909
49000 27849601
50000 4015560
51000 29748257
52000 5538927
53000 28536422
54000 19169449
55000 390296
56000 25347105
57000 2486563
58000 21115706
59000 9442919
완료
59304


In [5]:
# True data와 False Data 같은지 체크
print(len(binutils_df))
print(bin8_3['label'].value_counts()[1])

59304
36894


## (5) False Data + True Data 합치기

In [6]:
f_data = pd.concat(binutils_df)
final = pd.concat([f_data, bin8_3_Ngram])
final.shape

(3795456, 2)

## (6) one hot encoding

In [7]:
# 훈련데이터 (gcc 최적화버전 0, 1, 2, 3 one hot encoding)
bc8_3_onehot_Ngram = pd.get_dummies(final['bin'])
bc8_3_onehot_Ngram = pd.concat([final['label'], bc8_3_onehot_Ngram], axis=1)

print('원핫인코딩완료')
print(bc8_3_onehot_Ngram.shape)

원핫인코딩완료
(3795456, 257)


In [8]:
# 훈련 데이터, 훈련 라벨
x_bc8_3 = bc8_3_onehot_Ngram.iloc[:,1:].to_numpy()
y_bc8_3 = bc8_3_onehot_Ngram['label'].to_numpy()
print(x_bc8_3.shape, x_bc8_3.shape)

x_bc8_3 = x_bc8_3.reshape(-1, right_idx, x_bc8_3.shape[1])
y_bc8_3 = y_bc8_3.reshape(-1, right_idx, 1)

print(x_bc8_3.shape, y_bc8_3.shape)

(3795456, 256) (3795456, 256)
(118608, 32, 256) (118608, 32, 1)


In [9]:
# numpy 행, 열 섞기
p = np.random.permutation(x_bc8_3.shape[0])

x_bc8_3 = x_bc8_3[p]
y_bc8_3 = y_bc8_3[p]

print(x_bc8_3.shape, y_bc8_3.shape)

(118608, 32, 256) (118608, 32, 1)


## (7) 모델

In [10]:
# (10) 양방향 LSTM 모델링 작업
from tensorflow.keras import layers, models
#from tf.keras.models import Model, Sequential
#from tf.keras.layers import SimpleRNN, Input, Dense, LSTM
#from tf.keras.layers import Bidirectional, TimeDistributed

# 학습
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(patience = 3) # 조기종료 콜백함수 정의

xInput = layers.Input(batch_shape=(None,right_idx, 256)) 
xBiLstm = layers.Bidirectional(layers.LSTM(48, return_sequences=True, stateful=False), merge_mode = 'concat')(xInput)
xOutput = layers.TimeDistributed(layers.Dense(1, activation ='sigmoid'))(xBiLstm) # 각 스텝에서 cost가 전송되고, 오류가 다음 step으로 전송됨.

## (8) 학습 - 10 KFold

In [11]:
# 교차검증 kfold
from sklearn.model_selection import KFold

# Accuracy, Precision, Recall, F1-Score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Confusion Matrix, ROC Curve
from sklearn.metrics import confusion_matrix, roc_auc_score

# 최종 평가지표들 평균용
accuracy, recall, precision, f1score, cm = [], [], [], [], []

# 11. 교차검증 kfold - k.split - 10회 / K-Fold 객체 생성
# kf = KFold(n_splits=10, shuffle=False, random_state=None) # KFold non shuffle 버전
kf = KFold(n_splits=10, shuffle=True, random_state=None) # KFold non shuffle 버전

for train, validation in kf.split(x_bc8_3, y_bc8_3):
    model1 = models.Model(xInput, xOutput)
    model1.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model1.summary()
    print('======Training stage======')
    model1.fit(x_bc8_3[train],
               y_bc8_3[train],
               epochs = 10,
               batch_size = 128,
               callbacks=[early_stopping])
    #k_accuracy = '%.4f' %(model1.evaluate(data_10000x[validation], data_10000y[validation])[1])

# 12. 교차검증결과 predict - 검증셋들
    # predict 값
    k_pr = model1.predict(x_bc8_3[validation])
    
    # 테스트 predict 결과들 비교 (평가지표 보기위함)
    pred = np.round(np.array(k_pr).flatten().tolist())
    y_test = np.array(y_bc8_3[validation]).flatten().tolist()
    
# 13. 평가지표들 출력
    ## 평가지표들
    k_accuracy = float(accuracy_score(y_test, pred))
    k_recall =  float(recall_score(y_test, pred))
    k_precision = float(precision_score(y_test, pred))
    k_f1_score = float(f1_score(y_test, pred))
    #k_cm = float(confusion_matrix(y_test, pred))
    
    print('accuracy_score', k_accuracy)
    print('recall_score', k_recall)
    print('precision_score', k_precision)
    print('f1_score', k_f1_score)
    #print('\nconfusion_matrix\n', k_cm)

    accuracy.append(k_accuracy)
    recall.append(k_recall)
    precision.append(k_precision)
    f1score.append(k_f1_score)
    #cm.append(k_cm)
#    print('roc_curve 면적', roc_auc_score(y_test, pred))

# 최종 결과지표
print('\nK-fold cross validation Accuracy: {}'.format(accuracy))
print('\nK-fold cross validation Recall: {}'.format(recall))
print('\nK-fold cross validation Precision: {}'.format(precision))
print('\nK-fold cross validation F1-Score: {}'.format(f1score))
#print('\nK-fold cross validation ConfusionMatrix: {}'.format(cm))

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106747 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2cc48c21c48>

accuracy_score 0.9991674395076301
recall_score 0.972422571064913
precision_score 0.9610062893081761
f1_score 0.966680725432307
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106747 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ce683d7a88>

accuracy_score 0.9993492327796981
recall_score 0.9860730111837941
precision_score 0.9627111660486197
f1_score 0.9742520587928698
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106747 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ce673c1248>

accuracy_score 0.9997576089705759
recall_score 0.9902044293015332
precision_score 0.9902044293015332
f1_score 0.9902044293015332
Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106747 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ce6d34cc48>

accuracy_score 0.9998287454683417
recall_score 0.9912393162393163
precision_score 0.9948530988633927
f1_score 0.9930429198330302
Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106747 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ce6d1a7d08>

accuracy_score 0.9998972472810049
recall_score 0.9938034188034188
precision_score 0.9978545376528641
f1_score 0.995824858152232
Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106747 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ce7ee24608>

accuracy_score 0.9999657490936683
recall_score 0.9984759416503375
precision_score 0.9986933797909407
f1_score 0.9985846488840501
Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106747 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ce6ca2eb48>

accuracy_score 0.9999367675575416
recall_score 0.9978773084270856
precision_score 0.9970307529162248
f1_score 0.9974538510502864
Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106747 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ce6f95b348>

accuracy_score 0.9999789225191805
recall_score 0.998502353444587
precision_score 0.9997857754927164
f1_score 0.999143652322843
Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106748 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ce70971b08>

accuracy_score 0.9999815556492412
recall_score 0.9987315010570824
precision_score 0.9997883597883598
f1_score 0.9992596509783183
Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 106748 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2ce67fbbf88>

accuracy_score 0.9999894603709949
recall_score 0.9997878659312686
precision_score 0.9993638676844784
f1_score 0.999575821845175

K-fold cross validation Accuracy: [0.9991674395076301, 0.9993492327796981, 0.9997576089705759, 0.9998287454683417, 0.9998972472810049, 0.9999657490936683, 0.9999367675575416, 0.9999789225191805, 0.9999815556492412, 0.9999894603709949]

K-fold cross validation Recall: [0.972422571064913, 0.9860730111837941, 0.9902044293015332, 0.9912393162393163, 0.9938034188034188, 0.9984759416503375, 0.9978773084270856, 0.998502353444587, 0.9987315010570824, 0.9997878659312686]

K-fold cross validation Precision: [0.9610062893081761, 0.9627111660486197, 0.9902044293015332, 0.9948530988633927, 0.9978545376528641, 0.9986933797909407, 0.9970307529162248, 0.9997857754927164, 0.9997883597883598, 0.9993638676844784]

K-fold cross validation F1-Score: [0.966680725432307, 0.9742520587928698, 0.9902044293015332, 0.9930429198330302, 0.995824858152232, 0.9985846488840501, 0.99745385105

## (9) 평가지표

In [12]:
print('10-Fold Cross_validation. Accuracy :', np.mean(accuracy))
print('10-Fold Cross_validation. Recall :', np.mean(recall))
print('10-Fold Cross_validation. Precision :', np.mean(precision))
print('10-Fold Cross_validation. F1-Score :', np.mean(f1score))

10-Fold Cross_validation. Accuracy : 0.9997852729197877
10-Fold Cross_validation. Recall : 0.9927117717103338
10-Fold Cross_validation. Precision : 0.9901291656847306
10-Fold Cross_validation. F1-Score : 0.9914022616592645


In [13]:
model1.save('gcc8_bin_core_s32_h48_o3.h5')
print('save 완료')

save 완료
