## RNN 모델 N-Byte 방식 (함수정보 포함 vs 미포함 => 1:1 비율)

## (1) 데이터로드

In [1]:
# (1) 데이터로드
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

# 여러개 쳐도 나오게
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# 파일읽기
bin6_0 = pd.read_csv("data/"+'gcc6'+"_0_.csv", index_col=0)
core6_0 = pd.read_csv("data/"+"core_gcc6_0_.csv", index_col=0)

# 형태 출력
print(bin6_0.shape, core6_0.shape)

# reset_index (hex processing 하면서 값이 빠졌으니까 + n_gram 에서 index를 다루기 때문에)
bin6_0.reset_index(inplace=True, drop=True)
core6_0.reset_index(inplace=True, drop=True)

print('reset_index 완료')
print('input data shape')
bin6_0.head()
core6_0.head()

(13784128, 2) (4113302, 2)
reset_index 완료
input data shape


Unnamed: 0,bin,label
0,85.0,1
1,137.0,0
2,229.0,0
3,86.0,0
4,83.0,0


In [2]:
# (2-1) 데이터체크 1 - hex(16진수)가 256 label을 가져야 dummies 변환 가능 
# 16진수 256개 종류가 있어서 pd.get_dummies 사용 가능.
print(len(bin6_0['bin'].unique()))
print(len(core6_0['bin'].unique()))

# (2-2) 데이터 체크 2 - 1, 0 비율 ==> 1이 함수의 갯수를 뜻함
# 정답 데이터 1, 0 비율 확인  ==> 1이 함수의 갯수를 뜻함
print(bin6_0['label'].value_counts())
print(core6_0['label'].value_counts())

256
256
0    13750881
1       33247
Name: label, dtype: int64
0    4096245
1      17057
Name: label, dtype: int64


## (3) N Byte씩 자르기

In [3]:
idx_bin = bin6_0[bin6_0['label']==1].index  # 407, 474 ...
idx_core = core6_0[core6_0['label']==1].index  # 407, 474 ...

ls_bin = list(idx_bin)
ls_core = list(idx_core)

# 최종 뽑을 행에 대한 index
ls_idx_bin, ls_idx_core = [], [] 

# n byte 자르기 방식
left_idx, right_idx = 0, 32 # 3개씩

# n byte 자르기
for k in range(left_idx, right_idx):
    ls_idx_bin.extend(list(idx_bin + k)) # index 형이라서 가능
    ls_idx_core.extend(list(idx_core + k))

#ls_idx = list(set(ls_idx)) 
ls_idx_bin.sort() # 인덱스 정렬
ls_idx_core.sort() # 인덱스 정렬


# 1차 index 해당범위 초과한 것들 없애기
ls_idx_bin = list(filter(lambda x: x<len(bin6_0), ls_idx_bin))
ls_idx_core = list(filter(lambda x: x<len(core6_0), ls_idx_core))
print(len(ls_idx_bin), len(ls_idx_core))

# 2차 남은 index들 중 right_idx 나눈 나머지 없애기
sub_bin = len(ls_idx_bin)%(right_idx)
sub_core = len(ls_idx_core)%(right_idx)
print('나머지', sub_bin, sub_core)

ls_idx_bin = ls_idx_bin[:len(ls_idx_bin)-sub_bin]
ls_idx_core = ls_idx_core[:len(ls_idx_core)-sub_core]
print('최종 길이', len(ls_idx_bin), len(ls_idx_core))

print('bin6_0', len(ls_idx_bin), len(ls_idx_core))

# loc 로 수정필요
bin6_0_Ngram = bin6_0.loc[ls_idx_bin,:].copy()
core6_0_Ngram = core6_0.loc[ls_idx_core,:].copy()

1063904 545824
나머지 0 0
최종 길이 1063904 545824
bin6_0 1063904 545824


## (4) false data 만들기

In [4]:
# false data 만들기 - False 데이터 랜덤 생성

# 목표치
goal_bin = len(bin6_0_Ngram)/right_idx
count_bin = 0

goal_core = len(core6_0_Ngram)/right_idx
count_core = 0

print(goal_bin, goal_core)

# 최종 데이터 Frame
d_bin = pd.DataFrame(columns = bin6_0.columns)
d_core = pd.DataFrame(columns = core6_0.columns)

binutils_df, coreutils_df = [], []
# goal 에 도달할 때까지
while True:
    if (count_bin == goal_bin) and (count_core == goal_core):
            break
    # 진행상황 살펴보기 위함
            
    # 랜덤 N 바이트씩 뽑음
    # random index
    random_idx_bin = np.random.randint(len(bin6_0)-right_idx)
    random_idx_core = np.random.randint(len(core6_0)-right_idx)

    if count_bin % 1000==0:
        print(count_bin, end=' ')
        print(random_idx_bin, random_idx_core)

    df_bin = bin6_0[random_idx_bin : random_idx_bin + right_idx]
    df_core = core6_0[random_idx_core : random_idx_core + right_idx]
    
    # 뽑은 index의 N 바이트 중에 1이 없는 경우만
    if 1 not in df_bin['label'] and count_bin < goal_bin:
        binutils_df.append(df_bin)
        count_bin+=1
        
    if (1 not in df_core['label']) and count_core <goal_core:
        coreutils_df.append(df_core)
        count_core+=1

print('완료')
print(len(binutils_df), len(coreutils_df))

33247.0 17057.0
0 4269999 571779
1000 8759947 3333806
2000 4284171 457596
3000 10456316 1523758
4000 678545 3434275
5000 8928510 753457
6000 8736443 1944455
7000 9892945 1998893
8000 7203964 1962512
9000 10932068 3104786
10000 7726791 651369
11000 3466775 3940832
12000 11945541 1598963
13000 2128526 3442720
14000 2569494 1607437
15000 1948599 2730703
16000 4049658 3109064
17000 2555967 1295587
18000 12479257 3448192
19000 558080 47049
20000 468709 1625668
21000 8018770 822066
22000 4774959 604980
23000 192007 1060515
24000 11131805 3182900
25000 5903345 1812188
26000 8573447 869242
27000 13065865 1578585
28000 8224843 2598263
29000 6150530 1146188
30000 13713664 4066413
31000 5285754 565012
32000 11836513 426313
33000 3364745 1777812
완료
33247 17057


In [5]:
# True data와 False Data 같은지 체크
print(len(binutils_df) + len(coreutils_df))
print(bin6_0['label'].value_counts()[1] + core6_0['label'].value_counts()[1])

50304
50304


In [6]:
f_bin_data = pd.concat(binutils_df)
f_core_data = pd.concat(coreutils_df)
f_data = pd.concat([f_bin_data, f_core_data], axis=0)
f_data.shape

(1609728, 2)

## (5) False Data + True Data 합치기

In [7]:
final = pd.concat([f_data, bin6_0_Ngram])
final.shape

(2673632, 2)

## (6) one hot encoding

In [8]:
# 훈련데이터 (gcc 최적화버전 0, 1, 2, 3 one hot encoding)
bc6_0_onehot_Ngram = pd.get_dummies(final['bin'])
bc6_0_onehot_Ngram = pd.concat([final['label'], bc6_0_onehot_Ngram], axis=1)

print('원핫인코딩완료')
print(bc6_0_onehot_Ngram.shape)

원핫인코딩완료
(2673632, 257)


In [9]:
# 훈련 데이터, 훈련 라벨
x_bc6_0 = bc6_0_onehot_Ngram.iloc[:,1:].to_numpy()
y_bc6_0 = bc6_0_onehot_Ngram['label'].to_numpy()
print(x_bc6_0.shape, x_bc6_0.shape)

x_bc6_0 = x_bc6_0.reshape(-1, right_idx, x_bc6_0.shape[1])
y_bc6_0 = y_bc6_0.reshape(-1, right_idx, 1)

print(x_bc6_0.shape, y_bc6_0.shape)

(2673632, 256) (2673632, 256)
(83551, 32, 256) (83551, 32, 1)


In [10]:
# numpy 행, 열 섞기
p = np.random.permutation(x_bc6_0.shape[0])

x_bc6_0 = x_bc6_0[p]
y_bc6_0 = y_bc6_0[p]

print(x_bc6_0.shape, y_bc6_0.shape)

(83551, 32, 256) (83551, 32, 1)


## (7) 모델

In [11]:
# (10) 양방향 LSTM 모델링 작업
from tensorflow.keras import layers, models
#from tf.keras.models import Model, Sequential
#from tf.keras.layers import SimpleRNN, Input, Dense, LSTM
#from tf.keras.layers import Bidirectional, TimeDistributed

# 학습
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(patience = 3) # 조기종료 콜백함수 정의

xInput = layers.Input(batch_shape=(None,right_idx, 256)) 
xBiLstm = layers.Bidirectional(layers.LSTM(48, return_sequences=True), merge_mode = 'concat')(xInput)
xOutput = layers.TimeDistributed(layers.Dense(1, activation ='sigmoid'))(xBiLstm) # 각 스텝에서 cost가 전송되고, 오류가 다음 step으로 전송됨.

## (8) 학습 - 10 KFold

In [12]:
# 교차검증 kfold
from sklearn.model_selection import KFold

# Accuracy, Precision, Recall, F1-Score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Confusion Matrix, ROC Curve
from sklearn.metrics import confusion_matrix, roc_auc_score

# 최종 평가지표들 평균용
accuracy, recall, precision, f1score, cm = [], [], [], [], []

# 11. 교차검증 kfold - k.split - 10회 / K-Fold 객체 생성
# kf = KFold(n_splits=10, shuffle=False, random_state=None) # KFold non shuffle 버전
kf = KFold(n_splits=10, shuffle=True, random_state=None) # KFold non shuffle 버전

for train, validation in kf.split(x_bc6_0, y_bc6_0):
    model1 = models.Model(xInput, xOutput)
    model1.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model1.summary()
    print('======Training stage======')
    model1.fit(x_bc6_0[train],
               y_bc6_0[train],
               epochs = 10,
               batch_size = 32,
               callbacks=[early_stopping])
    #k_accuracy = '%.4f' %(model1.evaluate(data_10000x[validation], data_10000y[validation])[1])

# 12. 교차검증결과 predict - 검증셋들
    # predict 값
    k_pr = model1.predict(x_bc6_0[validation])
    
    # 테스트 predict 결과들 비교 (평가지표 보기위함)
    pred = np.round(np.array(k_pr).flatten().tolist())
    y_test = np.array(y_bc6_0[validation]).flatten().tolist()
    
# 13. 평가지표들 출력
    ## 평가지표들
    k_accuracy = float(accuracy_score(y_test, pred))
    k_recall =  float(recall_score(y_test, pred))
    k_precision = float(precision_score(y_test, pred))
    k_f1_score = float(f1_score(y_test, pred))
    #k_cm = float(confusion_matrix(y_test, pred))
    
    print('accuracy_score', k_accuracy)
    print('recall_score', k_recall)
    print('precision_score', k_precision)
    print('f1_score', k_f1_score)
    #print('\nconfusion_matrix\n', k_cm)

    accuracy.append(k_accuracy)
    recall.append(k_recall)
    precision.append(k_precision)
    f1score.append(k_f1_score)
    #cm.append(k_cm)
#    print('roc_curve 면적', roc_auc_score(y_test, pred))

# 최종 결과지표
print('\nK-fold cross validation Accuracy: {}'.format(accuracy))
print('\nK-fold cross validation Recall: {}'.format(recall))
print('\nK-fold cross validation Precision: {}'.format(precision))
print('\nK-fold cross validation F1-Score: {}'.format(f1score))
#print('\nK-fold cross validation ConfusionMatrix: {}'.format(cm))

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75195 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f231773488>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75196 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f24fb4a1c8>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75196 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f4672a0d88>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75196 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f46687e588>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75196 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f46ce8b508>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75196 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f4630d7dc8>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75196 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f4888425c8>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75196 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f23182bf88>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75196 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f2500e9c08>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0
Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 256)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 96)            117120    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 1)             97        
Total params: 117,217
Trainable params: 117,217
Non-trainable params: 0
_________________________________________________________________
Train on 75196 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f4888c22c8>

accuracy_score 1.0
recall_score 1.0
precision_score 1.0
f1_score 1.0

K-fold cross validation Accuracy: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

K-fold cross validation Recall: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

K-fold cross validation Precision: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

K-fold cross validation F1-Score: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


## (9) 평가지표

In [13]:
print('10-Fold Cross_validation. Accuracy :', np.mean(accuracy))
print('10-Fold Cross_validation. Recall :', np.mean(recall))
print('10-Fold Cross_validation. Precision :', np.mean(precision))
print('10-Fold Cross_validation. F1-Score :', np.mean(f1score))

10-Fold Cross_validation. Accuracy : 1.0
10-Fold Cross_validation. Recall : 1.0
10-Fold Cross_validation. Precision : 1.0
10-Fold Cross_validation. F1-Score : 1.0
