In [1]:
# default module
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [3]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides warnings raised by library code (e.g. deprecations).
import warnings
warnings.filterwarnings('ignore')

###### 랜덤 포레스트 중요도에 따른 피처 선택

In [4]:
def select_important_feautre(columns, feature_importance, num):
    """Return the names of the `num` most important features.

    Parameters
    ----------
    columns : iterable
        Feature names, aligned element-for-element with `feature_importance`.
    feature_importance : iterable of numbers
        Importance score of each feature (e.g. a model's `feature_importances_`).
    num : int
        How many feature names to return.

    Returns
    -------
    list
        Feature names ordered by descending importance. Features with equal
        importance keep their original relative order. If `num` exceeds the
        number of features, all features are returned; if `num` is zero or
        negative, the list is empty.
    """
    # Rank (name, importance) pairs directly. Using the importance value as a
    # lookup key would make features with identical scores overwrite each
    # other, so one name would be returned repeatedly and the rest lost.
    ranked = sorted(
        zip(columns, feature_importance),
        key=lambda pair: pair[1],
        reverse=True,  # sorted() stays stable with reverse=True
    )
    return [name for name, _ in ranked[:max(num, 0)]]

In [5]:
def cross_val_score_custom(model, x, y, cv, dev = 0):
    """Cross-validate `model` and return the mean macro-averaged F1 score.

    For every split produced by `cv`, the model is refitted on the training
    rows and scored on the held-out rows with the unweighted mean of the
    per-class F1 scores.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    x : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Labels aligned row-for-row with `x`.
    cv : splitter exposing ``split(X, y)`` (e.g. ``KFold``, ``StratifiedKFold``).
    dev : truthy/falsy, default 0
        When truthy, print the score of every fold.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    print('\n\n')

    fold_scores = []

    # y is forwarded so that label-aware splitters work too; KFold ignores it.
    for train_idx, test_idx in cv.split(x, y):

        new_x_train = x.iloc[train_idx, :]
        new_y_train = y.iloc[train_idx]

        new_x_test = x.iloc[test_idx, :]
        new_y_test = y.iloc[test_idx]

        model.fit(new_x_train, new_y_train)

        # average=None yields one F1 per class; their plain mean is macro F1.
        fold_f1 = np.mean(f1_score(new_y_test, model.predict(new_x_test), average=None))

        if dev:
            print('dev Mode # : {}'.format(fold_f1))

        fold_scores.append(fold_f1)

    print('\n\n')

    return np.mean(fold_scores)

In [6]:
def get_oof(model, x_train, y_train, x_test, n_splits=5):
    """Produce out-of-fold (OOF) predictions of `model` for stacking.

    The labels are label-encoded and the model is refitted on each training
    split of an unshuffled ``KFold``. Every training row receives the
    prediction of the fold model that did not see it; every test row receives
    the majority vote of the `n_splits` fold models.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    x_train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y_train : array-like
        Labels aligned row-for-row with `x_train`.
    x_test : array-like with ``.shape``
        Features to predict; passed unchanged to ``model.predict``.
    n_splits : int, default 5
        Number of folds. It sizes both the KFold splitter and the buffer that
        stores the per-fold test predictions, so the two can never disagree.

    Returns
    -------
    (oof_train, oof_test) : tuple of numpy.ndarray
        Column vectors of shape ``(len(x_train), 1)`` and ``(len(x_test), 1)``
        holding predictions in the original label space.
    """
    le = LabelEncoder()
    # A fresh Series gives the encoded labels a 0..n-1 index for .iloc below.
    y_encoded = pd.Series(le.fit_transform(y_train))

    oof_train = np.zeros((x_train.shape[0],))
    fold_test_preds = np.empty((n_splits, x_test.shape[0]))

    for i, (train_idx, valid_idx) in enumerate(KFold(n_splits=n_splits).split(x_train)):

        model.fit(x_train.iloc[train_idx, :], y_encoded.iloc[train_idx])

        oof_train[valid_idx] = model.predict(x_train.iloc[valid_idx, :])
        fold_test_preds[i, :] = model.predict(x_test)

    # Majority vote over the fold models for each test row (one column per row).
    oof_test = pd.DataFrame(fold_test_preds).apply(lambda v: v.value_counts().idxmax(), axis=0)

    # The buffers are float; cast back to integer class codes before decoding.
    oof_train = le.inverse_transform(oof_train.astype(int))
    oof_test = le.inverse_transform(oof_test.values.astype(int))

    return oof_train.reshape(-1,1), oof_test.reshape(-1,1)

###### 데이터 로드

In [9]:
# Load the feature table.
x_data = pd.read_csv('./data/data.csv')
# Load the labels.
y_data = pd.read_excel('./data/label.xlsx')
# Drop the last four characters of every hash and upper-case the remainder so
# it can be used as the join key in the merge below.
# (presumably this strips a 4-character extension such as ".vir" — TODO confirm)
y_data['hash'] = y_data['hash'].apply(lambda v: v[:-4].upper())
# y_data.columns = ['hash', 'is_mal']

In [12]:
# Left-join the features onto the labels by `hash`; every labelled row is kept
# even when it has no matching feature row.
data = pd.merge(y_data, x_data, on='hash', how = 'left')

In [14]:
# Replace every missing value in the merged frame with 0.
data = data.fillna(0)

In [9]:
# data_extra = pd.read_csv('./data/data_extra.csv').drop(['Unnamed: 0'], axis = 1)

In [10]:
# data_extra['is_mal'] = 0

In [11]:
# data = pd.concat([data, data_extra], sort=False).drop_duplicates()

In [12]:
# for i, v in enumerate(data.columns):
#     print("{}: {}".format(i, v))

###### train test data split

In [17]:
# Positional split of the merged frame: column 1 becomes the target and
# columns 2 onward become the features.
# (assumes column 0 is `hash` and column 1 is the label — TODO confirm)
x_data, y_data = data.iloc[:, 2:], data.iloc[:, 1]

In [23]:
def get_cv_score(x_data, y_data, model, k):
    """Score `model` with shuffled k-fold cross-validation.

    The model is refitted on each training split of a ``KFold`` with
    ``shuffle=True`` and ``random_state=0``, then scored with ``model.score``
    on both the training rows and the held-out rows of that split.

    Parameters
    ----------
    x_data : feature table supporting positional ``.iloc`` row selection.
    y_data : labels aligned with `x_data`, also selected with ``.iloc``.
    model : estimator exposing ``fit`` and ``score``.
    k : number of folds.

    Returns
    -------
    (train_score, cv_score) : two lists with one score per fold.
    """
    splitter = KFold(k, shuffle=True, random_state=0)

    train_score, cv_score = [], []

    for fit_rows, held_out_rows in splitter.split(X=x_data, y=y_data):
        x_fit, y_fit = x_data.iloc[fit_rows], y_data.iloc[fit_rows]
        x_held, y_held = x_data.iloc[held_out_rows], y_data.iloc[held_out_rows]

        model.fit(x_fit, y_fit)

        train_score.append(model.score(x_fit, y_fit))
        cv_score.append(model.score(x_held, y_held))

    return train_score, cv_score

In [24]:
# Random forest used by the cross-validation cell below: 500 trees, depth
# capped at 28, fixed seed, all CPU cores (n_jobs=-1).
rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1, max_depth=28)

In [25]:
# 5-fold cross-validation of `rf` on the labelled data; print the per-fold
# scores and their means for the training and held-out splits.
train_score, cv_score = get_cv_score(x_data=x_data, y_data=y_data, k=5, model=rf)
# print("========[{}]========".format(feature_set_name[nu]))
print("train list : {}".format(train_score))
print("cv list : {}".format(cv_score))
print("train: {} \n cv: {}".format(np.mean(train_score), np.mean(cv_score)))
print("\n")

train list : [0.999625, 0.99925, 0.999125, 0.999375, 0.9995]
cv list : [0.945, 0.9475, 0.945, 0.937, 0.9505]
train: 0.999375 
 cv: 0.945




In [15]:
# Load the test-set features and discard the 'Unnamed: 0' column.
x_test = pd.read_csv('./data/data_test.csv').drop(['Unnamed: 0'], axis = 1)

In [16]:
# Load the result CSV; only its 'hash' column is used by the cells below.
result = pd.read_csv('./data/result_malware_student_songwonho.csv')

In [17]:
# Display the loaded test features.
x_test

Unnamed: 0,hash,TimeDateStamp,MajorLinkerVersion,MinorOperatingSystemVersion,MajorSubsystemVersion,SizeOfStackCommit,SizeOfUninitializedData,NumberOfRvaAndSizes,SizeOfHeapReserve,SizeOfHeapCommit,...,loadlibrarya,freelibrary,releasemutex,api_etc,createmutexa,createmutexw,openmutexa,FunctionCnt,FunctionLibCnt,FunctionNoRetCnt
0,000384C2E991519DB9B49C8FF39E8E89,1260053446,6,0,4,4096,1024,16,1048576,4096,...,1,1,0,0,0,0,0,87,6,1
1,000C8778DAA1276584679CD21F2037ED,708992537,2,0,4,16384,61440,16,1048576,4096,...,0,0,0,0,0,0,0,1,0,0
2,001193DED99300A380F6DCA28C61236D,1144432766,6,0,4,4096,1024,16,1048576,4096,...,1,1,0,0,0,0,0,80,6,1
3,00130CDF56DEB3A8F894E29565D5A760,0,2,0,4,4096,1024,16,1048576,4096,...,0,0,0,0,0,0,0,99,32,4
4,00131ECCFDFAD08DABA3742B7142258A,1373857134,9,0,4,4096,0,16,1048576,4096,...,0,0,0,0,0,0,0,17,1,0
5,0023077CC67A46FDE163AF616910D6CC,1345687972,8,0,4,4096,0,16,1048576,4096,...,0,0,0,0,0,0,0,421,0,0
6,002768F23D157D643138C798890B3B64,1485906012,10,1,5,4096,0,16,1048576,4096,...,0,0,0,0,0,0,0,689,307,10
7,0029A2BBE72A871392CF0ECD7E0AFFE7,1260053452,6,0,4,4096,1024,16,1048576,4096,...,1,1,0,0,0,0,0,86,6,1
8,002DC4A8990FFEC5DFAEAED8FECE41DE,1420504568,5,0,4,4096,0,16,1048576,4096,...,0,0,0,0,0,0,0,7,4,0
9,003173C2FB52AEF695D83EC529D5C70D,1327116325,8,0,4,4096,0,16,1048576,4096,...,0,0,0,0,0,0,0,305,0,0


In [19]:
# Keep only the hash column, dropping the last four characters and upper-casing
# the rest (the same normalisation applied to the label hashes).
# NOTE(review): this rebinds `result` from a DataFrame to a Series, so the cell
# cannot be re-run without reloading `result` first.
result = result['hash'].apply(lambda v: v[:-4].upper())

In [21]:
# Convert the hash Series back into a one-column DataFrame for the merge below.
result = result.to_frame()

In [22]:
# Left-join the test features onto the hash list by `hash`, keeping every hash
# in `result`. NOTE(review): this rebinds `data`, which held the training frame.
data = pd.merge(result, x_test, on='hash', how = 'left')

In [26]:
# Replace every missing value in the merged test frame with 0.
data = data.fillna(0)

In [29]:
# Remove the join key so only feature columns remain.
data = data.drop(['hash'], axis = 1)

In [32]:
# From here on `x_test` is the merged, NA-filled, hash-free test feature frame.
x_test = data

In [33]:
# Layer-1 base learners passed to get_oof in the next cell.
#
# `min_impurity_split` is no longer passed to the random forest: it was only
# ever given as None (its default), and the parameter was removed in
# scikit-learn 1.0, where passing it raises TypeError.
# NOTE(review): random_state=None leaves this forest unseeded, so its results
# vary from run to run.
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# NOTE(review): `silent`, `nthread`, `seed` and `missing=None` look like legacy
# arguments — confirm they are accepted by the installed xgboost version.
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.2, learning_rate=0.1,
       max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
       n_estimators=600, n_jobs=1, nthread=1, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.8)

tree = DecisionTreeClassifier(criterion='entropy', max_depth=28, random_state=0)

In [34]:
# Layer 1 of the stack: out-of-fold predictions of each base learner on the
# training rows, plus fold-majority-vote predictions on the test rows.
#
# NOTE(review): no visible cell defines x_train / y_train, so on a fresh kernel
# this cell raised NameError. They are bound here to the full labelled set
# (x_data / y_data) — confirm this is the intended training data.
x_train, y_train = x_data, y_data

oof_train_rf, oof_test_rf = get_oof(rf, x_train, y_train, x_test)
oof_train_xgb, oof_test_xgb = get_oof(xgb, x_train, y_train, x_test)
oof_train_tree, oof_test_tree = get_oof(tree, x_train, y_train, x_test)

In [35]:
# Layer-2 inputs: place the three base learners' (n, 1) prediction columns side
# by side, one column per learner, for the training and the test rows.
# Note that this rebinds x_train / x_test from raw features to meta-features.
x_train = pd.DataFrame(np.hstack([oof_train_rf, oof_train_xgb, oof_train_tree]))

x_test = pd.DataFrame(np.hstack([oof_test_rf, oof_test_xgb, oof_test_tree]))

In [36]:
# Display the random forest's out-of-fold predictions for the training rows.
oof_train_rf

array([[0],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

In [37]:
# Display the XGBoost model's out-of-fold predictions for the training rows.
oof_train_xgb

array([[0],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

In [38]:
# Display the decision tree's out-of-fold predictions for the training rows.
oof_train_tree

array([[0],
       [1],
       [0],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

In [39]:
# Label the meta-feature columns with the base learner that produced them.
x_train.columns = x_test.columns = ['rf', 'xgb', 'tree']

In [40]:
# Display the layer-2 training frame (one column of predictions per base learner).
x_train

Unnamed: 0,rf,xgb,tree
0,0,0,0
1,1,1,1
2,1,1,0
3,1,1,1
4,1,1,1
5,1,1,1
6,0,0,0
7,1,1,1
8,0,0,0
9,1,1,1


## layer 2

In [41]:
# Hard-vote baseline: for each test row, the most frequent of the three base
# learners' predictions.
x_test.apply(lambda v: v.value_counts().idxmax(), axis=1)

0       1
1       1
2       0
3       0
4       1
5       1
6       0
7       1
8       1
9       1
10      0
11      0
12      0
13      1
14      1
15      1
16      0
17      1
18      1
19      1
20      0
21      1
22      1
23      1
24      0
25      1
26      1
27      1
28      0
29      1
       ..
9970    1
9971    0
9972    0
9973    1
9974    1
9975    1
9976    0
9977    1
9978    1
9979    0
9980    1
9981    1
9982    1
9983    1
9984    1
9985    1
9986    0
9987    1
9988    1
9989    1
9990    0
9991    1
9992    1
9993    0
9994    0
9995    1
9996    0
9997    1
9998    1
9999    0
Length: 10000, dtype: int64

In [44]:
# Layer-2 meta-learner: a small seeded random forest (3 trees, depth capped at 3).
rf_l2 = RandomForestClassifier(n_estimators=3, random_state=3, max_depth=3, n_jobs=-1)

In [45]:
# Fit the meta-learner on the layer-1 predictions; y_train must hold the labels
# of the same rows that were passed to get_oof.
rf_l2.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=False)

In [47]:
# Score of the meta-learner on the same rows it was fitted on (training score).
print("훈련 세트 정확도: {:.3f}".format(rf_l2.score(x_train, y_train)))

훈련 세트 정확도: 0.952


In [49]:
# Preview the meta-learner's predictions for the test rows.
rf_l2.predict(x_test)

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

In [51]:
# Store the meta-learner's test predictions next to the hashes as `is_mal`.
result['is_mal'] = rf_l2.predict(x_test)

In [56]:
# Append '.vir' to each hash and lower-case the result.
# NOTE(review): not idempotent — re-running this cell appends '.vir' again.
result['hash'] = result['hash'].apply(lambda v: (v + '.vir').lower())

In [58]:
# Write the predictions to disk.
# NOTE(review): index=False is not passed, so the DataFrame index is written as
# an extra unnamed first column — confirm the expected submission format.
result.to_csv('./first_try.csv')

In [None]:
# Score the meta-learner against `y_cv`.
# NOTE(review): this cell has no execution count and `y_cv` is not defined in
# any visible cell — it needs a defining cell or should be removed.
print("크로스발리데이션 세트 정확도: {:.3f}".format(rf_l2.score(x_test, y_cv)))

In [49]:
# Per-class precision/recall/F1 of the decision tree on `x_cv` / `y_cv`.
# NOTE(review): `x_cv` and `y_cv` are not defined in any visible cell; the output
# below appears to come from an earlier session.
print(classification_report(y_cv, tree.predict(x_cv)))

             precision    recall  f1-score   support

          0       0.92      0.90      0.91      2244
          1       0.90      0.92      0.91      2101

avg / total       0.91      0.91      0.91      4345



In [270]:
from sklearn.metrics import accuracy_score


In [271]:
# Accuracy of the row-wise hard vote over the base learners' predictions,
# measured against `y_cv`.
# NOTE(review): `y_cv` is not defined in any visible cell.
accuracy_score(y_cv, x_test.apply(lambda v: v.value_counts().idxmax(), axis=1))

0.944