In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Create the aicentro session and the framework object bound to it.
# NOTE(review): verify=False presumably disables TLS certificate verification
# for the session — confirm this is intended outside a sandbox environment.
from aicentro.session import Session
sacp_session = Session(verify=False)
from aicentro.framework.framework import BaseFramework as SacpFrm
sacp_framework = SacpFrm(session=sacp_session)

###  <b>데이터 불러오기</b>
---
- 학습 데이터 불러오기

In [None]:
# Read the comma-separated training set from the working directory.
Train_Data = pd.read_csv('TrainData.csv', sep=',')

### <b>데이터 전처리</b>
---

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
Train_Data = Train_Data.drop_duplicates(keep='first')

In [None]:
# Encode the target column: benign -> 1, malicious -> -1.
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: the in-place call operates on an intermediate object and,
# with pandas Copy-on-Write, never updates Train_Data itself.
Train_Data['Result_v1'] = Train_Data['Result_v1'].replace({'benign':1,'malicious':-1})

In [None]:
# Discard the two feature columns that are not used for modelling.
Train_Data = Train_Data.drop(
    columns=["url_chinese_present", "html_num_tags('applet')"],
)

In [None]:
# Drop every row that contains at least one missing value.
Train_Data = Train_Data.dropna(axis='index', how='any')

In [None]:
# Features: every column except the last one. Target: the last column.
X = Train_Data.iloc[:, :-1].values
y = Train_Data.iloc[:, -1].values

### <b> train_test_split을 이용하여, train_x, val_x, train_y, val_y로 데이터 분리</b>
---

- test_size = 0.3
- random_state = 2021

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
train_x, val_x, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2021,
)

In [None]:
# Show the shape of each split: train features, val features, train labels, val labels.
tuple(part.shape for part in (train_x, val_x, train_y, val_y))

### <b>Confusion Matrix 함수 정의</b>
---

- Confusion Matrix란?
  - Training을 통한 Prediction 성능을 측정하기 위해 예측 value와 실제 value를 비교하기 위한 표
  - 참고 사이트 : https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
 
 


In [None]:
from sklearn.metrics import classification_report as creport
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [None]:
def plot_confusion_matrix(ax, matrix, labels = ['malicious','benign'], title='Confusion matrix', fontsize=9):
    """Draw a confusion matrix on ``ax`` as a row-normalised heat map with counts.

    Each cell is coloured by its share of its row total and annotated with the
    raw count (zero counts are left blank). The first row of ``matrix`` is
    drawn at the top of the plot.

    Args:
        ax: Matplotlib Axes to draw on.
        matrix: 2-D array-like of counts; rows are plotted on the 'actual'
            axis and columns on the 'prediction' axis, in the order of
            ``labels``.
        labels: Class names, one per row/column of ``matrix``. Not modified.
        title: Title placed above the plot.
        fontsize: Font size used for tick labels, counts, title and axis labels.

    Side effects:
        Calls ``plt.show()`` after drawing.
    """
    counts = np.asarray(matrix)
    n_labels = len(labels)

    ax.set_xticks(list(range(n_labels)))
    ax.set_yticks(list(range(n_labels)))

    # Place labels on minor ticks so they sit in the middle of each cell.
    # rotation must be a number: recent matplotlib rejects the string '90'.
    ax.set_xticks([pos + 0.5 for pos in range(n_labels)], minor=True)
    ax.set_xticklabels(labels, rotation=90, fontsize=fontsize, minor=True)
    ax.set_yticks([pos + 0.5 for pos in range(n_labels)], minor=True)
    # pcolor draws row 0 at the bottom, so the y labels are reversed to match
    # the reversed rows plotted below.
    ax.set_yticklabels(labels[::-1], fontsize=fontsize, minor=True)

    # Hide major tick labels. These flags must be booleans: the string 'off'
    # is truthy and would leave the labels visible.
    ax.tick_params(which='major', labelbottom=False, labelleft=False)

    # Finally, hide minor tick marks
    ax.tick_params(which='minor', width=0)

    # Plot heat map of per-row proportions. A row whose total is zero is left
    # at 0 instead of dividing by zero and producing NaN cells.
    counts_f = counts.astype(float)
    row_totals = counts_f.sum(axis=1, keepdims=True)
    proportions = np.divide(counts_f, row_totals,
                            out=np.zeros_like(counts_f),
                            where=row_totals != 0)
    ax.pcolor(proportions[::-1], cmap=plt.cm.Blues)

    # Plot counts as text, using the same reversed row order as the heat map.
    for row_idx, row_counts in enumerate(counts[::-1]):
        for col_idx, count in enumerate(row_counts):
            if count != 0:
                ax.text(col_idx + 0.5, row_idx + 0.5, int(count),
                        fontsize=fontsize,
                        horizontalalignment='center',
                        verticalalignment='center')

    # Add finishing touches
    ax.grid(True, linestyle=':')
    ax.set_title(title, fontsize=fontsize)
    ax.set_xlabel('prediction', fontsize=fontsize)
    ax.set_ylabel('actual', fontsize=fontsize)

    plt.show()

### <b>Q2. DecisionTree 모델을 만들어보자</b>

In [None]:
# 1. import
from sklearn.tree import DecisionTreeClassifier


In [None]:
# 2. Instantiate the classifier.
# A fixed random_state (the same seed as the train/validation split) makes the
# fitted tree, its scores and its feature importances repeat across re-runs.
dtc = DecisionTreeClassifier(random_state=2021)



In [None]:
# 3. fit(): train the decision tree on the training split
dtc.fit(X=train_x, y=train_y)



In [None]:
# 4. predict(): predicted labels for the validation features
dtc_pred = dtc.predict(val_x)


In [None]:
# Accuracy via score(): (training split, validation split)
(
    dtc.score(train_x, train_y),
    dtc.score(val_x, val_y),
)


In [None]:
# Confusion matrix of the decision tree on the validation split
confusion = confusion_matrix(y_true=val_y, y_pred=dtc_pred)
fig, ax = plt.subplots(figsize=(10, 3))
plot_confusion_matrix(ax=ax, matrix=confusion, fontsize=30)




In [None]:
# Horizontal bar per feature column (all but the last column of Train_Data),
# sized by the decision tree's feature_importances_.
plt.figure(figsize=(20, 12))
plt.barh(Train_Data.columns[:-1], dtc.feature_importances_)
plt.show()

### <b>앙상블(Ensemble)</b>

### <b>Q3. Random Forest 모델을 만들어보자</b>

In [None]:
# 1. import
from sklearn.ensemble import RandomForestClassifier


In [None]:
# 2. Instantiate the classifier.
# A fixed random_state (the same seed as the train/validation split) makes the
# fitted forest, its scores and its feature importances repeat across re-runs.
rfc = RandomForestClassifier(random_state=2021)



In [None]:
# 3. fit(): train the random forest on the training split
rfc.fit(X=train_x, y=train_y)



In [None]:
# 4. predict(): predicted labels for the validation features
rfc_pred = rfc.predict(val_x)



In [None]:
# Accuracy via score(): (training split, validation split)
(
    rfc.score(train_x, train_y),
    rfc.score(val_x, val_y),
)



In [None]:
# Confusion matrix of the random forest on the validation split
confusion = confusion_matrix(y_true=val_y, y_pred=rfc_pred)
fig, ax = plt.subplots(figsize=(10, 3))
plot_confusion_matrix(ax=ax, matrix=confusion, fontsize=30)



In [None]:
# Horizontal bar per feature column (all but the last column of Train_Data),
# sized by the random forest's feature_importances_.
plt.figure(figsize=(20, 12))
plt.barh(Train_Data.columns[:-1], rfc.feature_importances_)
plt.show()

### <b>Q4. AdaBoost 모델을 만들어보자</b>

In [None]:
# 1. import
from sklearn.ensemble import AdaBoostClassifier


In [None]:
# 2. Instantiate the classifier.
# A fixed random_state (the same seed as the train/validation split) makes the
# fitted ensemble, its scores and its feature importances repeat across re-runs.
abc = AdaBoostClassifier(random_state=2021)


In [None]:
# 3. fit(): train the AdaBoost ensemble on the training split
abc.fit(X=train_x, y=train_y)


In [None]:
# 4. predict(): predicted labels for the validation features
abc_pred = abc.predict(val_x)


In [None]:
# Accuracy via score(): (training split, validation split)
(
    abc.score(train_x, train_y),
    abc.score(val_x, val_y),
)


In [None]:
# Confusion matrix of the AdaBoost ensemble on the validation split
confusion = confusion_matrix(y_true=val_y, y_pred=abc_pred)
fig, ax = plt.subplots(figsize=(10, 3))
plot_confusion_matrix(ax=ax, matrix=confusion, fontsize=30)


In [None]:
# Horizontal bar per feature column (all but the last column of Train_Data),
# sized by the AdaBoost ensemble's feature_importances_.
plt.figure(figsize=(20, 12))
plt.barh(Train_Data.columns[:-1], abc.feature_importances_)
plt.show()

### <b>Q5. Gradient Boost 모델을 만들어보자</b>

In [None]:
# 1. import
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# 2. Instantiate the classifier.
# A fixed random_state (the same seed as the train/validation split) makes the
# fitted ensemble, its scores and its feature importances repeat across re-runs.
gbc = GradientBoostingClassifier(random_state=2021)


In [None]:
# 3. fit(): train the gradient boosting ensemble on the training split
gbc.fit(X=train_x, y=train_y)


In [None]:
# 4. predict(): predicted labels for the validation features
gbc_pred = gbc.predict(val_x)


In [None]:
# Accuracy via score(): (training split, validation split)
(
    gbc.score(train_x, train_y),
    gbc.score(val_x, val_y),
)


In [None]:
# Confusion matrix of the gradient boosting ensemble on the validation split
confusion = confusion_matrix(y_true=val_y, y_pred=gbc_pred)
fig, ax = plt.subplots(figsize=(10, 3))
plot_confusion_matrix(ax=ax, matrix=confusion, fontsize=30)


In [None]:
# Horizontal bar per feature column (all but the last column of Train_Data),
# sized by the gradient boosting ensemble's feature_importances_.
plt.figure(figsize=(20, 12))
plt.barh(Train_Data.columns[:-1], gbc.feature_importances_)
plt.show()