# 3. Random Forest Classifier

In [None]:
# 데이터 불러오기
# Normal / Abnormal 분리

import numpy as np
import pandas as pd
import time
from sklearn.metrics import roc_curve

# Load the dataset and split it into normal (label == 0) and abnormal
# (label == 1) rows. The first CSV column is dropped (presumably a saved
# index column -- TODO confirm against the file that produced final_data.csv).
df = pd.read_csv("final_data.csv").iloc[:, 1:]
abnormal = df[df['label'] == 1]
normal = df[df['label'] == 0]
print(normal.shape)
print(abnormal.shape)

# Columns used as model inputs.
feature = [
    'flux',
    'zero_month',
    'zero_week',
    'continuous_zero',
    'diff_week_day',
    'diff_month',
    'std_week',
    'diff_std',
    'rec_day',
    'rec_week',
    'rec_2week',
    'rec_month',
    'rec_day2',
    'rec_week2',
    'rec_2week2',
    'rec_month2',
]

# Number of times each abnormal half is replicated (oversampling).
abnormal_repeat = 20

# First half of the abnormal rows goes to training, second half to testing.
abnormal_half = abnormal.shape[0] // 2
abnormal_train = abnormal.iloc[:abnormal_half]
abnormal_test = abnormal.iloc[abnormal_half:]

# 20% of the normal rows are used for training. The test set takes the
# remaining 80% (the complement) so that no normal row appears in both sets;
# sampling the two fractions independently would leak training rows into the
# evaluation data.
normal_train = normal.sample(frac=0.2)
normal_test = normal.drop(normal_train.index)

# Build each frame with one concat: DataFrame.append was removed in
# pandas 2.0, and appending in a loop recopies the frame on every iteration.
train_df = pd.concat([normal_train] + [abnormal_train] * abnormal_repeat)
test = pd.concat([normal_test] + [abnormal_test] * abnormal_repeat)

# Column 0 is used as the target; the feature columns are selected by name
# from the remaining columns.
x_train = train_df.iloc[:, 1:].loc[:, feature].astype('float64')
y_train = train_df.iloc[:, 0]

x_test = test.iloc[:, 1:].loc[:, feature].astype('float64')
y_test = test.iloc[:, 0]

# Ground-truth labels of the test dataset (np.int was removed in NumPy 1.24;
# the builtin int is the documented replacement).
test_ground_truth = y_test.astype(int)

print('Train Dataset 크기', len(y_train))
print('Train Dataset 중 비정상', y_train.sum())
print('Test Dataset 크기', len(test_ground_truth))
print('Test Dataset 중 비정상', test_ground_truth.sum())

- 트리 수 100개

In [None]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, each limited to depth 4.
estimator = RandomForestClassifier(n_estimators=100, max_depth=4)

# Time the training step.
start = time.time()
estimator.fit(x_train, y_train)
end = time.time()
print("training time (sec): ", end - start)
print("training time per sample (sec): ", (end-start)/x_train.shape[0])

# Time the scoring step. Column 1 of predict_proba is the predicted
# probability of the abnormal class (label 1).
start = time.time()
pred = estimator.predict_proba(x_test)[:, 1]
end = time.time()
print("reference time (sec):", end-start)
print("reference time per sample (sec): ", (end-start)/(test.shape[0]))

# The predicted probabilities serve as the decision scores.
# thresholds: cut-offs for classifying a sample as positive, in decreasing
# order. Lowering the threshold raises both TPR and FPR; raising it lowers
# both. The ROC curve evaluates FPR/TPR across all thresholds.
fpr, tpr, thresholds = roc_curve(test_ground_truth, pred)

# ROC curve plot.
plt.plot(fpr, tpr)
plt.title('ROC_CURVE')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

print("Thresholds : 0.1%")
# Pick the last ROC point whose FPR does not exceed 0.1%. fpr is
# non-decreasing, so searchsorted(side="right") returns the first index with
# fpr > target; stepping back one gives the operating point (clamped at 0).
target_fpr = 0.001  # 0.1%
i = max(int(np.searchsorted(fpr, target_fpr, side="right")) - 1, 0)

# FPR / TPR at the chosen operating point.
print("FPR :", fpr[i])
print("TPR :", tpr[i])
print()

# Confusion-matrix counts at the chosen threshold. Plain NumPy arrays are
# used so the counts are computed vectorised rather than with builtin sum.
predicted = np.asarray(pred >= thresholds[i])
truth = np.asarray(test_ground_truth)
is_abnormal = truth == 1
is_normal = truth == 0
TP = int(np.count_nonzero(is_abnormal & predicted))
TN = int(np.count_nonzero(is_normal & ~predicted))
FP = int(np.count_nonzero(is_normal & predicted))
FN = int(np.count_nonzero(is_abnormal & ~predicted))

# Guard the ratios: when index 0 is selected, thresholds[0] lies above every
# score, nothing is predicted positive and TP + FP is 0, which would
# otherwise raise ZeroDivisionError.
Precision = TP/(TP+FP) if (TP+FP) > 0 else 0.0
Recall = TP/(TP+FN) if (TP+FN) > 0 else 0.0
print(TP, TN, FP, FN)
print("precision: ", Precision)
print("recall: ", Recall)
# The 1e-7 terms keep the denominators non-zero.
print("F-measure:", (2*Precision*Recall)/(Precision+Recall+1e-7))
print("MCC:", (TP*TN-FP*FN)/(np.sqrt(float((TP+FN)*(TP+FP)*(TN+FP)*(TN+FN)))+1e-7))
