# 2. 특징 선택 (RFECV)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [None]:
# Load the dataset and discard the first CSV column
# (presumably a saved index column -- TODO confirm against the CSV).
df = pd.read_csv("final_data.csv")
df = df.iloc[:, 1:]

# Split the rows into the two classes by the 'label' column.
is_abnormal = df['label'] == 1
is_normal = df['label'] == 0
abnormal = df[is_abnormal]
normal = df[is_normal]

# Report the size of each class (normal first, then abnormal).
for subset in (normal, abnormal):
    print(subset.shape)

In [None]:
from tqdm import tqdm

# Estimate feature importances by repeatedly fitting a random forest on a
# fresh 10% sample of the normal rows plus an oversampled abnormal set, and
# accumulating the importances across rounds.
n_rounds = 200          # number of resample-and-fit rounds
n_abnormal_copies = 20  # how many times the abnormal rows are replicated

# The replicated abnormal rows do not change between rounds, so build them
# once. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
abnormal_oversampled = pd.concat([abnormal] * n_abnormal_copies)

a = None  # running sum of feature importances over all rounds
for _ in tqdm(range(n_rounds)):
    normal_sample = normal.sample(frac=0.1)
    where = normal_sample.shape[0]  # number of sampled normal rows
    final_data = pd.concat([normal_sample, abnormal_oversampled])

    estimator = RandomForestClassifier(50, max_depth=4)
    # Column 0 is used as the target; all remaining columns are features.
    x = scale(final_data.iloc[:, 1:].astype('float64'))
    y = final_data.iloc[:, 0]

    estimator.fit(x, y)
    importances = estimator.feature_importances_
    a = importances.copy() if a is None else a + importances

# Print the features in descending order of mean importance.
# (Named sorted_idx so the builtin `sorted` is not shadowed.)
sorted_idx = np.argsort(a)[::-1]
feature_col = final_data.columns.drop("label")
mean_importance = a / n_rounds

for idx in sorted_idx:
    print(feature_col[idx], ":", mean_importance[idx])

- 최적의 특징 개수 구하기

In [None]:
# RFECV: recursive feature elimination with cross-validation, used to find
# the best number of features.
np.random.seed(777)  # seeds NumPy's global RNG before sampling and fitting

# 10% sample of the normal rows, followed by 25 copies of the abnormal rows.
normal_sample = normal.sample(frac=0.1)
where = normal_sample.shape[0]  # number of sampled normal rows
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
final_data = pd.concat([normal_sample] + [abnormal] * 25)

estimator = RandomForestClassifier(50, max_depth=4)
selector = RFECV(estimator, min_features_to_select=2, cv=2)

# Column 0 is used as the target; all remaining columns are features.
x = scale(final_data.iloc[:, 1:].astype('float64'))
y = final_data.iloc[:, 0]

selector.fit(x, y)

In [None]:
# Report the optimal number of features and plot the CV score per subset size.

print("optimal number of features:", selector.n_features_)

# Mean cross-validation score for each candidate feature-subset size.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `cv_results_["mean_test_score"]` is its replacement. Fall back to
# `grid_scores_` only when running on an older scikit-learn.
cv_results = getattr(selector, "cv_results_", None)
if cv_results is not None:
    mean_grid_scores = np.asarray(cv_results["mean_test_score"])
else:
    legacy_scores = np.asarray(selector.grid_scores_)
    # Older releases may expose already-averaged 1-D scores.
    mean_grid_scores = (legacy_scores if legacy_scores.ndim == 1
                        else legacy_scores.mean(axis=1))

# The first score corresponds to the smallest subset size tried.
# NOTE: this x-axis assumes RFECV's default step of 1 feature per iteration.
first_n_features = selector.min_features_to_select
n_features_axis = range(first_n_features,
                        first_n_features + len(mean_grid_scores))

plt.figure(figsize=[12, 8])
plt.plot(n_features_axis, mean_grid_scores,
         lw=4, marker='o', ms=18, color="red", mfc='red')

# Restrict the y-axis to a tight band around the observed scores
# (the score is the model's cross-validated performance).
plt.ylim([min(mean_grid_scores) * 0.9999, max(mean_grid_scores) * 1.0001])
plt.ylabel("Cross Validation Score",
           size=40)
plt.xlabel("Number of Features",
           size=40)
plt.yticks(size=25)
plt.xticks([5, 10, 15, 20],
           size=25)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.grid()
plt.tight_layout()
plt.savefig("RFECV_Num_of_Features_Selection.png")