In [None]:
!git clone https://github.com/Seung-hwanSong/Tree.git #코랩 사용

# [의사결정나무 및 앙상블 Part 1]
## Random Forest - Feature Importance (Attribute Selection)

### 수백 또는 수백만 개의 feature 중, 가장 중요한 feature만 포함하는 모델을 만드는 것

- 모델을 더 쉽게 해석할 수 있음
- 모델의 분산을 줄일 수 있음
- 모델 학습 과정에서의 computational cost를 줄일 수 있음 

## 1) 모듈 불러오기

In [None]:
""" 데이터 전처리 """
import pandas as pd
import numpy as np

""" 모델 생성, 학습, 평가 """
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score # 정확도 함수

In [None]:
# Load the Otto training data (Kaggle).
# The path below is where the cloned repository lives on Colab; for a local
# run, read "./data/otto_train.csv" instead.
data = pd.read_csv(filepath_or_buffer='/content/Tree/data/otto_train.csv')

# Preview the first rows.
data.head()

In [None]:
# Drop the 'id' column: it is only a row identifier, not a predictor.

data = data.drop(columns=['id'])

In [None]:
# Convert the string class labels 'Class_1' ... 'Class_9' to the integers 1-9.

mapping_dict = {'Class_{}'.format(k): k for k in range(1, 10)}
# An unknown label raises KeyError, exactly like a direct dict lookup.
after_mapping_target = data['target'].map(lambda label: mapping_dict[label])
after_mapping_target

In [None]:
# Separate the predictors from the label, then split into train/test sets.

feature_columns = data.columns.difference(['target']).tolist()
X = data[feature_columns]
y = after_mapping_target

# test_size=0.2 gives an 8:2 train/test ratio; the seed fixes the split.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=2024
)
# Check the number of rows/columns in each part.
print(*(part.shape for part in (train_x, test_x, train_y, test_y)))

## 2) Random Forest 적합

In [None]:
# Random forest of 300 gini trees, each allowed to grow up to depth 100.
forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=100,
    criterion='gini',
    random_state=1,
    n_jobs=-1,
)

# Fit on the training split.
forest.fit(train_x, train_y)

In [None]:
# Store the feature importances reported by the fitted forest.

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
per_tree = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree, axis=0)
# Feature positions ordered from most to least important.
indices = np.flip(np.argsort(importances))

In [None]:
# Print the feature ranking, most important first.
print("Feature ranking: ")

# Reorder names and scores once up front instead of re-indexing the whole
# column index on every loop iteration.
ranked_names = train_x.columns[indices]
ranked_scores = importances[indices]
for rank, (name, score) in enumerate(zip(ranked_names, ranked_scores), start=1):
    print("{}. feature {} ({:.3f})".format(rank, name, score))

In [None]:
# Plot with Matplotlib

import matplotlib.pyplot as plt

# Bar chart of the forest's feature importances in descending order
# (`indices` is the descending argsort of `importances`); the error bars are
# `std`, the per-feature standard deviation across the individual trees.
plt.figure(figsize=(20, 6))
plt.title("Feature importances")
plt.bar(range(train_x.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
# Label each bar with its feature name, rotated so the labels stay readable.
plt.xticks(range(train_x.shape[1]), train_x.columns[indices], rotation=45)
# Leave a little room before the first and after the last bar.
plt.xlim([-1, train_x.shape[1]])
plt.show()

## 3) 계산된 feature importance를 얼마만큼 신뢰할 수 있을까?

- train_x에 random 열을 만들어, 무작위로 값을 넣음
- Fitting이 되더라도 random 열의 feature importance는 낮아야 함

In [None]:
# train_x comes out of train_test_split, i.e. it was selected from X; adding a
# column to it directly can raise pandas' SettingWithCopyWarning.  Take an
# explicit copy first so the write unambiguously targets our own frame.
train_x = train_x.copy()
# Pure-noise feature in [0, 1): it carries no information about the target,
# so a trustworthy importance measure should rank it near the bottom.
# NOTE: test_x does not receive this column.
train_x['random'] = np.random.random(size=len(train_x))

In [None]:
# Train a forest with the same hyperparameters as the first one, now on the
# features plus the 'random' column (only random_state differs: 2024 here).
forest_rand = RandomForestClassifier(
    n_estimators=300,
    max_depth=100,
    criterion='gini',
    random_state=2024,
    n_jobs=-1,
)

forest_rand.fit(train_x, train_y)

In [None]:
# Feature importances reported by the forest trained with the 'random' column.
importances = forest_rand.feature_importances_
# Spread of each feature's importance across the individual trees.
per_tree = [tree.feature_importances_ for tree in forest_rand.estimators_]
std = np.std(per_tree, axis=0)
# Feature positions ordered from most to least important.
indices = np.flip(np.argsort(importances))

In [None]:
# Print the feature ranking, most important first.
print("Feature ranking: ")

# Reorder names and scores once up front instead of re-indexing the whole
# column index on every loop iteration.
ranked_names = train_x.columns[indices]
ranked_scores = importances[indices]
for rank, (name, score) in enumerate(zip(ranked_names, ranked_scores), start=1):
    print("{}. feature {} ({:.3f})".format(rank, name, score))

## 전혀 상관이 없어야 할 random 열의 feature importance가 다소 높은 것을 확인 가능

In [None]:
# Bar chart of the feature importances in descending order (`indices` is the
# descending argsort of `importances`); the error bars are `std`, the
# per-feature standard deviation across the individual trees.
plt.figure(figsize=(20, 6))
plt.title("Feature importances")
plt.bar(range(train_x.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
# Label each bar with its feature name, rotated so the labels stay readable.
plt.xticks(range(train_x.shape[1]), train_x.columns[indices], rotation=45)
# Leave a little room before the first and after the last bar.
plt.xlim([-1, train_x.shape[1]])
plt.show()

## * Problem
- Gini Impurity 기반의 feature importance는 기본적으로 연속형 변수나, 많은 범주를 가진(high-cardinality) 변수의 importance를 과대평가하는 경향이 존재
- 이러한 변수들이 노드 분기의 기준이 될 기회가 많아서 발생하는 문제

## * Permutation importance
- 이러한 문제 상황에서 사용하는 방법으로, 해당 변수의 데이터를 임의로 섞는 permutation importance가 존재

In [None]:
def permutation_importances(rf, train_x, train_y, metric, random_state=None):
    """Return the permutation importance of every column of ``train_x``.

    The importance of a column is the drop in ``metric`` observed after the
    values of that column have been randomly shuffled while every other column
    is left intact.

    Parameters
    ----------
    rf : object
        Fitted model; it is only ever passed through to ``metric``.
    train_x : pandas.DataFrame
        Feature matrix.  It is never modified: shuffling happens on a private
        copy, so the caller's data stays intact even if ``metric`` raises.
    train_y : object
        Targets; passed through to ``metric`` unchanged.
    metric : callable
        ``metric(rf, X, y) -> float`` where a higher value means a better fit.
    random_state : int or None, optional
        Seed for the shuffles.  ``None`` (the default) draws from NumPy's
        global random state, as plain ``np.random.permutation`` would.

    Returns
    -------
    numpy.ndarray
        ``baseline - shuffled_score`` for each column, in column order.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Work on a copy: the frame handed in is often a slice of a larger frame
    # (e.g. the output of train_test_split), and writing into it in place both
    # risks SettingWithCopyWarning and would leave a column scrambled in the
    # caller's data if the metric fails halfway through.
    work = train_x.copy()
    baseline = metric(rf, work, train_y)

    imp = []
    for col in work.columns:
        original = work[col].copy()
        work[col] = rng.permutation(original.to_numpy())
        shuffled_score = metric(rf, work, train_y)
        # Put the column back before moving on so that exactly one column is
        # shuffled at any time.
        work[col] = original
        imp.append(baseline - shuffled_score)
    return np.array(imp)

In [None]:
import sklearn  # not needed by this cell any more; kept so the name stays bound

# The forest internals live in the private module ``sklearn.ensemble._forest``
# in current scikit-learn releases and in ``sklearn.ensemble.forest`` in old
# ones.  Probe for the module directly instead of comparing version strings
# with ``distutils.version.LooseVersion``: ``distutils`` is no longer part of
# the standard library on recent Python versions, so importing it fails there.
#
# The module is bound to a private alias on purpose.  Binding it to the name
# ``forest`` would overwrite the fitted ``forest`` classifier created earlier
# in this notebook.
try:
    from sklearn.ensemble import _forest as _sk_forest
except ImportError:
    from sklearn.ensemble import forest as _sk_forest

_generate_unsampled_indices = _sk_forest._generate_unsampled_indices
# Older releases have no such helper; there _generate_unsampled_indices takes
# only two arguments (see _get_unsampled_indices below).
_get_n_samples_bootstrap = getattr(_sk_forest, "_get_n_samples_bootstrap", None)


def _get_unsampled_indices(tree, n_samples):
    """
    An interface to get unsampled indices regardless of sklearn version.

    Parameters
    ----------
    tree : fitted tree taken from ``forest.estimators_``
        Only its ``random_state`` is read.
    n_samples : int
        Number of rows in the data the forest was fitted on.

    Returns
    -------
    Whatever scikit-learn's ``_generate_unsampled_indices`` returns for this
    tree: the indices of the rows that were not drawn for it (out-of-bag rows).
    """
    if _get_n_samples_bootstrap is None:
        # Old API: the bootstrap size is implied by n_samples.
        return _generate_unsampled_indices(tree.random_state, n_samples)

    # Newer API: the bootstrap size is an explicit third argument.
    # NOTE(review): this assumes every tree drew a bootstrap sample of n_samples
    # rows, i.e. the forest was not fitted with a smaller max_samples - confirm.
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, n_samples)
    return _generate_unsampled_indices(tree.random_state, n_samples, n_samples_bootstrap)


In [None]:
def oob_classifier_accuracy(rf, train_x, train_y):
    """Compute the out-of-bag (OOB) accuracy of a fitted forest classifier.

    Every tree votes, through its class probabilities, only on the rows that
    ``_get_unsampled_indices`` reports as out-of-bag for it.  The votes are
    summed per row and the class with the largest total is the prediction.

    Parameters
    ----------
    rf : fitted forest classifier
        Must expose ``estimators_`` and ``classes_``.
    train_x : pandas.DataFrame
        Feature matrix.  Assumed to hold the same rows, in the same order, as
        the data ``rf`` was fitted on, because the out-of-bag rows are
        re-derived from each tree's ``random_state`` and the row count.
    train_y : pandas.Series
        True labels aligned with ``train_x``.

    Returns
    -------
    float
        Fraction of rows whose OOB prediction equals the true label.  Rows that
        were never out-of-bag for any tree keep an all-zero vote row and are
        therefore predicted as ``rf.classes_[0]``.
    """
    X = train_x.values
    y = train_y.values

    n_samples = len(X)
    classes = np.asarray(rf.classes_)
    # Size the vote matrix from the forest's own class list, not from the
    # labels present in y: if y happened to lack one of the classes seen at fit
    # time, the accumulation below would fail with a shape mismatch.
    votes = np.zeros((n_samples, len(classes)))
    for tree in rf.estimators_:
        oob_rows = _get_unsampled_indices(tree, n_samples)
        votes[oob_rows] += tree.predict_proba(X[oob_rows, :])

    # Translate the winning column of each row back to its class label.
    predicted_classes = classes[np.argmax(votes, axis=1)]
    return np.mean(y == predicted_classes)


In [None]:
# Permutation importance of the same model (forest_rand), scored by OOB accuracy.

imp = permutation_importances(
    rf=forest_rand,
    train_x=train_x,
    train_y=train_y,
    metric=oob_classifier_accuracy,
)

In [None]:
# Order the bars by the permutation importances themselves.  Reusing `indices`
# would sort them by the forest's Gini-based ranking, so this chart would not
# be in descending order of the values it actually shows.
perm_order = np.argsort(imp)[::-1]

plt.figure(figsize=(20, 6))
plt.title("Feature importances")
# No error bars here: `std` is the per-tree spread of the Gini importances and
# says nothing about the variability of the permutation scores.
plt.bar(range(train_x.shape[1]), imp[perm_order],
        color="r", align="center")
plt.xticks(range(train_x.shape[1]), train_x.columns[perm_order], rotation=45)
plt.xlim([-1, train_x.shape[1]])
plt.show()

---