# 랜덤 포레스트(Random Forest)

In [1]:
import pandas as pd
# Load the whitespace-separated feature table (index, name) from features.txt.
# The separator regex is a raw string so '\s' is not parsed as an (invalid)
# string escape sequence.
# NOTE(review): the column names here ('col_index', 'col_name') differ from the
# 'column_name' key that get_new_fn_df() groups on; this frame is not passed to
# it in the visible cells.
fn_df = pd.read_csv('../00. data/UCI_HAR_Dataset/features.txt', sep=r'\s+', header=None, names=['col_index', 'col_name'])

In [2]:
def get_new_fn_df(fn_df):
    """Return a copy of the feature table with duplicated names made unique.

    The first occurrence of a name is kept as-is; the k-th repeat (k >= 1)
    is renamed to ``"<name>_<k>"``.

    Args:
        fn_df: DataFrame containing a ``'column_name'`` column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the original columns, and
        an added integer ``'dup_cnt'`` column holding, for each row, how many
        earlier rows carried the same name. The input frame is not modified.
    """
    # reset_index(drop=True) yields a new frame, so the caller's data is untouched.
    new_fn_df = fn_df.reset_index(drop=True)

    # Running count of earlier rows that share the same name (0 for the first).
    dup_cnt = new_fn_df.groupby('column_name').cumcount()
    new_fn_df['dup_cnt'] = dup_cnt

    # Suffix only the repeats. Label-based, vectorised access avoids the
    # positional Series indexing (x[0], x[1]) that newer pandas rejects.
    is_dup = dup_cnt > 0
    new_fn_df.loc[is_dup, 'column_name'] = (
        new_fn_df.loc[is_dup, 'column_name'] + '_' + dup_cnt[is_dup].astype(str)
    )
    return new_fn_df

In [3]:
def get_HAR_dataset(data_dir='../00. data/UCI_HAR_Dataset'):
    """Load the train/test splits of the UCI HAR dataset as DataFrames.

    Args:
        data_dir: Root directory of the dataset. The default matches the
            location used throughout this notebook.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, feature_name)`` where the
        X frames use the de-duplicated feature names as columns, the y frames
        have a single ``'action'`` column, and ``feature_name`` is the list of
        column names.
    """
    # All files are whitespace-delimited; a raw string keeps '\s' from being
    # parsed as an invalid string escape.
    sep = r'\s+'

    fn_df = pd.read_csv(f'{data_dir}/features.txt', sep=sep, header=None, names=['column_index', 'column_name'])
    # features.txt contains repeated names; make them unique before using
    # them as DataFrame column labels.
    new_fn_df = get_new_fn_df(fn_df)
    feature_name = new_fn_df['column_name'].tolist()

    X_train = pd.read_csv(f'{data_dir}/train/X_train.txt', sep=sep, names=feature_name)
    X_test = pd.read_csv(f'{data_dir}/test/X_test.txt', sep=sep, names=feature_name)
    y_train = pd.read_csv(f'{data_dir}/train/y_train.txt', sep=sep, names=['action'])
    y_test = pd.read_csv(f'{data_dir}/test/y_test.txt', sep=sep, names=['action'])

    return X_train, X_test, y_train, y_test, feature_name

In [4]:
# Load the HAR train/test splits and the feature-name list into notebook-level variables.
X_train, X_test, y_train, y_test, feature_name = get_HAR_dataset()

### 랜덤 포레스트 모델 생성/학습/예측/평가

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: random forest with default hyper-parameters.
rf_clf = RandomForestClassifier()
# y_train is a single-column DataFrame; flatten it to 1-D so fit() does not
# emit a DataConversionWarning about a column-vector y.
rf_clf.fit(X_train, y_train.values.ravel())
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {acc:.4f}')

랜덤 포레스트 정확도: 0.9230


### 최적 파라미터 탐색

In [6]:
# Show the hyper-parameter settings of the current random forest estimator.
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the search: 2 x 3 x 3 = 18 candidate combinations.
params = dict(
    n_estimators=[10, 30],
    max_depth=[8, 12, 16],
    min_samples_split=[12, 16, 20],
)

In [9]:
# Exhaustive search over `params` with 5-fold cross-validation, scored by accuracy.
rf_clf = RandomForestClassifier(n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
# Pass the labels as a 1-D array; a single-column DataFrame would trigger a
# DataConversionWarning on every cross-validation fit.
grid_cv.fit(X_train, y_train.values.ravel())
print('최적 파라미터:', grid_cv.best_params_)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')

최적 파라미터: {'max_depth': 8, 'min_samples_split': 16, 'n_estimators': 30}
최고 평균 정확도: 0.9198


### K 최근접 이웃(K-Nearest Neighbor)

In [10]:
from sklearn.neighbors import KNeighborsClassifier
# Create a k-nearest-neighbours classifier with default settings and show its parameters.
knn = KNeighborsClassifier()
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [11]:
# Fit on the training split; labels are flattened to 1-D because a
# single-column DataFrame makes fit() warn about a column-vector y.
knn.fit(X_train, y_train.values.ravel())
pred = knn.predict(X_test)
# Accuracy on the held-out test split (displayed as the cell result).
accuracy_score(y_test, pred)

0.9015948422124194