# 랜덤 포레스트(Random Forest)

In [2]:
import pandas as pd
# Load the feature index/name table; the file is read as whitespace-delimited with no header row.
# The separator is a raw string so that `\s` reaches the regex engine as written instead of
# being parsed as an (invalid) Python string escape.
feature_name_df = pd.read_csv('../00.data/UCI_HAR_Dataset/features.txt', sep=r'\s+',
                              header=None, names=['col_index', 'col_name'])

In [3]:
def get_new_feature_name_df(old_df):
    """Return a copy of ``old_df`` whose ``col_name`` values are made unique.

    The first occurrence of a name is kept unchanged; the k-th repeat (k >= 1)
    is renamed to ``<name>_<k>``.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Frame with a ``col_name`` column. Any other columns are carried over.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index, the original columns, and an extra
        ``dup_cnt`` column holding each row's repeat number (0 for the first
        occurrence). ``old_df`` itself is not modified.
    """
    # Count repeats on the frame that was passed in, not on a notebook-level global.
    dup_cnt = old_df.groupby('col_name').cumcount()

    # reset_index returns a new object, so the assignments below leave old_df untouched.
    new_df = old_df.reset_index(drop=True)
    # Assign by position: the index was just reset, so label alignment must not be used.
    new_df['dup_cnt'] = dup_cnt.to_numpy()

    # Vectorised rename: keep the name where dup_cnt == 0, otherwise append '_<dup_cnt>'.
    suffixed = new_df['col_name'] + '_' + new_df['dup_cnt'].astype(str)
    new_df['col_name'] = new_df['col_name'].where(new_df['dup_cnt'] == 0, suffixed)
    return new_df

In [4]:
# De-duplicate the raw feature names, then keep them as a plain Python list
# so they can be passed as column labels when reading the feature matrices.
new_feature_df = get_new_feature_name_df(feature_name_df)
feature_list = new_feature_df['col_name'].tolist()

In [6]:
# Feature matrices: read as whitespace-delimited text with no header row; columns are
# labelled with the de-duplicated feature names. The separator is a raw string so `\s`
# is not parsed as an (invalid) Python string escape.
X_train_df = pd.read_csv('../00.data/UCI_HAR_Dataset/train/X_train.txt', sep=r'\s+',
                         header=None, names=feature_list)
X_train = X_train_df.values

X_test_df = pd.read_csv('../00.data/UCI_HAR_Dataset/test/X_test.txt', sep=r'\s+',
                        header=None, names=feature_list)
X_test = X_test_df.values

# Labels: each file is read into a single 'action' column. Selecting that column before
# taking .values yields a 1-D array of shape (n,) rather than an (n, 1) column vector,
# which is the target shape scikit-learn estimators expect (a column vector triggers a
# conversion warning on fit).
y_train_df = pd.read_csv('../00.data/UCI_HAR_Dataset/train/y_train.txt', sep=r'\s+',
                         header=None, names=['action'])
y_train = y_train_df['action'].values

y_test_df = pd.read_csv('../00.data/UCI_HAR_Dataset/test/y_test.txt', sep=r'\s+',
                        header=None, names=['action'])
y_test = y_test_df['action'].values

### 랜덤 포레스트 모델 생성/학습/예측/평가

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Baseline: a random forest with default hyper-parameters, scored on the test split.
# random_state is pinned so that re-running the cell rebuilds the same forest and
# therefore prints the same accuracy.
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 모델의 정확도: {acc:.4f}')

랜덤 포레스트 모델의 정확도: 0.9284


### 최적 파라미터 찾기

In [9]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: three candidate values per parameter (3 x 3 x 3 = 27 combinations).
params = dict(
    n_estimators=[10, 30, 50],
    max_depth=[8, 12, 16],
    min_samples_split=[12, 16, 20],
)

In [10]:
# Exhaustive search over `params` with 3-fold cross-validation.
# The forest is seeded so every candidate is evaluated with the same randomness and the
# reported best score / best parameters are reproducible across runs.
rf_clf = RandomForestClassifier(n_jobs=1, random_state=0)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=1)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)

최고 평균 정확도: 0.9147
최적 파라미터:  {'max_depth': 8, 'min_samples_split': 16, 'n_estimators': 50}


In [18]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Randomized search: 10 candidates drawn from integer distributions, 5-fold cross-validation.
# scipy's randint excludes `high`, so each range below ends at 49.
param_distribs = {
    'n_estimators': randint(low=1, high=50),
    'max_depth': randint(low=1, high=50),
    # An integer min_samples_split must be at least 2; drawing 1 makes that candidate
    # fail to fit, so the lower bound starts at 2.
    'min_samples_split': randint(low=2, high=50),
}
# Seed both the estimator and the sampler so the drawn candidates and the resulting
# best score / best parameters are reproducible.
rf_clf = RandomForestClassifier(n_jobs=1, random_state=0)
# NOTE: the name `grid_cv` is kept (although this is a randomized search) because the
# evaluation cell that follows reads `grid_cv.best_estimator_`.
grid_cv = RandomizedSearchCV(rf_clf, param_distributions=param_distribs, n_iter=10,
                             cv=5, n_jobs=1, random_state=0)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)


최고 평균 정확도: 0.9192
최적 파라미터:  {'max_depth': 27, 'min_samples_split': 19, 'n_estimators': 31}


In [19]:
# Score the search's best estimator on the test split.
best = grid_cv.best_estimator_
pred = best.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 랜덤 포레스트 정확도: 0.9209


# K 최근접 이웃(K-Nearest Neighbor)

In [16]:
from sklearn.neighbors import KNeighborsClassifier
# Build a k-nearest-neighbours classifier with the library defaults and
# display the hyper-parameters it was created with.
knn = KNeighborsClassifier()
knn.get_params(deep=True)

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [17]:
# Fit k-NN on the training split, predict the test split, and show the accuracy
# as the cell's displayed value.
pred = knn.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9015948422124194