# 랜덤 포레스트

In [1]:
import pandas as pd
# Load the feature list: a whitespace-separated, header-less file with two
# fields per row, read as (column_index, column_name).
# The separator is a raw string so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
feature_name_df = pd.read_csv('../00.data/UCI HAR Dataset/UCI HAR Dataset/features.txt', sep=r'\s+', header=None,
                              names=['column_index','column_name'])

In [2]:
# Make the feature names unique so they can be used as DataFrame column labels.
# Every name that occurs more than once gets an occurrence suffix (_1, _2, ...)
# in row order; names that occur only once are left untouched.
unique_name = feature_name_df.column_name.unique()  # distinct names before renaming

is_repeated = feature_name_df['column_name'].duplicated(keep=False)
# 1-based position of each row within its group of identical names.
occurrence = feature_name_df.groupby('column_name').cumcount() + 1
# One .loc assignment writes to the frame itself (no chained indexing) and
# handles any number of repeats, not only names that appear exactly three times.
feature_name_df.loc[is_repeated, 'column_name'] = (
    feature_name_df.loc[is_repeated, 'column_name'] + '_' + occurrence[is_repeated].astype(str)
)

# Plain Python list of the (now unique) names, in file order.
feature_name = feature_name_df['column_name'].tolist()

In [3]:
# Load the train/test feature matrices (columns labelled with feature_name)
# and the label files (single column named 'action').
# Raw-string separators keep `\s+` a regex rather than an invalid string escape.
X_train = pd.read_csv('../00.data/UCI HAR Dataset/UCI HAR Dataset/train/X_train.txt', sep=r'\s+', names=feature_name)
X_test = pd.read_csv('../00.data/UCI HAR Dataset/UCI HAR Dataset/test/X_test.txt', sep=r'\s+', names=feature_name)
y_train = pd.read_csv('../00.data/UCI HAR Dataset/UCI HAR Dataset/train/y_train.txt', sep=r'\s+', names=['action'])
y_test = pd.read_csv('../00.data/UCI HAR Dataset/UCI HAR Dataset/test/y_test.txt', sep=r'\s+', names=['action'])

### 랜덤 포레스트 모델 생성/학습/예측/평가

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Baseline random forest with library-default hyperparameters.
# NOTE(review): no random_state is set, so the reported accuracy can differ
# slightly from run to run.
rf_clf = RandomForestClassifier()
# y_train is a one-column DataFrame; flatten it to 1-D so the estimator gets
# the target shape it expects instead of a column vector.
rf_clf.fit(X_train, y_train.values.ravel())
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도 : {acc:.4f}')

랜덤 포레스트 정확도 : 0.9209


In [5]:
# Display the hyperparameters of the baseline forest (all defaults here).
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### 최적의 파라미터

In [6]:
from sklearn.model_selection import GridSearchCV

In [7]:
# Search space for the random forest: three candidate values for each of three
# hyperparameters, i.e. 3 x 3 x 3 = 27 combinations.
params = dict(
    max_depth=[8, 12, 16],
    min_samples_split=[12, 16, 20],
    n_estimators=[10, 30, 50],
)

In [8]:
# Exhaustive 5-fold cross-validated search over `params`, scored by accuracy.
grid_cv = GridSearchCV(rf_clf, param_grid=params, scoring='accuracy', cv=5, verbose=1)
# Pass the labels as a 1-D array; handing over the one-column DataFrame makes
# every one of the underlying fits receive a column-vector target.
grid_cv.fit(X_train, y_train.values.ravel())
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:  4.1min finished
최고 평균 정확도: 0.9196
최적 파라미터: {'max_depth': 16, 'min_samples_split': 12, 'n_estimators': 50}


In [9]:
# Tabulate the search results, keeping only the three searched hyperparameters
# and the mean cross-validated score of each candidate.
df = pd.DataFrame(grid_cv.cv_results_).loc[
    :, ['param_n_estimators', 'param_max_depth', 'param_min_samples_split', 'mean_test_score']
]
df

Unnamed: 0,param_n_estimators,param_max_depth,param_min_samples_split,mean_test_score
0,10,8,12,0.905608
1,30,8,12,0.912412
2,50,8,12,0.90996
3,10,8,16,0.901801
4,30,8,16,0.91173
5,50,8,16,0.912135
6,10,8,20,0.897447
7,30,8,20,0.914176
8,50,8,20,0.91336
9,10,12,12,0.907513


### 튜닝된 파라미터로 재평가

In [10]:
# Score the best estimator found by the grid search on the held-out test set.
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 랜덤 포레스트 정확도: 0.9291


In [11]:
### 재탐색

In [12]:
# Second search: vary only the number of trees and leave every other
# hyperparameter at the value already set on rf_clf.
params = {
    'n_estimators': [10, 30, 50]
}
grid_cv = GridSearchCV(rf_clf, param_grid=params, scoring='accuracy', cv=5, verbose=1)
# Flatten the one-column label DataFrame to 1-D so each fit receives a proper
# target vector rather than a column vector.
grid_cv.fit(X_train, y_train.values.ravel())
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   29.8s finished
최고 평균 정확도: 0.9189
최적 파라미터: {'n_estimators': 50}


In [13]:
# Evaluate the winner of the n_estimators-only search on the test set.
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 랜덤 포레스트 정확도: 0.9253


### K 최근접 이웃(K-Nearest Neighbor)

In [14]:
from sklearn.neighbors import KNeighborsClassifier
# Create a k-nearest-neighbours classifier with library defaults and display
# its hyperparameters.
knn = KNeighborsClassifier()
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [15]:
# Fit the default KNN model and report its test-set accuracy.
# The labels are flattened to 1-D: y_train is a one-column DataFrame, and
# passing it as-is hands the estimator a column-vector target.
knn.fit(X_train, y_train.values.ravel())
pred = knn.predict(X_test)
accuracy_score(y_test, pred)

0.9015948422124194

In [16]:
# 5-fold grid search over KNN hyperparameters, scored by accuracy.
# NOTE(review): per the scikit-learn docs, leaf_size tunes the speed/memory of
# the tree-based neighbour search rather than the predictions themselves --
# confirm whether it is worth keeping in an accuracy-driven grid.
params = {
    'leaf_size': [20, 30, 40],
    'n_neighbors': [5, 10, 15]
}
grid_cv = GridSearchCV(knn, param_grid=params, scoring='accuracy', cv=5)
# Flatten the one-column label DataFrame so each fit gets a 1-D target.
grid_cv.fit(X_train, y_train.values.ravel())
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9018
최적 파라미터: {'leaf_size': 20, 'n_neighbors': 15}
