# 랜덤 포레스트

In [2]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [3]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
cancer_scaled = scaler.fit_transform(cancer.data)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer_scaled, cancer.target, stratify=cancer.target, test_size=0.2, random_state=2011
)

### 랜덤 포레스트 모델 생성/학습/예측/평가

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score  # 평가를 하기위해 사용

In [6]:
knn = KNeighborsClassifier()
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [7]:
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
accuracy_score(y_test, pred)

0.956140350877193

## load_digits 사용하기

- 랜덤포레스트

In [8]:
from sklearn.datasets import load_digits
digits = load_digits()

In [9]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
digits_scaled = scaler.fit_transform(digits.data)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    digits_scaled, digits.target, stratify=digits.target, test_size=0.2, random_state=2011
)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 

In [17]:
rf_clf = RandomForestClassifier(random_state=2021)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
accuracy_score(y_test, pred)

0.9861111111111112

- 앙상블 로지스틱

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [19]:
lr = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()

In [20]:
from sklearn.ensemble import VotingClassifier

vo_clf = VotingClassifier(
    estimators=[('LR', lr), ('SVC',svc),('KNN',knn)],
    voting='hard'
)

In [21]:
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9833333333333333

- 앙상블 서포트 벡터 머신

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_digits
digits = load_digits()

In [3]:
df = pd.DataFrame(digits.data, columns=digits.feature_names)
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [4]:
# StandardScaler로 정규화
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
digits_std = scaler.fit_transform(digits.data)
df = pd.DataFrame(digits_std, columns=digits.feature_names)
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,-0.335016,-0.043081,0.274072,-0.664478,-0.844129,-0.409724,-0.125023,-0.059078,-0.624009,...,-0.757436,-0.209785,-0.023596,-0.299081,0.086719,0.208293,-0.366771,-1.146647,-0.50567,-0.196008
1,0.0,-0.335016,-1.094937,0.038648,0.268751,-0.13802,-0.409724,-0.125023,-0.059078,-0.624009,...,-0.757436,-0.209785,-0.023596,-0.299081,-1.089383,-0.24901,0.849632,0.548561,-0.50567,-0.196008
2,0.0,-0.335016,-1.094937,-1.844742,0.735366,1.097673,-0.409724,-0.125023,-0.059078,-0.624009,...,0.25923,-0.209785,-0.023596,-0.299081,-1.089383,-2.078218,-0.164037,1.565686,1.695137,-0.196008
3,0.0,-0.335016,0.377661,0.744919,0.268751,-0.844129,-0.409724,-0.125023,-0.059078,1.879691,...,1.072563,-0.209785,-0.023596,-0.299081,0.282736,0.208293,0.24143,0.37904,-0.50567,-0.196008
4,0.0,-0.335016,-1.094937,-2.551014,-0.197863,-1.020657,-0.409724,-0.125023,-0.059078,-0.624009,...,-0.757436,-0.209785,-0.023596,-0.299081,-1.089383,-2.306869,0.849632,-0.468564,-0.50567,-0.196008


In [5]:
# train/test set 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    digits_std, digits.target, stratify=digits.target, test_size=0.2, random_state=2021
)

In [8]:
# 모델 생성 및 학습
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [9]:
# 예측 및 평가
from sklearn.metrics import accuracy_score
pred = svc.predict(X_test)
accuracy_score(y_test, pred)

0.9805555555555555

In [10]:
# 하이퍼 파라메터 조정
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [13]:
params = {'C':[0.01, 0.1, 0.5, 1, 5, 10, 100]}

In [14]:
from sklearn.model_selection import GridSearchCV

svc = SVC(random_state=2021)
grid_clf = GridSearchCV(svc, param_grid=params, cv=5, scoring='accuracy')

In [15]:
grid_clf.fit(X_train, y_train)
grid_clf.best_score_

0.9798175571041424

In [16]:
# 최적 파라메터로 학습한 모델로 평가
best_clf = grid_clf.best_estimator_
pred = best_clf.predict(X_test)
accuracy_score(y_test, pred)

0.9805555555555555