# 앙상블 학습

In [1]:
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset that ships with scikit-learn; its .data and
# .target attributes are used further down as features and labels.
cancer = load_breast_cancer()

In [2]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split that follows, so the test rows contribute to the scaling
# statistics (data leakage). Consider splitting first, then fitting on the
# training rows only and calling scaler.transform() on the test rows.
cancer_scaled = scaler.fit_transform(cancer.data)

In [3]:
from sklearn.model_selection import train_test_split
# Stratified split on the target that holds out 20% of the rows as the test
# set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_scaled,
    cancer.target,
    test_size=0.2,
    stratify=cancer.target,
    random_state=2021,
)

#### 앙상블 학습을 위한 분류기 
- 로지스틱 회귀 
- 서포트 벡터 머신 
- K 최근접 이웃 


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [6]:
# Base learners for the ensemble, each created with default hyperparameters.
lrc, svc, knn = LogisticRegression(), SVC(), KNeighborsClassifier()

In [7]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble built from the three classifiers created earlier.
voc = VotingClassifier(
    estimators=[('LR', lrc),('SVC', svc), ('KNN', knn)], voting='hard'        # estimators are passed as a list of (name, estimator) tuples
)


In [9]:
# Train the hard-voting ensemble, then evaluate it on the held-out test set.
# fit() returns the estimator itself, so the two calls can be chained.
voc.fit(X_train, y_train).score(X_test, y_test)

0.9824561403508771

- (앙상블 아닌) 개별학습의 성능

In [10]:
# Fit each base classifier on its own and collect the three test-set scores
# in the order (logistic regression, SVM, k-nearest neighbours).
tuple(
    model.fit(X_train, y_train).score(X_test, y_test)
    for model in (lrc, svc, knn)
)

(0.9824561403508771, 0.9824561403508771, 0.9824561403508771)

- 소프트 보팅 

In [12]:
print(dir(lrc))     # dir() lists the object's attributes and methods

['C', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_n_features', '_estimator_type', '_get_param_names', '_get_tags', '_more_tags', '_predict_proba_lr', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_validate_data', 'class_weight', 'classes_', 'coef_', 'decision_function', 'densify', 'dual', 'fit', 'fit_intercept', 'get_params', 'intercept_', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_features_in_', 'n_iter_', 'n_jobs', 'penalty', 'predict', 'predict_log_proba', 'predict_proba', 'random_state', 'score', 'set_params', 'solver', 'sparsify', 'tol', 'verbose', 'warm_start']


In [14]:
# predict_proba returns per-class probabilities ("proba" = probability).
# Soft voting only works with estimators that provide predict_proba.
lrc.predict_proba(X_test[:5])

array([[0.36348222, 0.63651778],
       [0.97162943, 0.02837057],
       [0.1869565 , 0.8130435 ],
       [0.02133462, 0.97866538],
       [0.0548391 , 0.9451609 ]])

In [15]:
# This call fails (see the AttributeError shown after it): svc was created
# with a plain SVC(), i.e. without probability estimates enabled.
svc.predict_log_proba(X_test[:5])

AttributeError: predict_proba is not available when  probability=False

In [17]:
# Re-create the SVM with probability estimates enabled so that the
# probability methods become available, and train it on the training set.
svc2 = SVC(probability=True).fit(X_train, y_train)
svc2.predict_log_proba(X_test[:5])

array([[-7.49427455e-01, -6.39866317e-01],
       [-9.38747571e-05, -9.27359597e+00],
       [-4.38780845e+00, -1.25058085e-02],
       [-1.09707670e+01, -1.71972959e-05],
       [-5.64974371e+00, -3.52462259e-03]])

In [19]:
# Soft-voting ensemble; it uses svc2 (probability=True) instead of svc because
# soft voting needs predict_proba from every member.
voc2 = VotingClassifier(
    voting='soft',
    estimators=[('LR', lrc), ('SVC', svc2), ('KNN', knn)],
)
voc2.fit(X_train, y_train).score(X_test, y_test)

0.9912280701754386

In [20]:
# Class probabilities from the soft-voting ensemble for the first five test rows.
voc2.predict_proba(X_test[:5])

array([[0.40601754, 0.59398246],
       [0.99048284, 0.00951716],
       [0.13399512, 0.86600488],
       [0.00817981, 0.99182019],
       [0.01981703, 0.98018297]])

#### Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Only random_state is set; every other hyperparameter keeps its default
# (e.g. bootstrap=True and n_estimators=100, as the output below shows).
rfc = RandomForestClassifier(random_state=2021)
rfc.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

In [23]:
# Train the random forest and evaluate it on the held-out test set.
rfc.fit(X_train, y_train).score(X_test, y_test)

0.9736842105263158

#### XGBoost

In [24]:
import xgboost as xgb
from xgboost import XGBRFClassifier

In [25]:
# NOTE(review): XGBRFClassifier is XGBoost's random-forest-style estimator,
# not the gradient-boosted XGBClassifier; confirm which one this section
# is meant to demonstrate.
xgc = XGBRFClassifier()
xgc.fit(X_train, y_train)
xgc.score(X_test, y_test)





0.9649122807017544