# XGBoost, LightBGBM
- 모듈을 먼저 설치해야 함
- 코랩에서 실행

## 유방암 데이터


In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

In [83]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [84]:
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
features = cancer.feature_names

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape, X_test.shape

((426, 30), (143, 30))

## XGBoost
- 사이킷런 래퍼 방식 (사이킷런의 Estimator를 상속하여 만들었다)
- fit(), predict() 사용 가능
- XGBClassifier, XGBRegressor 제공
- 조기 종료 설정

In [71]:
def view_clf_performances(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("AUC=",roc_auc_score(y_test, y_pred_proba[:,1]))

In [36]:
# !pip install xgboost

In [88]:
from xgboost import XGBClassifier

evals = [(X_test, y_test)]
xgb_clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_clf.fit(X_train, y_train, early_stopping_rounds=10, 
                eval_metric="logloss", eval_set=evals, verbose=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [89]:
view_clf_performances(xgb_clf, X_test, y_test)

[[49  2]
 [ 3 89]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95        51
           1       0.98      0.97      0.97        92

    accuracy                           0.97       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.97      0.97      0.97       143

AUC= 0.9968030690537084


## 특성 중요도 출력

In [90]:
def view_feature_importances(features, importances):
    return pd.DataFrame({'Feature':features, 
     'Importances':importances}).sort_values('Importances', ascending=False)

In [91]:
view_feature_importances(features, xgb_clf.feature_importances_)[:10]

Unnamed: 0,Feature,Importances
22,worst perimeter,0.457719
27,worst concave points,0.168091
7,mean concave points,0.091333
23,worst area,0.044724
20,worst radius,0.029884
26,worst concavity,0.02417
1,mean texture,0.017988
3,mean area,0.017701
14,smoothness error,0.013589
21,worst texture,0.012449


## 랜덤 포레스트

In [92]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [93]:
view_clf_performances(rfc, X_test, y_test)

[[50  1]
 [ 2 90]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        51
           1       0.99      0.98      0.98        92

    accuracy                           0.98       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143

AUC= 0.9976555839727195


In [94]:
view_feature_importances(features, rfc.feature_importances_)[:10]

Unnamed: 0,Feature,Importances
27,worst concave points,0.136416
23,worst area,0.1249
22,worst perimeter,0.104921
20,worst radius,0.083237
7,mean concave points,0.081378
0,mean radius,0.070129
26,worst concavity,0.043851
3,mean area,0.043369
13,area error,0.041972
2,mean perimeter,0.040574


# LightGBM 
- LightGBM 설치

In [95]:
# !pip install lightgbm

In [100]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(n_estimators=200)
evals = [(X_test, y_test)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=10, 
                 eval_metric="logloss", eval_set=evals, verbose=False)

LGBMClassifier(n_estimators=200)

In [101]:
view_clf_performances(lgbm_clf, X_test, y_test)

[[50  1]
 [ 3 89]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        51
           1       0.99      0.97      0.98        92

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143

AUC= 0.9950980392156863


In [102]:
view_feature_importances(features, lgbm_clf.feature_importances_)[:10]

Unnamed: 0,Feature,Importances
21,worst texture,134
27,worst concave points,93
7,mean concave points,78
1,mean texture,74
22,worst perimeter,65
20,worst radius,61
13,area error,60
26,worst concavity,51
6,mean concavity,49
15,compactness error,44
