<a href="https://colab.research.google.com/github/Seungkyu-Han/colab_ml/blob/main/light_gbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import lightgbm

In [11]:
from lightgbm import LGBMClassifier
import lightgbm as lgb

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load scikit-learn's bundled breast cancer dataset into one frame, with the
# label stored as the last column ('target').
dataset = load_breast_cancer()
cancer_df = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
).assign(target=dataset.target)

# Features are every column except the label; the label is the 'target' column.
X_features = cancer_df.drop(columns='target')
y_label = cancer_df['target']

# Hold out 20% of the rows as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_label, test_size=0.2, random_state=156
)

# Split 10% of the remaining training rows off as the validation set that
# early stopping monitors.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=156
)
evals = [(X_val, y_val)]

lgbm_wrapper = LGBMClassifier(n_estimators=400, learning_rate=0.05)

# Stop boosting once the validation score has not improved for 50 rounds.
lgbm_wrapper.fit(
    X_tr,
    y_tr,
    eval_set=evals,
    eval_metric='logloss',
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

# Class predictions for the test set, plus column 1 of the predicted
# probabilities (passed to the ROC AUC computation later in the notebook).
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]


[LightGBM] [Info] Number of positive: 251, number of negative: 158
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4092
[LightGBM] [Info] Number of data points in the train set: 409, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.613692 -> initscore=0.462858
[LightGBM] [Info] Start training from score 0.462858
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[61]	valid_0's binary_logloss: 0.260236


In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
  """Print the confusion matrix and summary metrics for a set of predictions.

  Computes the confusion matrix, accuracy, precision, recall and F1 from
  ``pred``, and ROC AUC from ``pred_proba`` when it is supplied, then prints
  them. Nothing is returned.

  Args:
    y_test: True labels.
    pred: Predicted class labels. Required; the ``None`` default is kept only
      so the signature stays backward-compatible.
    pred_proba: Scores passed to ``roc_auc_score``. When omitted (``None``),
      AUC is neither computed nor printed.

  Raises:
    ValueError: If ``pred`` is ``None``.
  """
  # Fail early with a clear message instead of an unrelated error raised from
  # inside the metric functions when the default is used by accident.
  if pred is None:
    raise ValueError('pred (predicted class labels) is required to compute metrics')

  confusion = confusion_matrix(y_test, pred)
  accuracy = accuracy_score(y_test, pred)
  precision = precision_score(y_test, pred)
  recall = recall_score(y_test, pred)
  f1 = f1_score(y_test, pred)

  summary = f'정확도: {accuracy}, 정밀도: {precision}, 재현율: {recall}, F1: {f1}'
  # AUC needs scores rather than hard labels, so it is reported only when the
  # caller provides them.
  if pred_proba is not None:
    roc_auc = roc_auc_score(y_test, pred_proba)
    summary += f', AUC: {roc_auc}'

  print('오차 행렬')
  print(confusion)

  print(summary)

In [13]:
# Report test-set metrics for the fitted model's class predictions and scores.
get_clf_eval(y_test, pred=preds, pred_proba=pred_proba)

오차 행렬
[[34  3]
 [ 2 75]]
정확도: 0.956140350877193, 정밀도: 0.9615384615384616, 재현율: 0.974025974025974, F1: 0.967741935483871, AUC: 0.9877149877149878
