In [12]:
# load package
from sklearn import datasets 
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import  precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_california_housing
import numpy as np
import matplotlib.pyplot as plt

In [13]:
# 导入乳腺癌数据
cancer = datasets.load_breast_cancer()  
Y = pd.DataFrame(data=cancer['target'], columns=['target'])
X = pd.DataFrame(data=cancer['data'], columns=cancer['feature_names'])
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=10)

In [14]:
# build lightgbm data
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

In [15]:
## 设定参数
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc', 'cross_entropy'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [16]:
# model train
gbm1 = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=[lgb_train, lgb_eval],
                early_stopping_rounds=10)

[1]	training's auc: 0.980526	training's xentropy: 0.626414	training's binary_logloss: 0.626414	valid_1's auc: 0.974349	valid_1's xentropy: 0.608637	valid_1's binary_logloss: 0.608637
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.980526	training's xentropy: 0.590854	training's binary_logloss: 0.590854	valid_1's auc: 0.974349	valid_1's xentropy: 0.573687	valid_1's binary_logloss: 0.573687
[3]	training's auc: 0.98234	training's xentropy: 0.560036	training's binary_logloss: 0.560036	valid_1's auc: 0.983354	valid_1's xentropy: 0.545363	valid_1's binary_logloss: 0.545363
[4]	training's auc: 0.989436	training's xentropy: 0.530959	training's binary_logloss: 0.530959	valid_1's auc: 0.985775	valid_1's xentropy: 0.517253	valid_1's binary_logloss: 0.517253
[5]	training's auc: 0.988822	training's xentropy: 0.504266	training's binary_logloss: 0.504266	valid_1's auc: 0.985623	valid_1's xentropy: 0.490874	valid_1's binary_logloss: 0.490874
[6]	training's auc: 0.99

In [17]:
y_pred = gbm1.predict(x_test)
y_pred_label = y_pred > 0.5
print(f'ROC AUC: {roc_auc_score(y_test, y_pred):.5f}')
print(f'Log loss: {log_loss(y_test, y_pred):.5f}')
print('分类报告：\n',classification_report(y_test, y_pred_label))

ROC AUC: 0.99092
Log loss: 0.44551
分类报告：
               precision    recall  f1-score   support

           0       0.96      0.78      0.86        59
           1       0.89      0.98      0.94       112

    accuracy                           0.91       171
   macro avg       0.93      0.88      0.90       171
weighted avg       0.92      0.91      0.91       171



In [18]:
## sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

## 自定义损失函数需要提供损失函数的一阶和二阶导数形式
def loglikelood(preds, train_data):
    labels = train_data.get_label()
    preds = sigmoid(preds)
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess

## 自定义评估函数
def binary_error(preds, train_data):
    labels = train_data.get_label()
    preds = sigmoid(preds)
    return 'error', -np.average(labels* np.log(preds) + (1-labels) * np.log(1-preds)), False
    #return 'error', log_loss(labels, preds), False
    

gbm2 = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                fobj=loglikelood,
                feval=binary_error,
                valid_sets=[lgb_train, lgb_eval],
                early_stopping_rounds=10)

[1]	training's auc: 0.980526	training's xentropy: 2.16342	training's binary_logloss: 2.35433	training's error: 0.652955	valid_1's auc: 0.974349	valid_1's xentropy: 2.41692	valid_1's binary_logloss: 2.6593	valid_1's error: 0.652108
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.980526	training's xentropy: 1.74556	training's binary_logloss: 1.93648	training's error: 0.616618	valid_1's auc: 0.974349	valid_1's xentropy: 1.97925	valid_1's binary_logloss: 2.22163	valid_1's error: 0.615016
[3]	training's auc: 0.981446	training's xentropy: 1.21091	training's binary_logloss: 1.31504	training's error: 0.58478	valid_1's auc: 0.980176	valid_1's xentropy: 1.74756	valid_1's binary_logloss: 1.98994	valid_1's error: 0.584646
[4]	training's auc: 0.980752	training's xentropy: 1.32989	training's binary_logloss: 1.52081	training's error: 0.554487	valid_1's auc: 0.980478	valid_1's xentropy: 1.55695	valid_1's binary_logloss: 1.79933	valid_1's error: 0.553659
[5]	training

In [19]:
y_pred = sigmoid(gbm2.predict(x_test))
y_pred_label = y_pred > 0.5
print(f'ROC AUC: {roc_auc_score(y_test, y_pred):.5f}')
print(f'Log loss: {log_loss(y_test, y_pred):.5f}')
print('分类报告：\n',classification_report(y_test, y_pred_label))

ROC AUC: 0.99168
Log loss: 0.41345
分类报告：
               precision    recall  f1-score   support

           0       0.92      0.95      0.93        59
           1       0.97      0.96      0.96       112

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171



In [20]:
def log_loss_init_score(y):
    pavg = y.mean()
    pavg = np.clip(pavg, 1e-15, 1 - 1e-15)
    g = np.log(pavg / (1. - pavg))
    return g

# init score
init_score_train = np.full_like(y_train, log_loss_init_score(y_train), dtype=float).flatten()
init_score_test = np.full_like(y_test, log_loss_init_score(y_train), dtype=float).flatten()

# build lightgbm data
lgb_train = lgb.Dataset(x_train, y_train, init_score=init_score_train)
lgb_eval = lgb.Dataset(x_test, y_test, init_score=init_score_test, reference=lgb_train)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc', 'cross_entropy'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'boost_from_average': True
}

gbm3 = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                fobj=loglikelood,
                feval=binary_error,
                valid_sets=[lgb_train, lgb_eval],
                early_stopping_rounds=10)

[1]	training's auc: 0.980526	training's xentropy: 0.560186	training's binary_logloss: 0.560186	training's error: 0.626414	valid_1's auc: 0.974349	valid_1's xentropy: 0.564444	valid_1's binary_logloss: 0.564444	valid_1's error: 0.608637
Training until validation scores don't improve for 10 rounds.
[2]	training's auc: 0.980526	training's xentropy: 0.442357	training's binary_logloss: 0.442357	training's error: 0.590854	valid_1's auc: 0.974349	valid_1's xentropy: 0.447013	valid_1's binary_logloss: 0.447013	valid_1's error: 0.573687
[3]	training's auc: 0.98234	training's xentropy: 0.347519	training's binary_logloss: 0.347519	training's error: 0.560036	valid_1's auc: 0.983354	valid_1's xentropy: 0.363806	valid_1's binary_logloss: 0.363806	valid_1's error: 0.545363
[4]	training's auc: 0.989436	training's xentropy: 0.267194	training's binary_logloss: 0.267194	training's error: 0.530959	valid_1's auc: 0.985775	valid_1's xentropy: 0.303288	valid_1's binary_logloss: 0.303288	valid_1's error: 0.51

In [21]:
y_pred = sigmoid(gbm3.predict(x_test))
y_pred_label = y_pred > 0.5
print(f'ROC AUC: {roc_auc_score(y_test, y_pred):.5f}')
print(f'Log loss: {log_loss(y_test, y_pred):.5f}')
print('分类报告：\n',classification_report(y_test, y_pred_label))

ROC AUC: 0.99092
Log loss: 0.48444
分类报告：
               precision    recall  f1-score   support

           0       0.90      0.97      0.93        59
           1       0.98      0.95      0.96       112

    accuracy                           0.95       171
   macro avg       0.94      0.96      0.95       171
weighted avg       0.96      0.95      0.95       171

