In [60]:
# load package
from sklearn import datasets 
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import  precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_california_housing
import numpy as np
import random
import matplotlib.pyplot as plt

In [61]:
# Load the breast-cancer dataset bundled with scikit-learn.
cancer = datasets.load_breast_cancer()

X = pd.DataFrame(data=cancer['data'], columns=cancer['feature_names'])

# Build a synthetic non-negative integer label: the original target
# multiplied element-wise by a random integer drawn from [0, 5].
# A dedicated, seeded generator is used (instead of the unseeded global
# `random` state) so the labels -- and everything trained on them -- are
# identical on every "Restart Kernel -> Run All".
label_rng = random.Random(10)
random_label = [label_rng.randint(0, 5) for _ in range(len(X))]
Y = pd.DataFrame(data=cancer['target'] * random_label, columns=['target'])

# Hold out 30% of the rows; random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=10)

In [62]:
# Summary statistics of the constructed label column.
Y.describe()

Unnamed: 0,target
count,569.0
mean,1.565905
std,1.793856
min,0.0
25%,0.0
50%,1.0
75%,3.0
max,5.0


In [63]:
# Wrap the train / held-out splits in LightGBM Dataset objects.
# The held-out set is created with reference=lgb_train so that it is tied
# to the training Dataset.
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_eval = lgb.Dataset(data=x_test, label=y_test, reference=lgb_train)

In [64]:
## Training parameters
# NOTE(review): the training call further down passes a custom objective via
# `fobj`; confirm whether the 'binary' objective declared here is still used
# by the installed LightGBM version or is overridden.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [65]:
## sigmoid
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Parameters
    ----------
    x : scalar or array-like
        Raw score(s).

    Returns
    -------
    numpy scalar or ndarray
        ``1 / (1 + exp(-x))`` evaluated element-wise, same shape as ``x``.

    Notes
    -----
    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``.  Here only ``exp(-|x|)`` is
    computed, which always lies in [0, 1], and the result is assembled
    per sign of ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # never overflows
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple turns a 0-d array back into a scalar and
    # leaves n-d arrays unchanged, so scalar inputs still yield scalars.
    return result[()]

## A custom objective must supply the first and second derivatives of the loss
def loglikelood(preds, train_data):
    """Custom objective: gradient and Hessian with respect to the raw score.

    With ``p = sigmoid(preds)`` and ``y = train_data.get_label()``, the
    returned arrays are the first and second derivatives (w.r.t. the raw
    score) of ``-(y * log(p) + log(1 - p))``.

    Parameters
    ----------
    preds : array-like
        Raw (pre-sigmoid) scores.
    train_data : object exposing ``get_label()``
        Dataset whose labels are used.

    Returns
    -------
    tuple
        ``(grad, hess)``.
    """
    y = train_data.get_label()
    prob = sigmoid(preds)
    complement = 1 - prob
    gradient = prob - y * complement
    hessian = (y + 1) * prob * complement
    return gradient, hessian

## Custom evaluation metric
def binary_error(preds, train_data):
    """Custom evaluation metric: mean of ``-(y * log(p) + log(1 - p))``.

    Here ``p`` is the logistic sigmoid of the raw score ``preds`` and ``y``
    comes from ``train_data.get_label()``.

    The loss is evaluated directly from the raw scores with the identities
    ``-log(p) = logaddexp(0, -z)`` and ``-log(1 - p) = logaddexp(0, z)``.
    Taking ``log`` of a saturated sigmoid output (exactly 0 or 1) would give
    ``-inf`` and, multiplied by a zero label, ``nan``; this form stays finite
    for arbitrarily large scores.

    Parameters
    ----------
    preds : array-like
        Raw (pre-sigmoid) scores.
    train_data : object exposing ``get_label()``
        Dataset whose labels are used.

    Returns
    -------
    tuple
        ``('error', value, False)`` -- metric name, metric value, and the
        "is higher better" flag (False: lower is better).
    """
    labels = train_data.get_label()
    scores = np.asarray(preds)
    neg_log_p = np.logaddexp(0, -scores)   # -log(sigmoid(z))
    neg_log_not_p = np.logaddexp(0, scores)  # -log(1 - sigmoid(z))
    return 'error', np.average(labels * neg_log_p + neg_log_not_p), False
    

# Train for at most 20 boosting rounds with the custom objective and metric,
# evaluating on both the training and the held-out Dataset.
# NOTE(review): the `fobj` and `early_stopping_rounds` keyword arguments are
# presumably tied to LightGBM 3.x (they were removed from `lgb.train` in 4.0)
# -- confirm the installed version before re-running.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    fobj=loglikelood,
    feval=binary_error,
    num_boost_round=20,
    early_stopping_rounds=10,
)

[1]	training's error: 1.70717	valid_1's error: 1.8379
Training until validation scores don't improve for 10 rounds.
[2]	training's error: 1.67753	valid_1's error: 1.81108
[3]	training's error: 1.64958	valid_1's error: 1.78639
[4]	training's error: 1.62523	valid_1's error: 1.76496
[5]	training's error: 1.60086	valid_1's error: 1.74503
[6]	training's error: 1.57836	valid_1's error: 1.72789
[7]	training's error: 1.55589	valid_1's error: 1.70835
[8]	training's error: 1.53732	valid_1's error: 1.69554
[9]	training's error: 1.52037	valid_1's error: 1.68407
[10]	training's error: 1.50474	valid_1's error: 1.6728
[11]	training's error: 1.48982	valid_1's error: 1.66231
[12]	training's error: 1.47648	valid_1's error: 1.65253
[13]	training's error: 1.46354	valid_1's error: 1.64313
[14]	training's error: 1.45115	valid_1's error: 1.63393
[15]	training's error: 1.441	valid_1's error: 1.62677
[16]	training's error: 1.43094	valid_1's error: 1.6196
[17]	training's error: 1.42046	valid_1's error: 1.61323
