In [18]:
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

import lightgbm as lgb


df_train = pd.read_csv('https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('https://cdn.coggle.club/LightGBM/examples/binary_classification/binary.test.weight', header=None)[0]

In [19]:
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape

In [20]:
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
 
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
 
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)

In [21]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
 
# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name,
                categorical_feature=[21])

You can set `force_col_wise=true` to remove the overhead.
[1]	training's binary_logloss: 0.682311
[2]	training's binary_logloss: 0.674204
[3]	training's binary_logloss: 0.664662
[4]	training's binary_logloss: 0.655524
[5]	training's binary_logloss: 0.648385
[6]	training's binary_logloss: 0.64227
[7]	training's binary_logloss: 0.63507
[8]	training's binary_logloss: 0.628378
[9]	training's binary_logloss: 0.621862
[10]	training's binary_logloss: 0.615907


New categorical_feature is [21]


In [23]:
# save model to file
gbm.save_model('model.txt')
 
print('Dumping model to JSON...')
model_json = gbm.dump_model()
 
with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)

Dumping model to JSON...


In [42]:
# feature names
print('Feature names:', gbm.feature_name())
 
# feature importances
print('Feature importances:', list(gbm.feature_importance()))

Feature names: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27']
Feature importances: [7, 1, 1, 21, 5, 30, 2, 1, 1, 18, 7, 0, 1, 8, 2, 1, 0, 7, 3, 3, 0, 0, 36, 5, 32, 52, 28, 28]


In [43]:
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',
                valid_sets=lgb_eval)
print('Finished 10 - 20 rounds with model file...')


You can set `force_col_wise=true` to remove the overhead.
[11]	valid_0's binary_logloss: 0.615668
[12]	valid_0's binary_logloss: 0.61131
[13]	valid_0's binary_logloss: 0.607092
[14]	valid_0's binary_logloss: 0.603532
[15]	valid_0's binary_logloss: 0.600335
[16]	valid_0's binary_logloss: 0.596209
[17]	valid_0's binary_logloss: 0.592408
[18]	valid_0's binary_logloss: 0.588936
[19]	valid_0's binary_logloss: 0.585849
[20]	valid_0's binary_logloss: 0.582769
Finished 10 - 20 rounds with model file...


New categorical_feature is [21]


In [45]:

# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)
print('Finished 20 - 30 rounds with decay learning rates...')
 


You can set `force_col_wise=true` to remove the overhead.
[41]	valid_0's binary_logloss: 0.547133
[42]	valid_0's binary_logloss: 0.545865
[43]	valid_0's binary_logloss: 0.545081
[44]	valid_0's binary_logloss: 0.544072
[45]	valid_0's binary_logloss: 0.542853
[46]	valid_0's binary_logloss: 0.541275
[47]	valid_0's binary_logloss: 0.540361
[48]	valid_0's binary_logloss: 0.53926
[49]	valid_0's binary_logloss: 0.538636
[50]	valid_0's binary_logloss: 0.537447
Finished 20 - 30 rounds with decay learning rates...




In [46]:
# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finished 30 - 40 rounds with changing bagging_fraction...')

You can set `force_col_wise=true` to remove the overhead.
[51]	valid_0's binary_logloss: 0.536715
[52]	valid_0's binary_logloss: 0.536008
[53]	valid_0's binary_logloss: 0.535634
[54]	valid_0's binary_logloss: 0.534715
[55]	valid_0's binary_logloss: 0.533922
[56]	valid_0's binary_logloss: 0.533792
[57]	valid_0's binary_logloss: 0.532749
[58]	valid_0's binary_logloss: 0.53161
[59]	valid_0's binary_logloss: 0.530396
[60]	valid_0's binary_logloss: 0.52909
Finished 30 - 40 rounds with changing bagging_fraction...




In [47]:

# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess
 
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# binary error
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evalution metric calculate wrong results
# For example, we are doing log likelihood loss, the prediction is score before logistic transformation
# Keep this in mind when you use the customization
def binary_error(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'error', np.mean(labels != (preds > 0.5)), False
 
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=binary_error,
                valid_sets=lgb_eval)
print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')

You can set `force_col_wise=true` to remove the overhead.
[61]	valid_0's binary_logloss: 5.3439	valid_0's error: 0.264
[62]	valid_0's binary_logloss: 5.3365	valid_0's error: 0.262
[63]	valid_0's binary_logloss: 5.33553	valid_0's error: 0.26
[64]	valid_0's binary_logloss: 5.45143	valid_0's error: 0.264
[65]	valid_0's binary_logloss: 5.56931	valid_0's error: 0.268
[66]	valid_0's binary_logloss: 5.57122	valid_0's error: 0.264
[67]	valid_0's binary_logloss: 5.69048	valid_0's error: 0.268
[68]	valid_0's binary_logloss: 5.63137	valid_0's error: 0.266
[69]	valid_0's binary_logloss: 5.68446	valid_0's error: 0.268
[70]	valid_0's binary_logloss: 5.73838	valid_0's error: 0.27
Finished 40 - 50 rounds with self-defined objective function and eval metric...


