## GBDT+LR代码分析

In [None]:
# Scikit-learn实现

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df_train = pd.read_csv(r'F:\Data\recsys-data\gbdt+lr/train.csv')
df_test = pd.read_csv(r'F:\Data\recsys-data\gbdt+lr/test.csv')

In [3]:
NUMERIC_COLS = [
    "ps_reg_01", "ps_reg_02", "ps_reg_03",
    "ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
]

In [11]:
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=10,subsample = 0.6,max_depth=7,min_samples_split=900)

In [4]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,8000,20227,1,7,1,5,1,0,0,1,...,4,2,6,5,0,0,1,1,1,0
1,8001,20228,1,0,1,6,1,0,1,0,...,5,2,4,10,0,0,0,0,0,1
2,8002,20229,0,3,1,8,0,0,0,0,...,10,1,3,5,0,0,1,1,1,0
3,8003,20235,0,2,1,8,0,0,0,0,...,2,2,2,9,0,0,0,1,1,0
4,8004,20236,0,0,1,2,1,0,0,0,...,3,2,5,5,0,0,1,0,1,0


In [100]:
y_train = df_train['target']
y_test = df_test['target']
X_train = df_train[NUMERIC_COLS]
X_test = df_test[NUMERIC_COLS]

In [6]:
X_train.shape

(8001, 7)

In [23]:
lgb_train = lgb.Dataset(X_train,y_train)
lgb_eval = lgb.Dataset(X_test,y_test,reference=lgb_train)

## 设置子树为100颗，每颗树包含64支叶子的树模型。那么形成的中间特征向量为100*64

In [24]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 64,
    'num_trees': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [25]:
#  叶子节点数，用来进行特征转换使用
num_leaf = 64

In [26]:
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_train)



[1]	training's binary_logloss: 0.155602
[2]	training's binary_logloss: 0.155022
[3]	training's binary_logloss: 0.15441
[4]	training's binary_logloss: 0.153819
[5]	training's binary_logloss: 0.153267
[6]	training's binary_logloss: 0.152685
[7]	training's binary_logloss: 0.152144
[8]	training's binary_logloss: 0.151545
[9]	training's binary_logloss: 0.151029
[10]	training's binary_logloss: 0.15049
[11]	training's binary_logloss: 0.150069
[12]	training's binary_logloss: 0.149553
[13]	training's binary_logloss: 0.149064
[14]	training's binary_logloss: 0.148592
[15]	training's binary_logloss: 0.148111
[16]	training's binary_logloss: 0.147618
[17]	training's binary_logloss: 0.147086
[18]	training's binary_logloss: 0.146624
[19]	training's binary_logloss: 0.146184
[20]	training's binary_logloss: 0.145696
[21]	training's binary_logloss: 0.145182
[22]	training's binary_logloss: 0.144704
[23]	training's binary_logloss: 0.144244
[24]	training's binary_logloss: 0.143804
[25]	training's binary_logl

In [27]:
print('Save model...')
# save model to file
gbm.save_model(r'F:\Data\recsys-data\gbdt+lr/model.txt')

Save model...


<lightgbm.basic.Booster at 0x13099093d68>

In [54]:
print('Start predicting...')
# predict and get data on leaves, training data
y_pred = gbm.predict(X_train, pred_leaf=True)

Start predicting...


In [56]:
X_train.shape

(8001, 7)

In [55]:
y_pred

array([[17,  0, 55, ...,  4, 63, 63],
       [62,  8, 58, ..., 47,  9, 57],
       [44,  0, 58, ..., 34, 62, 45],
       ...,
       [51, 19, 16, ..., 23, 33, 56],
       [61, 28, 58, ..., 53, 28, 18],
       [53, 29, 54, ...,  4, 63, 63]])

In [29]:
np.array(y_pred).shape

(8001, 100)

In [32]:
y_pred[0]
# 17,0每个数字代表每颗树的叶子节点索引

array([17,  0, 55, 44, 47,  8,  8, 39,  8,  8,  0,  0,  0,  0,  0,  0, 38,
       36, 36, 26, 15, 13, 38, 18, 41, 54, 45, 51, 55, 59, 15, 20,  2,  2,
        2, 63, 56, 26,  7, 25, 46, 58, 62, 26, 19, 48,  6, 51,  5, 45, 44,
        1, 44, 14, 33, 41, 10, 39, 49, 63, 51, 63, 20, 48, 52, 47,  8, 36,
        8,  8, 50,  0, 32, 21,  8, 23, 48, 48, 17, 49, 46, 10, 28, 12, 59,
       22, 12, 51, 34, 32, 15, 15, 53, 29, 29, 59, 59,  4, 63, 63])

In [36]:
transform_training_matrix = np.zeros([len(y_pred),len(y_pred[0])*num_leaf],dtype=np.int64) # N**num_tress*num_leaf

In [47]:
for i in range(0,len(y_pred)):
    temp = np.arange(len(y_pred[0]))*num_leaf + np.array(y_pred[i]) #  以64为一个周期，然后加上相应的节点位置
    transform_training_matrix[i][temp] += 1 # 找出索引对应的值，然后加1

In [83]:
transform_training_matrix.shape

(8001, 6400)

In [95]:
y_test_lgb = gbm.predict(X_test,pred_leaf=True)

In [65]:
# 将预测集进行onehot转换

In [86]:
len(y_test)

2000

In [96]:
transform_test_matrix = np.zeros([len(y_test_lgb),len(y_test_lgb[0])*num_leaf],dtype=np.int64)

In [88]:
transform_test_matrix.shape

(2000, 6400)

In [97]:
for i in range(len(y_test_lgb)):
    temp = np.arange(len(y_test[0]))*num_leaf + np.array(y_test_lgb[i])
    transform_test_matrix[i][temp] += 1

In [98]:
lm = LogisticRegression(penalty='l2',C=0.05)
lm.fit(transform_training_matrix,y_train)
y_pred_test = lm.predict_proba(transform_test_matrix)

In [99]:
y_pred_test.shape

(2000, 2)

In [101]:
NE = (-1) / len(y_pred_test) * sum(((1+y_test)/2 * np.log(y_pred_test[:,1]) +  (1-y_test)/2 * np.log(1 - y_pred_test[:,1])))
print("Normalized Cross Entropy " + str(NE))

Normalized Cross Entropy 2.213280152050503
