## GBDT

In [4]:
import functools
import os
import pickle
import time
from collections import namedtuple

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

def save_var(var, file_path):
    """Serialize ``var`` to ``file_path`` with pickle.

    :param var: any picklable Python object.
    :param file_path: destination path; an existing file is overwritten.
    """
    # The context manager closes (and therefore flushes) the handle even if
    # pickling raises, instead of leaving it open until garbage collection.
    with open(file_path, 'wb') as f:
        pickle.dump(var, f)
    
def load_var(file_path):
    """Load and return the object pickled at ``file_path``.

    :param file_path: path of a file previously written with pickle.
    :return: the unpickled Python object.

    WARNING: unpickling can execute arbitrary code. Only load files that this
    notebook (or another trusted source) produced.
    """
    # Close the handle deterministically instead of relying on garbage collection.
    with open(file_path, 'rb') as f:
        return pickle.load(f)

def timmer(func):
    """Decorator that prints how long each call to ``func`` takes.

    :param func: the callable to time.
    :return: a wrapper that forwards all arguments to ``func``, prints the
        elapsed time in seconds, and returns ``func``'s result unchanged.
    """
    # functools.wraps keeps func.__name__ / __doc__ on the wrapper so that
    # decorated functions stay introspectable.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        res = func(*args, **kwargs)
        stop_time = time.perf_counter()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res
    return wrapper

##### 获取数据

In [6]:
def get_criteo_data(train_path='../data/criteo/train.txt',
                    test_path='../data/criteo/test.txt'):
    """Read the Criteo train/test splits from tab-separated, header-less files.

    Each file is parsed as 40 columns: ``label``, 13 dense columns named
    ``I1``..``I13`` and 26 sparse columns named ``C14``..``C39``.

    :param train_path: path of the training file (defaults to the original
        hard-coded location).
    :param test_path: path of the test file (defaults to the original
        hard-coded location).
    :return: ``(train_df, test_df, dense_column_names, sparse_column_names)``.
    """
    dense_column_names = ['I' + str(i) for i in range(1, 14)]
    sparse_column_names = ['C' + str(i) for i in range(14, 40)]
    column_names = ['label'] + dense_column_names + sparse_column_names

    # The files carry no header row, so column names are supplied explicitly.
    train_df = pd.read_csv(train_path, names=column_names, sep='\t')
    test_df = pd.read_csv(test_path, names=column_names, sep='\t')
    return train_df, test_df, dense_column_names, sparse_column_names

# Load both splits once and print their shapes as a quick sanity check.
train_df, test_df, dense_column_names, sparse_column_names = get_criteo_data()
shape_report = 'train_df.shape: {}, test_df.shape: {}'.format(train_df.shape, test_df.shape)
print(shape_report)

train_df.shape: (1000000, 40), test_df.shape: (1000000, 40)


##### 数据预处理

In [8]:
# Categorical feature handling: cast every sparse column of both frames to
# the pandas 'category' dtype.
for frame in (train_df, test_df):
    for col in sparse_column_names:
        frame[col] = frame[col].astype('category')

##### 单模型训练与评估

In [10]:
def model_metric(prob, label, thr=0.5):
    """Print accuracy, ROC AUC, log loss and a classification report.

    :param prob: predicted scores, compared against ``thr`` to obtain hard
        predictions and passed as-is to the ROC / log-loss computations.
    :param label: ground-truth labels aligned with ``prob``.
    :param thr: cut-off applied to ``prob`` (``prob > thr`` counts as positive).
    :return: None; all results are printed.
    """
    # Hard predictions are shared by the accuracy score and the report.
    hard_pred = prob > thr

    # AUC from the ROC curve; the thresholds it returns are not needed.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(label, prob)
    auc_value = metrics.auc(false_pos_rate, true_pos_rate)

    accuracy = metrics.accuracy_score(label, hard_pred)
    cross_entropy = log_loss(label, prob)

    summary = '模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(accuracy, auc_value, cross_entropy)
    print(summary)
    print(classification_report(label, hard_pred, digits=2))
    print('==========================================================')

# Hold out 20% of the training frame as a validation set for early stopping.
x_train, x_val, y_train, y_val = train_test_split(
    train_df.drop(['label'], axis=1),
    train_df['label'],
    test_size=0.2,
    random_state=2018)

lgb_train = lgb.Dataset(x_train, y_train)
# A validation Dataset must reference the training Dataset it is evaluated against.
lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'auc'},
    'learning_rate': 0.03,
    'max_depth': 7,
    'num_leaves': 80,
    'min_data_in_leaf': 100,
    # 'subsample' is an alias of 'bagging_fraction'. The dict used to set both
    # (subsample=0.9, bagging_fraction=0.7), which is ambiguous to readers.
    # Only the canonical key is kept; LightGBM is expected to have preferred it
    # already, so the effective configuration should be unchanged.
    # NOTE(review): confirm the alias precedence for the installed version.
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'verbose': -1,
}

# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of older LightGBM releases (the ones that produced the logs below); newer
# releases expect the equivalent callbacks instead.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=40000,
                valid_sets=[lgb_eval],
                early_stopping_rounds=50,
                verbose_eval=10)







Training until validation scores don't improve for 50 rounds
[10]	valid_0's auc: 0.740678
[20]	valid_0's auc: 0.747433
[30]	valid_0's auc: 0.752304
[40]	valid_0's auc: 0.756224
[50]	valid_0's auc: 0.759542
[60]	valid_0's auc: 0.76248
[70]	valid_0's auc: 0.765032
[80]	valid_0's auc: 0.767313
[90]	valid_0's auc: 0.769482
[100]	valid_0's auc: 0.771563
[110]	valid_0's auc: 0.772939
[120]	valid_0's auc: 0.774273
[130]	valid_0's auc: 0.775532
[140]	valid_0's auc: 0.776522
[150]	valid_0's auc: 0.777268
[160]	valid_0's auc: 0.778023
[170]	valid_0's auc: 0.778661
[180]	valid_0's auc: 0.779223
[190]	valid_0's auc: 0.779638
[200]	valid_0's auc: 0.780106
[210]	valid_0's auc: 0.780504
[220]	valid_0's auc: 0.780883
[230]	valid_0's auc: 0.781219
[240]	valid_0's auc: 0.781519
[250]	valid_0's auc: 0.78187
[260]	valid_0's auc: 0.782165
[270]	valid_0's auc: 0.782409
[280]	valid_0's auc: 0.782665
[290]	valid_0's auc: 0.782881
[300]	valid_0's auc: 0.783125
[310]	valid_0's auc: 0.783339
[320]	valid_0's auc:

In [12]:
# Model prediction and evaluation on the test split.
test_features = test_df.drop(['label'], axis=1)
test_result = gbm.predict(test_features)
# np.asarray replaces an element-by-element Python copy of the prediction array.
model_metric(np.asarray(test_result), test_df['label'].values)

模型准确率:0.787677, AUC得分:0.7800132557415307, LogLoss:0.46045023784450323
              precision    recall  f1-score   support

           0       0.81      0.94      0.87    751819
           1       0.65      0.32      0.43    248181

    accuracy                           0.79   1000000
   macro avg       0.73      0.63      0.65   1000000
weighted avg       0.77      0.79      0.76   1000000



##### K折交叉验证

1. 将原始数据集划分为k折，每次取其中一折作为验证集，其余k-1折作为训练集；
2. 在训练集上训练模型，并在验证集上计算准确率；
3. 重复k次（每一折轮流作为验证集），取k次准确率的平均值作为最终准确率。

K折交叉验证能够更稳健地评估模型的泛化能力，有助于发现过拟合和欠拟合；同时，对各折训练出的多个模型的预测结果取平均，通常比单模型效果更好。

In [14]:
def get_prob_by_5model(feature, model_dir='./lgb_models'):
    """Average the predictions of the saved k-fold LightGBM models.

    The number of models is taken from the saved best-iteration list rather
    than hard-coded, so it always matches what was written alongside the model
    files. The "5" in the name is historical.

    :param feature: feature matrix to score; must expose ``.shape`` and be
        accepted by ``lgb.Booster.predict``.
    :param model_dir: directory holding ``best_iters.pkl`` and the
        ``model_kf_{i}.txt`` files.
    :return: 1-D array of length ``feature.shape[0]`` holding the mean
        prediction across all loaded models.
    :raises ValueError: if the best-iteration list is empty.
    """
    best_iterations = load_var(os.path.join(model_dir, 'best_iters.pkl'))
    n_models = len(best_iterations)
    if n_models == 0:
        raise ValueError('no best iterations found in {}'.format(model_dir))

    # One row of predictions per fold model.
    result = np.empty((n_models, feature.shape[0]))
    for i, best_iter in enumerate(best_iterations):
        print('model ', i)
        model_file = os.path.join(model_dir, 'model_kf_{}.txt'.format(i))
        gbm = lgb.Booster(model_file=model_file)
        print('gbm.best_iteration: ', best_iter)
        # Predict with the iteration count recorded for this fold.
        result[i, :] = gbm.predict(feature, num_iteration=best_iter)
    return result.mean(axis=0)

def train_model_by_kfold(X, y, n_splits=5, model_dir='./lgb_models'):
    """Train one LightGBM model per fold and persist models and best iterations.

    For every fold the model is saved to ``model_kf_{i}.txt`` inside
    ``model_dir``; the list of best iterations is pickled to ``best_iters.pkl``
    in the same directory.

    :param X: feature DataFrame (rows are selected positionally).
    :param y: labels aligned with the rows of ``X``.
    :param n_splits: number of folds.
    :param model_dir: output directory; created if it does not exist.
    :return: 1-D array of out-of-fold predictions, one per row of ``X``.
    """
    # Create the output directory up front so a missing folder cannot fail the
    # run after the first (long) fold has already been trained.
    os.makedirs(model_dir, exist_ok=True)
    best_iters_path = os.path.join(model_dir, 'best_iters.pkl')

    # Labels are indexed positionally below, so work on a plain ndarray.
    y = np.asarray(y)

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    kf_train = np.zeros((X.shape[0],))
    best_iters = []
    for i, (train_index, val_index) in enumerate(kfold.split(X)):
        # KFold yields positional indices, so use .iloc; .loc would select by
        # label and only agrees when X has a default RangeIndex.
        x_fold_train = X.iloc[train_index]
        x_fold_val = X.iloc[val_index]
        lgb_train = lgb.Dataset(x_fold_train, y[train_index])
        lgb_eval = lgb.Dataset(x_fold_val, y[val_index], reference=lgb_train)

        # A fresh dict per fold, so no state can be shared between lgb.train calls.
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': {'auc'},
            'learning_rate': 0.03,
            'max_depth': 7,
            'num_leaves': 80,
            'min_data_in_leaf': 100,
            # 'subsample' is an alias of 'bagging_fraction'; only the canonical
            # key is kept to avoid two conflicting values for one parameter.
            # NOTE(review): confirm the alias precedence for the installed version.
            'bagging_fraction': 0.7,
            'bagging_freq': 1,
            'verbose': -1
        }
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=40000,
                        valid_sets=[lgb_eval],
                        early_stopping_rounds=50,
                        verbose_eval=10)

        # Score the held-out fold with the same iteration count that is saved
        # below and later used at inference time.
        kf_train[val_index] = gbm.predict(
            x_fold_val,
            num_iteration=gbm.best_iteration)
        print('------ i: {}, gbm.best_iteration: {}'.format(i, gbm.best_iteration))

        # Save the model and its best number of boosting rounds.
        gbm.save_model(os.path.join(model_dir, 'model_kf_{}.txt'.format(i)))
        best_iters.append(gbm.best_iteration)
    save_var(best_iters, best_iters_path)
    return kf_train

# Keep the out-of-fold predictions instead of discarding them.
oof_train_pred = train_model_by_kfold(train_df.drop(['label'], axis=1), train_df['label'].values)







Training until validation scores don't improve for 50 rounds
[10]	valid_0's auc: 0.736511
[20]	valid_0's auc: 0.743939
[30]	valid_0's auc: 0.749507
[40]	valid_0's auc: 0.753578
[50]	valid_0's auc: 0.756914
[60]	valid_0's auc: 0.759704
[70]	valid_0's auc: 0.762485
[80]	valid_0's auc: 0.764845
[90]	valid_0's auc: 0.76717
[100]	valid_0's auc: 0.769445
[110]	valid_0's auc: 0.770971
[120]	valid_0's auc: 0.772323
[130]	valid_0's auc: 0.773547
[140]	valid_0's auc: 0.774504
[150]	valid_0's auc: 0.775257
[160]	valid_0's auc: 0.776102
[170]	valid_0's auc: 0.776763
[180]	valid_0's auc: 0.777306
[190]	valid_0's auc: 0.777805
[200]	valid_0's auc: 0.778253
[210]	valid_0's auc: 0.778608
[220]	valid_0's auc: 0.779035
[230]	valid_0's auc: 0.779395
[240]	valid_0's auc: 0.779732
[250]	valid_0's auc: 0.780012
[260]	valid_0's auc: 0.780293
[270]	valid_0's auc: 0.780589
[280]	valid_0's auc: 0.780798
[290]	valid_0's auc: 0.781109
[300]	valid_0's auc: 0.781362
[310]	valid_0's auc: 0.781569
[320]	valid_0's auc

[640]	valid_0's auc: 0.785256
[650]	valid_0's auc: 0.785267
[660]	valid_0's auc: 0.785287
[670]	valid_0's auc: 0.785286
[680]	valid_0's auc: 0.785304
[690]	valid_0's auc: 0.785314
[700]	valid_0's auc: 0.785354
[710]	valid_0's auc: 0.785367
[720]	valid_0's auc: 0.7854
[730]	valid_0's auc: 0.785466
[740]	valid_0's auc: 0.785483
[750]	valid_0's auc: 0.785502
[760]	valid_0's auc: 0.785504
[770]	valid_0's auc: 0.785519
[780]	valid_0's auc: 0.785525
[790]	valid_0's auc: 0.785524
[800]	valid_0's auc: 0.785519
[810]	valid_0's auc: 0.785543
[820]	valid_0's auc: 0.785535
[830]	valid_0's auc: 0.785547
[840]	valid_0's auc: 0.785526
[850]	valid_0's auc: 0.785534
[860]	valid_0's auc: 0.785546
[870]	valid_0's auc: 0.785538
[880]	valid_0's auc: 0.78556
[890]	valid_0's auc: 0.785563
[900]	valid_0's auc: 0.785564
[910]	valid_0's auc: 0.785564
[920]	valid_0's auc: 0.785582
[930]	valid_0's auc: 0.785568
[940]	valid_0's auc: 0.785601
[950]	valid_0's auc: 0.785627
[960]	valid_0's auc: 0.785626
[970]	valid_0

In [15]:
# Model prediction and evaluation using the averaged k-fold models.
test_features = test_df.drop(['label'], axis=1)
preds = get_prob_by_5model(test_features)
model_metric(preds, test_df['label'], thr=0.5)

model  0
gbm.best_iteration:  956
model  1
gbm.best_iteration:  860
model  2
gbm.best_iteration:  1008
model  3
gbm.best_iteration:  898
model  4
gbm.best_iteration:  996
模型准确率:0.788607, AUC得分:0.782457809799455, LogLoss:0.458584069653928
              precision    recall  f1-score   support

           0       0.81      0.94      0.87    751819
           1       0.65      0.32      0.43    248181

    accuracy                           0.79   1000000
   macro avg       0.73      0.63      0.65   1000000
weighted avg       0.77      0.79      0.76   1000000

