In [1]:
'''
训练GBDT
'''

import xgboost as xgb
import numpy as np
import sys
from sklearn.linear_model import LogisticRegressionCV as LRCV
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
# 将模型整体实例化输出
from sklearn.externals import joblib



In [2]:
output_train_file = "F:/db/tmp/data/gbdt_test/output_train.txt"
output_feature_num_file = "F:/db/tmp/data/gbdt_test/feature_num.txt"
output_model_file = "F:/db/tmp/data/gbdt_test/gbdt.model"

In [3]:
# 得到训练数据
def get_train_data(train_file, feature_num_file):
    '''
    :param train_file:
    :param feature_num_file:
    :return:
    '''
    # 获取总的特征数目
    # total_feature_num = GF.get_feature_num(feature_num_file)
    total_feature_num = 103
    
    # label
    train_label = np.genfromtxt(train_file, dtype = np.int32, delimiter = ",", usecols = -1)
    
    # feature
    feature_list = range(int(total_feature_num))
    train_feature = np.genfromtxt(train_file, dtype = np.int32, delimiter = ",", usecols = feature_list)
    return train_feature, train_label

In [4]:
# gbdt核心代码
def train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate):
    '''
    :param train_mat:
        data AND label
    :param tree_depth:
        深度
    :param tree_num:
        树的个数
    :param learning_rate:
        步长
    :return:
        Booster结构的数据
    ''' 
    
    # 优化目标函数: 回归问题的线性优化 "objective": "reg:linear",
    # 不输出的一些信息 "silent": 1
    para_dict = {"max_path": tree_depth, "eta": learning_rate, "objective": "reg:squarederror", "silent": 1}
    bst = xgb.train(para_dict, train_mat, tree_num)
    
    # 利用交叉验证（5折交叉验证）查看一些训练指标
    # 每一棵树的 auc
    print(xgb.cv(para_dict, train_mat, tree_num, nfold = 5, metrics = {"auc"}))
    
    return bst

In [5]:
# 选择最优参数的参数列表
def choose_parameter():
    '''
    :return:
        list
            [(tree_depth, tree_num, step_size) ...]
    '''
    result_list = []
    tree_depth_list = [4, 5, 6]
    tree_num_list = [10, 50, 100]
    learning_rate_list = [0.3, 0.5, 0.7]
    for ele_tree_depth in tree_depth_list:
        for ele_tree_num in tree_num_list:
            for ele_learning_rate in learning_rate_list:
                result_list.append((ele_tree_depth, ele_tree_num, ele_learning_rate))

    return result_list

In [6]:
# 为gbdt选取最优参数
def grid_search(train_mat):
    '''
    :param train_mat:
         data AND label
    :return:
    '''
    # 构建参数列表
    para_list = choose_parameter()

    for ele in para_list:
        (tree_depth, tree_num, learning_rate) = ele
        para_dict = {"max_path": tree_depth, "eta": learning_rate, "objective": "reg:squarederror", "silent": 1}
        res = xgb.cv(para_dict, train_mat, tree_num, nfold = 5, metrics = {"auc"})
        print(res)
        auc_score = res.loc[tree_num - 1, ["test-auc-mean"]].values[0]
        
        # GBDT
        # 输出每一组参数以及auc的得分
        # tree_depth: 6, tree_num: 50, learning_tare: 0.3, auc: 0.923335
        # tree_depth: 5, tree_num: 50, learning_tare: 0.3, auc: 0.923335
        # tree_depth: 4, tree_num: 50, learning_tare: 0.3, auc: 0.923335
        print("tree_depth: %s, tree_num: %s, learning_tare: %s, auc: %f" \
              % (tree_depth, tree_num, learning_rate, auc_score))

In [7]:
# 训练gbdt
def train_tree_model(train_file, feature_num_file, tree_model_file):
    '''
    :param train_file:
        筛选好的训练数据
    :param feature_num_file:
        特征维度
    :param tree_model_file
        存储模型
    '''

    train_feature, train_label = get_train_data(train_file, feature_num_file)
    
    # 包装为xgboost需要的数据格式
    train_mat = xgb.DMatrix(train_feature, train_label)
    
    # fm = fm - 1 + step_size * Tm
    '''先手动设置参数，需要根据指标选择最优参数'''
    # 树的个数
    # tree_num = 50
    tree_num = 10
    # 树的深度
    # tree_depth = 6
    tree_depth = 4
    # 步长
    learning_rate = 0.3
    # learning_rate = 0.3
    
    '''选择最优参数'''
    # grid_search(train_mat)
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(tree_model_file)

In [8]:
if __name__ == "__main__":
    train_tree_model(output_train_file, output_feature_num_file, output_model_file)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.897219       0.002147       0.892260      0.001693
1        0.905737       0.001149       0.900225      0.003617
2        0.911490       0.000854       0.905588      0.003665
3        0.915301       0.000697       0.908965      0.003656
4        0.917767       0.001026       0.911052      0.003666
5        0.920263       0.001220       0.912948      0.003493
6        0.922710       0.001413       0.914579      0.003290
7        0.924379       0.001134       0.915629      0.003058
8        0.926490       0.000949       0.917177      0.003317
9        0.928015       0.000914       0.917806      0.003342
