In [1]:
'''
训练GBDT_LR
'''

import xgboost as xgb
import numpy as np
import sys
from sklearn.linear_model import LogisticRegressionCV as LRCV
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
# 将模型整体实例化输出
from sklearn.externals import joblib



In [2]:
output_train_file = "F:/db/tmp/data/gbdt_test/output_train.txt"
output_feature_num_file = "F:/db/tmp/data/gbdt_test/feature_num.txt"
output_model_file = "F:/db/tmp/data/gbdt_test/gbdt.model"
mix_tree_model_file = "F:/db/tmp/data/gbdt_test/gbdt_tree.model"
mix_lr_model_file = "F:/db/tmp/data/gbdt_test/gbdt_lr.model"
# 参数化模型
mix_lr_model_coef_file = "F:/db/tmp/data/gbdt_test/gbdt_lr_coef.model"

In [3]:
# 得到训练数据
def get_train_data(train_file, feature_num_file):
    '''
    :param train_file:
    :param feature_num_file:
    :return:
    '''
    # 获取总的特征数目
    # total_feature_num = GF.get_feature_num(feature_num_file)
    total_feature_num = 103
    
    # label
    train_label = np.genfromtxt(train_file, dtype = np.int32, delimiter = ",", usecols = -1)
    
    # feature
    feature_list = range(int(total_feature_num))
    train_feature = np.genfromtxt(train_file, dtype = np.int32, delimiter = ",", usecols = feature_list)
    return train_feature, train_label

In [4]:
# gbdt核心代码
def train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate):
    '''
    :param train_mat:
        data AND label
    :param tree_depth:
        深度
    :param tree_num:
        树的个数
    :param learning_rate:
        步长
    :return:
        Booster结构的数据
    ''' 
    
    # 优化目标函数: 回归问题的线性优化 "objective": "reg:linear",
    # 不输出的一些信息 "silent": 1
    para_dict = {"max_path": tree_depth, "eta": learning_rate, "objective": "reg:squarederror", "silent": 1}
    bst = xgb.train(para_dict, train_mat, tree_num)
    
    # 利用交叉验证（5折交叉验证）查看一些训练指标
    # 每一棵树的 auc
    print(xgb.cv(para_dict, train_mat, tree_num, nfold = 5, metrics = {"auc"}))
    
    return bst

In [5]:
# 选择最优参数的参数列表
def choose_parameter():
    '''
    :return:
        list
            [(tree_depth, tree_num, step_size) ...]
    '''
    result_list = []
    tree_depth_list = [4, 5, 6]
    tree_num_list = [10, 50, 100]
    learning_rate_list = [0.3, 0.5, 0.7]
    for ele_tree_depth in tree_depth_list:
        for ele_tree_num in tree_num_list:
            for ele_learning_rate in learning_rate_list:
                result_list.append((ele_tree_depth, ele_tree_num, ele_learning_rate))

    return result_list

In [6]:
# 为gbdt选取最优参数
def grid_search(train_mat):
    '''
    :param train_mat:
         data AND label
    :return:
    '''
    # 构建参数列表
    para_list = choose_parameter()

    for ele in para_list:
        (tree_depth, tree_num, learning_rate) = ele
        para_dict = {"max_path": tree_depth, "eta": learning_rate, "objective": "reg:squarederror", "silent": 1}
        res = xgb.cv(para_dict, train_mat, tree_num, nfold = 5, metrics = {"auc"})
        print(res)
        auc_score = res.loc[tree_num - 1, ["test-auc-mean"]].values[0]
        
        # GBDT
        # 输出每一组参数以及auc的得分
        # tree_depth: 6, tree_num: 50, learning_tare: 0.3, auc: 0.923335
        # tree_depth: 5, tree_num: 50, learning_tare: 0.3, auc: 0.923335
        # tree_depth: 4, tree_num: 50, learning_tare: 0.3, auc: 0.923335
        print("tree_depth: %s, tree_num: %s, learning_tare: %s, auc: %f" \
              % (tree_depth, tree_num, learning_rate, auc_score))

In [7]:
# 训练gbdt + lr的混合模型
# gbdt和lr需要分开单独训练
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model, mix_lr_model_file):
    '''
    :param tain_file:
        训练数据集
    :param feature_num_file:
        记录特征维度的文件
    :param mix_tree_model:
        混合模型 树模型部分的文件
    :param mix_lr_model_file:
        混合模型 lr部分的文件
    '''
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    # 将数据结构转换为gbdt需要的数据结构
    train_mat = xgb.DMatrix(train_feature, train_label)
    # 选取的最优参数
    # tree_num, tree_depth, learning_rate = 50, 4, 0.3
    tree_num = 10
    tree_depth = 4
    learning_rate = 0.3
    '''GBDT'''
    # 树模型
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    # 存储树模型
    bst.save_model(mix_tree_model)

    # lr所需的特征是由树经过编码得到的(只需要将叶子节点编程离散化的特征)
    # 用树模型预测样本（看样本最终落在哪个节点上[在 1， 不在 0]）
    tree_leaf = bst.predict(train_mat, pred_leaf = True)
    
    # print(len(tree_leaf), "30162_tree_leaf")
    # 10棵树，表示每棵树最终样本输出的结果是落到了哪个叶子节点上
    # 特征: 样本数 => 1: 100
    # 叶子节点: 2 ** tree_depth
    # 非叶子节点: 叶子节点 - 1
    print(tree_leaf[0])
    # print(tree_leaf)
    # print(np.max(tree_leaf))
    
    # 将样本落在哪个节点上的数据进行加工，最终获取训练lr所需要的特征
    # 需要叶子节点最终落在了哪个叶子节点[0, 0, 0, 1, 0, 0]
    # 特征转换 -> 叶子节点最终落在了哪个叶子节点上
    num_leaf_max = np.max(tree_leaf)
    total_feature_list = np.zeros([len(tree_leaf),len(tree_leaf[0]) * num_leaf_max], dtype=np.int64)
    for i in range(0, len(tree_leaf)):
        temp = np.arange(len(tree_leaf[0])) * num_leaf_max - 1 + np.array(tree_leaf[i])
        total_feature_list[i][temp] += 1

        
    '''LR'''
    # AUC
    # 训练模型
    # 参数 [正则化参数], tol 迭代停止条件, max_iter 最大迭代次数, cv 交叉验证(将训练数据分为5份，每次拿20%为测试，80%为训练，一共进行5次), sol 优化方法(使用拟牛顿法, 默认)[希望所有的样本都可以参与到训练当中]
    # lr_cf = LRCV(Cs = [1, 10, 100], penalty = "l2", tol = 0.0001, max_iter = 500, cv = 5, scoring = "roc_auc").fit(total_feature_list, train_label)
    lr_cf = LRCV(Cs = [1], 
                 penalty = "l2", 
                 dual = False, 
                 tol = 0.0001, 
                 max_iter = 500, 
                 cv = 5, 
                 scoring = "roc_auc").fit(total_feature_list, train_label)
    
    # 5行3列的数组
    # scores = lr_cf.scores_.values()[0]
    scores = list(lr_cf.scores_.values())[0]
    # 每一个正则化参数对应的交叉验证的分值
    print("diff: %s" %(",".join([str(ele) for ele in scores.mean(axis = 0)])))
    # 平均auc
    '''
    diff: 0.89907602844,0.898857761654,0.89868638722
    AUC: 0.898873392438
    由此可得第一个参数最优
    Cs = [1, 10, 100] => Cs = [1]
    '''
    print("AUC: %s (+-%0.2f)" % (scores.mean(), scores.std() * 2))

    '''
        GBDT
            tree_depth: 4, tree_num: 50, learning_tare: 0.3, auc: 0.923335
        
        
        GBDT + LR
            diff: 0.936934169273
            AUC: 0.936934169273
    '''

    # 将模型整体实例化输出
    joblib.dump(lr_cf, mix_lr_model_file)

In [8]:
train_tree_and_lr_model(output_train_file, output_feature_num_file, mix_tree_model_file, mix_lr_model_file)

   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0        0.897219       0.002147       0.892260      0.001693
1        0.905737       0.001149       0.900225      0.003617
2        0.911490       0.000854       0.905588      0.003665
3        0.915301       0.000697       0.908965      0.003656
4        0.917767       0.001026       0.911052      0.003666
5        0.920263       0.001220       0.912948      0.003493
6        0.922710       0.001413       0.914579      0.003290
7        0.924379       0.001134       0.915629      0.003058
8        0.926490       0.000949       0.917177      0.003317
9        0.928015       0.000914       0.917806      0.003342
[57 65 64 55 61 61 63 57 61 61]
diff: 0.9282472511996396
AUC: 0.9282472511996396 (+-0.00)
