In [1]:
'''
在测试数据集上的表现
'''
import numpy as np
import xgboost as xgb
from scipy.sparse import coo_matrix, csc_matrix
import math
import sys
from sklearn.externals import joblib



In [2]:
# Absolute locations of the evaluation inputs and the saved models.
# Test samples; passed as `test_file` in the final cell.
output_test_file = "F:/db/tmp/data/gbdt_test/output_test.txt"
# Passed as `feature_num_file` in the final cell (the loader does not open it).
output_feature_num_file = "F:/db/tmp/data/gbdt_test/feature_num.txt"
# Not referenced by any of the visible cells.
output_model_file = "F:/db/tmp/data/gbdt_test/gbdt.model"
# Mixed (GBDT + LR) model files
# Tree part of the mixed model; loaded with xgb.Booster in run_check_lr_gbdt.
mix_tree_model_file = "F:/db/tmp/data/gbdt_test/gbdt_tree.model"
# Not referenced by any of the visible cells.
mix_lr_coef_model_file = "F:/db/tmp/data/gbdt_test/gbdt_lr_coef.model"
# LR part of the mixed model; loaded with joblib in run_check_lr_gbdt.
mix_lr_model_file = "F:/db/tmp/data/gbdt_test/gbdt_lr.model"

In [3]:
# Load the test file
def get_test_data(test_file, feature_num_file, total_feature=103):
    '''
    Read the comma-separated test file and split it into features and labels.

    The file is parsed once: the first `total_feature` columns are the
    features and the last column of each row is the label.

    :param test_file:
        path of the comma-separated test file
    :param feature_num_file:
        accepted for interface compatibility; this function does not read it
    :param total_feature:
        number of leading columns to treat as features (defaults to 103)
    :return:
        np array
            test features, float32, shape (rows, total_feature)
            test labels, float32, shape (rows,)
    '''
    # Leading feature columns plus the last column; genfromtxt resolves the
    # negative index against the column count of the first row.
    wanted_cols = list(range(total_feature)) + [-1]
    table = np.genfromtxt(test_file, dtype = np.float32, delimiter = ",", usecols = wanted_cols)

    # A single-row file comes back 1-D; restore the row axis so callers always
    # receive a 2-D feature matrix and a 1-D label vector.
    table = np.atleast_2d(table)

    # Copy the slices so both results are contiguous, independent arrays.
    test_feature = np.ascontiguousarray(table[:, :total_feature])
    test_label = np.ascontiguousarray(table[:, -1])
    return test_feature, test_label

In [4]:
# Scoring function for the tree model
def predict_by_tree(test_feature, tree_model):
    '''
    Score the test features with the tree model.

    :param test_feature:
        test features; wrapped in an xgb.DMatrix before prediction
    :param tree_model:
        model object whose predict() is called on the DMatrix
    :return:
        whatever tree_model.predict returns for the wrapped features
    '''
    # The model's predict API takes a DMatrix, so convert first.
    wrapped_feature = xgb.DMatrix(test_feature)
    scores = tree_model.predict(wrapped_feature)
    return scores

In [5]:
# AUC
def get_auc(predict_list, test_label):
    '''
    Compute, print and return the rank-based AUC of the predictions.

    Samples are ranked by ascending score (ranks start at 1) and

        auc = (sum(pos_rank) - pos_num * (pos_num + 1) / 2) / (pos_num * neg_num)

    Samples that share the same score all receive the average rank of their
    tie group, so the result does not depend on the order of the input.

    :param predict_list:
        predicted scores, one per sample
    :param test_label:
        true labels; 0 is a negative sample, anything else a positive sample
    :return:
        the AUC as a float, or nan when the positive or the negative class
        is missing (the AUC is undefined in that case)
    '''
    ranked = sorted(
        [(predict_list[index], test_label[index]) for index in range(len(predict_list))],
        key = lambda pair: pair[0],
    )
    total_num = len(ranked)

    pos_num = 0
    pos_rank_sum = 0.0
    start = 0
    while start < total_num:
        # Extend [start, stop) over every sample sharing this score.
        stop = start + 1
        while stop < total_num and ranked[stop][0] == ranked[start][0]:
            stop += 1
        tied_pos = sum(1 for _, label in ranked[start:stop] if label != 0)
        # The group occupies 1-based ranks start + 1 .. stop; use their mean.
        pos_rank_sum += tied_pos * (start + 1 + stop) / 2
        pos_num += tied_pos
        start = stop
    neg_num = total_num - pos_num

    if pos_num == 0 or neg_num == 0:
        # No positive/negative pair exists, so there is nothing to rank.
        auc_score = float("nan")
    else:
        auc_score = (pos_rank_sum - pos_num * (pos_num + 1) / 2) / (pos_num * neg_num)
    print("auc: %5f" %(auc_score))
    return auc_score

In [6]:
# Accuracy
def get_accuary(predict_list, test_label, score_thr=0.8):
    '''
    Compute, print and return the accuracy of the thresholded predictions.

    :param predict_list:
        predicted scores, one per sample
    :param test_label:
        true labels, compared for equality against the predicted 0/1 label
    :param score_thr:
        decision threshold; a score >= score_thr is predicted as 1,
        otherwise 0 (defaults to 0.8)
    :return:
        fraction of samples whose predicted label equals the true label,
        or nan when predict_list is empty
    '''
    predict_label_list = [1 if predict_score >= score_thr else 0 for predict_score in predict_list]
    # Count the samples that were predicted correctly.
    right_num = sum(
        1
        for index, predict_label in enumerate(predict_label_list)
        if predict_label == test_label[index]
    )
    # First ten predicted labels next to the first ten true labels.
    print(predict_label_list[:10], test_label[:10])
    total_num = len(predict_label_list)
    # Accuracy is undefined without samples; avoid dividing by zero.
    accuary_score = right_num / total_num if total_num else float("nan")
    print("accuary: %.5f" %(accuary_score))
    return accuary_score

In [7]:
# Feature transform: turn per-tree leaf indices into indicator features
def get_gbdt_and_lr_feature(tree_leaf, tree_num, tree_depth):
    '''
    Build a 0/1 indicator matrix from the leaf index each sample reached.

    Every tree gets a block of `np.max(tree_leaf)` columns; for the tree at
    position j a leaf index v sets column `j * width - 1 + v`.

    NOTE(review): the block width comes from the largest index in this input,
    so the number of columns depends on the data passed in - confirm it
    matches the layout the downstream model expects.
    NOTE(review): a leaf index of 0 lands on column `j * width - 1`, i.e. the
    previous tree's block (or the last column for the first tree) - confirm
    that index 0 cannot occur.

    :param tree_leaf:
        2-D leaf indices, one row per sample and one column per tree
    :param tree_num:
        not used by this function
    :param tree_depth:
        not used by this function
    :return:
        int64 array of shape (samples, trees * max leaf index)
    '''
    block_width = np.max(tree_leaf)
    sample_count = len(tree_leaf)
    tree_count = len(tree_leaf[0])
    indicator = np.zeros([sample_count, tree_count * block_width], dtype=np.int64)

    # Column hit by every (sample, tree) pair, computed for all rows at once.
    hit_cols = np.arange(tree_count) * block_width - 1 + np.array(tree_leaf)
    row_ids = np.arange(sample_count)[:, np.newaxis]
    indicator[row_ids, hit_cols] += 1

    return indicator

In [8]:
# Scoring function for the loaded GBDT + LR model pair
def predict_by_lr_gbdt(test_feature, mix_tree_model, mix_lr_coef, tree_info):
    '''
    Score the test features with the tree model followed by the LR model.

    :param test_feature:
        test features; wrapped in an xgb.DMatrix for the tree model
    :param mix_tree_model:
        tree model; its predict() is called with pred_leaf = True
    :param mix_lr_coef:
        model object providing predict_proba()
    :param tree_info:
        3-tuple unpacked as (tree_num, tree_depth, step_size);
        step_size is not used here
    :return:
        list holding column 1 of predict_proba's output for every sample
    '''
    # Leaf each sample ends up in, for every tree of the GBDT.
    leaf_index = mix_tree_model.predict(xgb.DMatrix(test_feature), pred_leaf = True)
    tree_num, tree_depth, step_size = tree_info
    # Feature transform: leaf indices -> indicator features for the LR model.
    lr_input = get_gbdt_and_lr_feature(leaf_index, tree_num, tree_depth)

    # One probability row per sample. Per the original author's note the row
    # is [P(label 0), P(label 1)], e.g. [ 0.99811782  0.00188218].
    prob_list = mix_lr_coef.predict_proba(lr_input)
    print(prob_list[0])

    # Keep only the second entry (label 1) of every row.
    return [prob_row[1] for prob_row in prob_list]

In [9]:
# Logistic (sigmoid) function
def sigmoid(x):
    '''
    Numerically stable logistic function 1 / (1 + e^(-x)).

    :param x:
        real number
    :return:
        value in [0, 1]; saturates to 0.0 / 1.0 for inputs of very large
        magnitude instead of raising OverflowError
    '''
    if x >= 0:
        # exp(-x) <= 1 here, so it cannot overflow.
        return 1 / (1 + math.exp(-x))
    # For negative x use the equivalent form e^x / (1 + e^x): exp(x) < 1,
    # whereas exp(-x) would overflow once x drops below roughly -709.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [10]:
def run_check_lr_gbdt_core(test_feature, test_label, mix_tree_model, mix_lr_coef, tree_info, score_func):
    '''
    Score the test set with `score_func` and report AUC and accuracy.

    :param test_feature:
        test features
    :param test_label:
        test labels
    :param mix_tree_model:
        tree model of the mixed model, forwarded to score_func
    :param mix_lr_coef:
        LR model of the mixed model, forwarded to score_func
    :param tree_info:
        parameters of the tree model, forwarded to score_func
    :param score_func:
        callable invoked as
        score_func(test_feature, mix_tree_model, mix_lr_coef, tree_info);
        its result is used as the list of scores
    '''
    # Scores produced by the model for every test sample.
    sample_scores = score_func(test_feature, mix_tree_model, mix_lr_coef, tree_info)
    # Each metric helper prints its own result.
    for report_metric in (get_auc, get_accuary):
        report_metric(sample_scores, test_label)

In [11]:
# Performance of GBDT + LR on the test set
def run_check_lr_gbdt(test_file, tree_mix_model_file, lr_coef_mix_model_file, feature_num_file):
    '''
    Load the test data and both saved models, then report the metrics.

    :param test_file:
        test file
    :param tree_mix_model_file:
        tree model of the mixed model, loaded with xgb.Booster
    :param lr_coef_mix_model_file:
        LR model of the mixed model, loaded with joblib.load
    :param feature_num_file:
        forwarded to get_test_data
    :return:
        None; the metrics are printed by the helpers
    '''
    features, labels = get_test_data(test_file, feature_num_file)

    # Tree part of the mixed model.
    booster = xgb.Booster(model_file = tree_mix_model_file)
    # LR part of the mixed model: a persisted model instance.
    lr_model = joblib.load(lr_coef_mix_model_file)

    # Parameters of the tree part of the mixed model: (num, depth, step).
    mix_tree_info = (10, 4, 0.3)
    run_check_lr_gbdt_core(features, labels, booster, lr_model, mix_tree_info, predict_by_lr_gbdt)

In [12]:
# Evaluate the saved GBDT + LR pair on the test file. The LR argument is the
# model file (mix_lr_model_file), not the coefficient file.
run_check_lr_gbdt(
    test_file=output_test_file,
    tree_mix_model_file=mix_tree_model_file,
    lr_coef_mix_model_file=mix_lr_model_file,
    feature_num_file=output_feature_num_file,
)

[0.99712085 0.00287915]
auc: 0.918920
[0, 0, 0, 1, 0, 0, 0, 0, 1, 0] [0. 0. 1. 1. 0. 1. 0. 0. 1. 0.]
accuary: 0.83918
