In [1]:
'''
在测试数据集上的表现
'''
import numpy as np
import xgboost as xgb
from scipy.sparse import coo_matrix, csc_matrix
import math
import sys
from sklearn.externals import joblib



In [2]:
# Input locations handed to run_check() further down. These are absolute
# paths on a Windows drive, so they must be adjusted to run elsewhere.
# Comma-separated test samples; get_test_data() reads the label from the last column.
output_test_file = "F:/db/tmp/data/gbdt_test/output_test.txt"
# Passed through to get_test_data() as feature_num_file (not read by the code shown here).
output_feature_num_file = "F:/db/tmp/data/gbdt_test/feature_num.txt"
# Saved xgboost model, loaded with xgb.Booster(model_file=...).
output_model_file = "F:/db/tmp/data/gbdt_test/gbdt.model"

In [3]:
# Load the test file
def get_test_data(test_file, feature_num_file):
    '''
    Read the comma-separated test set and split it into features and labels.

    The first ``total_feature`` (103) columns are the features and the last
    column of each row is the label.

    :param test_file:
        path (or file-like object) of the comma-separated test data
    :param feature_num_file:
        accepted for interface compatibility; it is not read here, the
        feature count is the fixed constant below
    :return:
        test_feature: float32 array of shape (n_samples, 103)
        test_label:   float32 array of shape (n_samples,)
    '''
    # The data set has 103 feature columns.
    total_feature = 103

    # Parse the file once: the feature columns followed by the last column
    # (the label). genfromtxt resolves the negative index against the row width.
    wanted_columns = list(range(total_feature)) + [-1]
    table = np.genfromtxt(test_file, dtype = np.float32, delimiter = ",", usecols = wanted_columns)

    # genfromtxt squeezes a single-row file down to 1-D; force a 2-D table so
    # callers can always rely on a (n_samples, n_features) layout.
    table = table.reshape(-1, total_feature + 1)

    # Hand back independent, contiguous arrays rather than views of `table`.
    test_feature = np.ascontiguousarray(table[:, :total_feature])
    test_label = table[:, -1].copy()
    return test_feature, test_label

In [4]:
# Scoring function for the tree model
def predict_by_tree(test_feature, tree_model):
    '''
    Score the test features with the given tree model.

    :param test_feature:
        test features
    :param tree_model:
        model object exposing ``predict``
    :return:
        whatever ``tree_model.predict`` returns for the wrapped features
    '''
    # The model's predict API is fed an xgboost DMatrix, so wrap the
    # features first and then delegate.
    dmatrix = xgb.DMatrix(test_feature)
    return tree_model.predict(dmatrix)

In [5]:
# AUC
def get_auc(predict_list, test_label):
    '''
    Compute, print and return the AUC using the rank-sum formula.

    Samples are ranked by ascending score (1-based). Samples sharing the same
    score all receive the mean rank of their tie group, so the result does not
    depend on the input order.

        auc = (sum(pos_ranks) - pos_num * (pos_num + 1) / 2) / (pos_num * neg_num)

    A label equal to 0 is a negative sample; any other label is positive.

    :param predict_list:
        predicted scores, one per sample
    :param test_label:
        true labels, indexable in parallel with predict_list
    :return:
        the AUC as a float
    :raises ValueError:
        if the labels do not contain both a positive and a negative sample
    '''
    # Pair each score with its label and sort ascending by score.
    ranked = sorted(
        ((predict_list[index], test_label[index]) for index in range(len(predict_list))),
        key = lambda pair: pair[0],
    )
    total_num = len(ranked)

    pos_num = 0
    # Sum of the (tie-averaged) ranks of all positive samples.
    total_pos_rank = 0.0
    start = 0
    while start < total_num:
        # Extend [start, end) over every sample tied with ranked[start].
        # Starting at start + 1 guarantees progress even for NaN scores.
        end = start + 1
        while end < total_num and ranked[end][0] == ranked[start][0]:
            end += 1
        tied_pos = sum(1 for _, label in ranked[start:end] if label != 0)
        if tied_pos:
            # 1-based ranks start+1 .. end share their mean.
            mean_rank = (start + 1 + end) / 2.0
            pos_num += tied_pos
            total_pos_rank += mean_rank * tied_pos
        start = end

    neg_num = total_num - pos_num
    if pos_num == 0 or neg_num == 0:
        raise ValueError("AUC is undefined unless both positive and negative samples are present")

    auc_score = (total_pos_rank - pos_num * (pos_num + 1) / 2.0) / (pos_num * neg_num)
    print("auc: %5f" %(auc_score))
    return auc_score

In [6]:
# Accuracy
def get_accuary(predict_list, test_label, score_thr = 0.8):
    '''
    Compute, print and return the accuracy of thresholded predictions.

    A score greater than or equal to ``score_thr`` is predicted as label 1,
    anything below it as label 0. The first ten predicted labels and the first
    ten true labels are printed for a quick visual comparison.

    :param predict_list:
        predicted scores, one per sample
    :param test_label:
        true labels, indexable and sliceable in parallel with predict_list
    :param score_thr:
        decision threshold; defaults to 0.8
    :return:
        fraction of samples whose predicted label equals the true label
    :raises ValueError:
        if predict_list is empty
    '''
    total_num = len(predict_list)
    if total_num == 0:
        raise ValueError("accuracy is undefined for an empty prediction list")

    # Threshold every score into a hard 0/1 label.
    predict_label_list = [1 if predict_score >= score_thr else 0 for predict_score in predict_list]

    # Count the predictions that match the true label.
    right_num = sum(
        1
        for index, predict_label in enumerate(predict_label_list)
        if predict_label == test_label[index]
    )

    # Predicted labels, then true labels.
    print(predict_label_list[:10], test_label[:10])
    accuary_score = float(right_num) / total_num
    print("accuary: %.5f" %(accuary_score))
    return accuary_score

In [7]:
def run_check_core(test_feature, test_label, model, score_func):
    '''
    Score the test set with ``score_func`` and report AUC and accuracy.

    :param test_feature:
        test features
    :param test_label:
        test labels
    :param model:
        tree model, passed to score_func as its second argument
    :param score_func:
        scoring function called as score_func(test_feature, model)
    '''
    # Per the original author's note, each score is the predicted
    # probability that the sample's label is 1.
    scores = score_func(test_feature, model)

    # Report AUC first, then accuracy.
    get_auc(scores, test_label)
    get_accuary(scores, test_label)

    # Sample output recorded by the original author:
    #   auc: 0.920596
    #   predicted:
    #       [0, 0, 0, 1, 0, 1, 0, 0, 1, 0]
    #   actual:
    #       [ 0.  0.  1.  1.  0.  1.  0.  0.  1.  0.]
    #   The labels could be stripped from the test set to simulate
    #   prediction in a real environment.
    #   accuary: 0.83386

In [8]:
# GBDT performance on the test set
def run_check(test_file, tree_model_file, feature_num_file):
    '''
    Load the test data and a saved model, then report the evaluation metrics.

    :param test_file:
        test set file
    :param tree_model_file:
        saved tree model file
    :param feature_num_file:
        feature-dimension file, forwarded to get_test_data
    :return:
        None
    '''
    features, labels = get_test_data(test_file, feature_num_file)
    # Restore the trained booster from disk.
    booster = xgb.Booster(model_file = tree_model_file)
    run_check_core(features, labels, booster, predict_by_tree)

In [9]:
# Evaluate the saved model on the test file; the AUC and accuracy are printed below.
run_check(output_test_file, output_model_file, output_feature_num_file)

auc: 0.917323
[0, 0, 0, 1, 0, 0, 0, 0, 1, 0] [0. 0. 1. 1. 0. 1. 0. 0. 1. 0.]
accuary: 0.82171
