In [1]:
#-*-coding:utf8-*-

'''
模型在测试集上的表现
    auc高于0.7的模型可以部署上线
'''
import numpy as np
from sklearn.externals import joblib
import math



In [2]:
output_test_file = "F:/db/tmp/data/lr_test/output_test.txt"
# output_test_file = "F:/db/tmp/data/lr/output_test_no_label.txt"
model_coef_file = "F:/db/tmp/data/lr_test/lr_coef"
model_dump_file = "F:/db/tmp/data/lr_test/lr_dump"

In [3]:
# 读入测试文件
def get_test_data(test_file):
    '''
    :param test_file:
    :return:
        np array
            测试的特征
            测试的label
    '''
    # 有118个特征参数
    # total_feature = 118

    # 添加特征组合以后变为 174
    # total_feature = 174

    # 添加特征组合01和02以后变为 150
    total_feature = 150

    # label
    test_label = np.genfromtxt(test_file, dtype = np.float32, delimiter = ",", usecols = -1)

    feature_list = range(total_feature)
    test_feature = np.genfromtxt(test_file, dtype = np.float32, delimiter = ",", usecols = feature_list)
    return test_feature, test_label

In [4]:
# 实例化模型打分函数
def predict_by_lr_model(test_feature, lr_model):
    '''
    :param test_feature:
        测试的特征
    :param lr_model:
        实例化模型
    :return:
    '''
    # (15060, 150)
    # print(test_feature.shape)
    # # <class 'numpy.ndarray'>
    # print(type(test_feature))
    # sys.exit()


    result_list = []

    # 预测每一个样本label为 1 的概率
    prob_list = lr_model.predict_proba(test_feature)
    # label为0的概率， label为1的概率
    # [ 0.99811782  0.00188218]
    # print(prob_list[0])
    
    # 将每一个样本为 1 的概率输出
    for index in range(len(prob_list)):
        result_list.append(prob_list[index][1])
    return result_list


In [5]:
# 参数行模型打分函数
def predict_by_lr_coef(test_feature, lr_coef):

    # 将sigmoid函数转为np array适用的方法
    # 1个输入1个输出
    sigmoid_func = np.frompyfunc(sigmoid, 1, 1)

    # 参数和每一个测试样本的特征相乘再通过sigmoid计算概率
    return sigmoid_func(np.dot(test_feature, lr_coef))


In [6]:
# 定义阶跃模型(sigmoid)
def sigmoid(x):
    '''
    :param x:
    :return:
    '''
    return 1 / (1 + math.exp(-x))

In [7]:
# auc
def get_auc(predict_list, test_label):
    '''
    :param predict_list:
        模型预测label
    :param test_label:
        测试label
    # pos 正样本
    auc = (sum(pos_index) - pos_num(pos_num + 1) / 2) / pos_num * neg_num
    '''
    total_list = []
    for index in range(len(predict_list)):
        predict_score = predict_list[index]
        label = test_label[index]
        total_list.append((label, predict_score))
    # 排序
    sorted_total_list = sorted(total_list, key = lambda ele: ele[1])

    # 负样本
    neg_num = 0
    # 正样本
    pos_num = 0
    count = 1
    total_pos_index = 0
    for zuhe in sorted_total_list:
        label, predict_score = zuhe
        if label == 0:
            neg_num += 1
        else:
            pos_num += 1
            # 所有正样本的index + 所处的位置
            total_pos_index += count
        count += 1
    auc_score = (total_pos_index - pos_num * (pos_num + 1) / 2) / (pos_num * neg_num)
    print("auc: %5f" %(auc_score))

In [8]:
# 准确率
def get_accuracy(predict_list, test_label):
    print(predict_list, "predict_list")
    # 预测每一个样本label为 1 的概率 > 0.8 ? 1 : 0
    # 临界值 大于 正样本，小于 负样本
    score_thr = 0.8
    # 预测对的
    right_num = 0
    predict_label_list = []
    for index in range(len(predict_list)):
        predict_score = predict_list[index]
        if predict_score >= score_thr:
            predict_label = 1
        else:
            predict_label = 0
        predict_label_list.append(predict_label)
        if predict_label == test_label[index]:
            # 预测对的
            right_num += 1
    '''
    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0] [ 0.  0.  1.  1.  0.  1.  0.  0.  1.  0.]
    accuracy: 0.80737
    [0, 0, 0, 1, 0, 1, 0, 0, 1, 1] [ 0.  0.  1.  1.  0.  1.  0.  0.  1.  0.]
    accuracy: 0.83778
    '''
    print(predict_label_list[:10], test_label[:10])
    total_num = len(predict_list)
    accuracy_score = right_num / total_num
    print("accuracy: %.5f" %(accuracy_score))


In [9]:
def run_check_core(test_feature, test_label, model, score_func):
    '''
    :param test_feature:
    :param test_label:
    :param model:
        参数模型
        实例化模型
    :param score_func:
        使用不同的 model 来打分
    '''

    # 预测每一个样本label为 1 的概率
    predict_list = score_func(test_feature, model)
    # 计算auc
    # get_auc(predict_list, test_label)
    # 准确率
    get_accuracy(predict_list, test_label)

In [10]:
def run_check(test_file, lr_coef_file, lr_model_file):
    '''
    :param test_file:
        测试文件
    :param lr_coef_file:
        参数化的模型
            每一个特征所对应的参数
    :param lr_model_file:
        实例化的模型
            训练好整体导出的
    '''
    test_feature, test_label = get_test_data(test_file)
    # 读入参数模型
    lr_coef = np.genfromtxt(lr_coef_file, dtype = np.float32, delimiter = ",")
    # 读入实例化模型
    lr_model = joblib.load(lr_model_file)
    # 实例化模型打分
    run_check_core(test_feature, test_label, lr_model, predict_by_lr_model)
    # 参数化模型打分
    run_check_core(test_feature, test_label, lr_coef, predict_by_lr_coef)

In [11]:
if __name__ == "__main__":
    run_check(output_test_file, model_coef_file, model_dump_file)

[0.001811385095930419, 0.23425369061845644, 0.1719693368887944, 0.6415153967274243, 0.00757832640191531, 0.9330959836081079, 0.0025611496754787697, 0.10317119760267338, 0.5626529973806639, 0.543392278404805, 0.005600071664139567, 0.7080985227373481, 0.8687979450253658, 0.0020399971756721834, 0.3753347214229281, 0.01653587868131359, 0.7230925562718561, 0.00319453869075711, 0.04630885645037159, 0.300739701944024, 0.6942648284940931, 0.004856878150362495, 0.016642040881099187, 0.2907835026838765, 0.08666378133391413, 0.7956100694429238, 0.009647423377188484, 0.056837222376625425, 0.1539649841513276, 0.012491264047245884, 0.4874818348377423, 0.0014980824631169209, 0.004736518873878332, 0.005098428161344947, 0.835449961455652, 0.6171956413558274, 0.09325555657447542, 0.051665424379148034, 0.0020738263924793797, 0.2607084531467668, 0.0568314314966669, 0.8050484064843795, 0.004121529237974201, 0.19541949059158473, 0.0028189564088206566, 0.013063954388871398, 0.0035238579710847938, 0.063577292