In [1]:
import numpy as np
# 使用含有交叉验证的lr
from sklearn.linear_model import LogisticRegressionCV as LRCV
# 将模型整体实体化输出
from sklearn.externals import joblib



In [2]:
# Locations used by the LR training run, defined together so the whole
# data layout can be seen (and changed) in one place.
output_train_file, model_coef_file, model_dump_file = (
    "F:/db/tmp/data/lr_test/output_train.txt",  # comma-separated training data
    "F:/db/tmp/data/lr_test/lr_coef",           # learned coefficients (text)
    "F:/db/tmp/data/lr_test/lr_dump",           # serialized fitted model
)

In [21]:
def train_lr_mode(train_file, model_coef, model_file, total_feature_num=150):
    """Train a cross-validated logistic regression and persist the result.

    The training file is a comma-separated table whose first
    ``total_feature_num`` columns are the features and whose last column is
    the label; both are parsed as 32-bit integers. The model is fitted twice
    with 5-fold cross-validation: once with the estimator's default scoring
    (printed as "Accuracy") and once with ``scoring="roc_auc"`` (printed as
    "AUC"). The second, AUC-scored fit is the one that gets saved.

    :param train_file:
        Path of the comma-separated training data.
    :param model_coef:
        Path the learned coefficients are written to, as a single
        comma-separated line.
    :param model_file:
        Path the fitted estimator is dumped to via ``joblib.dump``.
    :param total_feature_num:
        Number of leading columns used as features. Defaults to 150, the
        value that used to be hard-coded.
    :return:
        None. Results are printed and written to ``model_coef`` / ``model_file``.
    :raises ValueError:
        If ``total_feature_num`` is smaller than 1.
    """
    if total_feature_num < 1:
        raise ValueError(
            "total_feature_num must be >= 1, got %r" % (total_feature_num,)
        )

    train_feature, train_label = _load_training_data(train_file, total_feature_num)

    # First pass: cross-validation with the default scoring.
    _fit_and_report(train_feature, train_label, "Accuracy")
    # Second pass: cross-validation scored by ROC AUC; this fit is persisted.
    lr_cf = _fit_and_report(train_feature, train_label, "AUC", scoring="roc_auc")

    # Write the coefficient vector of the first (index 0) row of coef_.
    coef = lr_cf.coef_[0]
    # The context manager guarantees the handle is closed even if the write fails.
    with open(model_coef, "w") as fw:
        fw.write(",".join(str(ele) for ele in coef))

    # Serialize the whole fitted estimator so it can be reloaded later.
    # NOTE(review): `joblib` comes from the file-level
    # `from sklearn.externals import joblib`, which newer scikit-learn
    # releases no longer provide -- switch that import to the standalone
    # `joblib` package if it fails.
    joblib.dump(lr_cf, model_file)


def _load_training_data(train_file, feature_count):
    """Read the feature matrix (first ``feature_count`` columns) and the label column (last)."""
    # Only the last column holds the label.
    labels = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    # Everything before it, up to feature_count columns, is feature data.
    features = np.genfromtxt(
        train_file,
        dtype=np.int32,
        delimiter=",",
        usecols=list(range(feature_count)),
    )
    return features, labels


def _fit_and_report(features, labels, metric_name, scoring=None):
    """Fit a 5-fold cross-validated L2 logistic regression and print its CV scores."""
    # Cs: candidate regularization values, tol: stopping tolerance,
    # max_iter: iteration cap, cv=5: five folds (each fold serves once as the
    # 20% held-out part). The solver is left at the library default.
    # Cs=[1] was kept after an earlier comparison of Cs=[1, 10, 100], where
    # the first value gave the best mean AUC (0.89908 vs 0.89886 / 0.89869).
    model = LRCV(
        Cs=[1],
        penalty="l2",
        tol=0.0001,
        max_iter=500,
        cv=5,
        scoring=scoring,
    ).fit(features, labels)

    # scores_ is a mapping; its first value is a grid with one row per fold
    # and one column per entry of Cs.
    fold_scores = list(model.scores_.values())[0]

    # Mean cross-validation score for each regularization value.
    print("diff: %s" % (",".join([str(ele) for ele in fold_scores.mean(axis=0)])))
    # Overall mean, with twice the standard deviation as the spread.
    print("%s: %s (+-%0.2f)" % (metric_name, fold_scores.mean(), fold_scores.std() * 2))
    return model

In [22]:
# Run the training pipeline against the paths configured above.
train_lr_mode(
    train_file=output_train_file,
    model_coef=model_coef_file,
    model_file=model_dump_file,
)

diff: 0.8424179155024063
Accuracy: 0.8424179155024063 (+-0.01)
diff: 0.8990815542769827
AUC: 0.8990815542769827 (+-0.01)
