In [1]:
import pandas as pd
import numpy as np
import operator
import sys

import logging

# Log line layout: timestamp, level, originating module, then the message.
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
# Configure the root logger with that layout and an INFO threshold.
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [2]:
# NOTE(review): these are absolute, machine-specific paths; adjust them
# before running the notebook on another machine.
# Input CSV files for the training and test splits.
train_file = "F:/db/tmp/data/lr/train.csv"
test_file = "F:/db/tmp/data/lr/test.csv"
# Destinations for the encoded training and test rows.
output_train_file = "F:/db/tmp/data/lr_test/output_train.txt"
output_test_file = "F:/db/tmp/data/lr_test/output_test.txt"
# Destination for the total encoded feature count ("feature_num=<n>").
output_feature_num_file = "F:/db/tmp/data/lr_test/feature_num.txt"

In [3]:
# Load the datasets (light preprocessing)
def get_input(input_train_file, input_test_file):
    '''
    Read the training and test CSV files and drop every row that has a
    missing value.

    Both files go through exactly the same parsing and cleaning steps, so
    the same markers count as "missing" in the two resulting frames.

    :param input_train_file:
        path (or file-like object) of the training CSV
    :param input_test_file:
        path (or file-like object) of the test CSV
    :return:
        tuple (train_data_df, test_data_df) of pd.DataFrame; the column at
        position 2 of the input files is not loaded
    '''
    # Columns that must be parsed as integers.
    dtype_dict = {
        "age": np.int32,
        "education-num": np.int32,  # years of education
        "capital-gain": np.int32,
        "capital-loss": np.int32,
        "hours-per-week": np.int32
    }

    # Load columns 0..14 except position 2, which is discarded:
    # [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    use_list = [col for col in range(15) if col != 2]

    # Cell contents treated as missing, shared by both files.
    na_markers = ["?", "NaN", "？"]

    def _read_clean(csv_file):
        # Parse one CSV and drop every row that contains a missing value.
        frame = pd.read_csv(csv_file,
                            sep=",",
                            header=0,
                            dtype=dtype_dict,
                            na_values=na_markers,
                            usecols=use_list)
        # na_values does not match cells that carry a leading space (" ?"),
        # so convert those explicitly before dropping incomplete rows.
        return frame.replace(' ?', np.nan).dropna(axis=0, how="any")

    train_data_df = _read_clean(input_train_file)
    test_data_df = _read_clean(input_test_file)

    return train_data_df, test_data_df

In [4]:
# Label handling
def label_trans(x):
    '''
    Map a raw label string to "0" or "1".

    :param x:
        raw label value; surrounding whitespace is ignored
    :return:
        "1" for ">50K" / ">50K.", "0" for everything else
        (including "<=50K" / "<=50K.")
    '''
    cleaned = x.strip()
    return "1" if cleaned in (">50K", ">50K.") else "0"

In [5]:
# Encode the label column
def process_label_feature(label_feature_str, df_in):
    '''
    Rewrite the label column of df_in by passing every value through
    label_trans.

    :param label_feature_str:
        name of the label column
    :param df_in:
        DataFrame whose label column is overwritten
    :return:
        None
    '''
    # apply runs label_trans on each element of the column.
    encoded_labels = df_in.loc[:, label_feature_str].apply(label_trans)
    df_in.loc[:, label_feature_str] = encoded_labels

In [6]:
# Rank the values of a discrete feature and record each one's position
def dict_trans(dict_in):
    '''
    Assign a position to every key, ordered by its value from largest to
    smallest.

    :param dict_in:
        key: str, a feature value
        value: int, the count associated with that key
    :return:
        dict
            key: str
            value: position of the key in the descending ordering (0-based)
    '''
    ranked = sorted(dict_in.items(), key=operator.itemgetter(1), reverse=True)
    return {name: position for position, (name, _count) in enumerate(ranked)}

In [7]:
def dis_to_feature(x, feature_dict):
    '''
    One-hot encode a single value as a comma-separated string.

    :param x:
        the value to encode
    :param feature_dict:
        mapping from value to its slot index
    :return:
        str such as "1,0,0"; all zeros when x is not a key of feature_dict
    '''
    # One slot per known key.
    slots = ["0"] * len(feature_dict)
    if x in feature_dict:
        slots[feature_dict[x]] = "1"
    return ",".join(slots)

In [8]:
# Encode a discrete feature
def process_dis_feature(feature_str, df_train, df_test):
    '''
    One-hot encode a discrete column in both frames.

    The set of known values and their slot order come from the training
    frame only, so both frames share the same encoding.

    :param feature_str:
        name of the column to encode
    :param df_train:
        training DataFrame (column is overwritten)
    :param df_test:
        test DataFrame (column is overwritten)
    :return:
        number of slots in the one-hot encoding
    '''
    # How often each value occurs in the training column.
    train_counts = df_train.loc[:, feature_str].value_counts().to_dict()

    # value -> slot index
    slot_of_value = dict_trans(train_counts)

    for frame in (df_train, df_test):
        frame.loc[:, feature_str] = frame.loc[:, feature_str].apply(dis_to_feature, args=(slot_of_value,))

    # Log a small sample of the encoded training column.
    logging.info('This is an info message %s, %s', df_train.loc[:3, feature_str], "onehot")

    # Width of this feature after encoding.
    return len(slot_of_value)

In [9]:
def list_trans(input_dict):
    '''
    Extract the bucket boundaries from a describe()-style summary dict.

    :param input_dict:
        e.g. {'count': 30162.0, 'mean': 38.437901995888865, 'std': 13.134664776856338, 'min': 17.0,
              '25%': 28.0, '50%': 37.0, '75%': 47.0, 'max': 90.0}
    :return:
        list of the five values [min, 25%, 50%, 75%, max], in that order
    :raises SystemExit:
        with an explanatory message (non-zero exit status) when any of
        the five keys is missing
    '''
    key_list = ["min", "25%", "50%", "75%", "max"]

    missing_keys = [key for key in key_list if key not in input_dict]
    if missing_keys:
        # Passing a message makes the exit status non-zero and tells the
        # user which statistics were absent.
        sys.exit("error list_trans: missing keys %s" % missing_keys)

    return [input_dict[key] for key in key_list]

In [10]:
def con_to_feature(x, feature_list):
    '''
    One-hot encode a numeric value by the interval it falls into.

    :param x:
        the value of one row
    :param feature_list:
        ordered interval boundaries; consecutive pairs form the intervals
    :return:
        str such as "1,0,0,0" with one slot per interval; the first
        interval (both ends inclusive) that contains x is marked, and the
        result is all zeros when no interval contains x
    '''
    interval_count = len(feature_list) - 1
    flags = ["0"] * interval_count

    # Walk consecutive boundary pairs and mark the first interval that holds x.
    for position, (low, high) in enumerate(zip(feature_list, feature_list[1:])):
        if low <= x <= high:
            flags[position] = "1"
            break

    return ",".join(flags)

In [11]:
# Encode a continuous feature
# The column is split into intervals at its min / 25% / 50% / 75% / max
# (e.g. 17.0, 28.0, 37.0, 47.0, 90.0) and then one-hot encoded by interval.
def process_con_feature(feature_str, df_train, df_test):
    '''
    Bucketize a continuous column in both frames and one-hot encode it.

    The interval boundaries come from the training frame only, so both
    frames share the same encoding.

    :param feature_str:
        name of the column to encode
    :param df_train:
        training DataFrame (column is replaced)
    :param df_test:
        test DataFrame (column is replaced)
    :return:
        number of intervals, i.e. the width of the encoding
    '''
    # Summary statistics of the training column, e.g.
    # {'count': 30162.0, 'mean': 38.437901995888865, 'std': 13.134664776856338,
    #  'min': 17.0, '25%': 28.0, '50%': 37.0, '75%': 47.0, 'max': 90.0}
    origin_dict = df_train[feature_str].describe().to_dict()

    feature_list = list_trans(origin_dict)

    # Replace the whole column rather than writing through .loc[:, col]:
    # the column starts out numeric and the encoded values are strings, and
    # an in-place .loc assignment of an incompatible dtype is deprecated in
    # recent pandas releases.
    df_train[feature_str] = df_train[feature_str].apply(con_to_feature, args=(feature_list,))
    df_test[feature_str] = df_test[feature_str].apply(con_to_feature, args=(feature_list,))

    # Width of this feature after encoding.
    return len(feature_list) - 1

In [12]:
def add(str_one, str_two):
    '''
    Cross two one-hot strings into a single one-hot string.

    The result has len(one) * len(two) slots, and the active slot is
        (active index in one) * len(two) + (active index in two).
    An input with no "1" is treated as if its active index were 0.

    :param str_one:
        e.g. "0,0,1,0"
    :param str_two:
        e.g. "1,0,0,0"
    :return:
        the crossed one-hot string, e.g. "0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0"
    '''
    def _active_index(slots):
        # Position of the first "1"; falls back to 0 when there is none.
        # Only the "not found" ValueError is handled, so unrelated errors
        # are no longer swallowed.
        try:
            return slots.index("1")
        except ValueError:
            return 0

    list_one = str_one.split(",")
    list_two = str_two.split(",")
    list_two_len = len(list_two)

    # The crossed width is the product of the two input widths.
    return_list = [0] * (len(list_one) * list_two_len)

    # Mark the slot that corresponds to the pair of active positions.
    return_list[_active_index(list_one) * list_two_len + _active_index(list_two)] = 1

    return ",".join([str(ele) for ele in return_list])

In [13]:
# Feature crossing
def combine_feature(feature_one, feature_two, new_feature, train_data_df, test_data_df, feature_num_dict):
    '''
    Add a crossed feature column, built from two already-encoded columns,
    to both frames.

    :param feature_one:
        name of the first column
    :param feature_two:
        name of the second column
    :param new_feature:
        name of the new crossed column
    :param train_data_df:
        training DataFrame (gains the new column)
    :param test_data_df:
        test DataFrame (gains the new column)
    :param feature_num_dict:
        mapping from feature name to its encoded width
    :return:
        width of the new feature: the product of the two input widths
    :raises SystemExit:
        with an explanatory message (non-zero exit status) when either
        input feature has no entry in feature_num_dict; the frames are
        left untouched in that case
    '''
    # Validate before doing any work so a failure neither wastes the
    # row-wise pass nor leaves the frames half-modified.
    for feature_name in (feature_one, feature_two):
        if feature_name not in feature_num_dict:
            sys.exit("error combine_feature: no dimension recorded for %r" % (feature_name,))

    # Build the new column row by row (axis=1 passes each row to the lambda).
    train_data_df[new_feature] = train_data_df.apply(lambda row: add(row[feature_one], row[feature_two]), axis=1)
    test_data_df[new_feature] = test_data_df.apply(lambda row: add(row[feature_one], row[feature_two]), axis=1)

    return feature_num_dict[feature_one] * feature_num_dict[feature_two]

In [14]:
# Write the rows of a DataFrame to a text file
def output_file(df_in, out_file):
    '''
    Write every row of df_in to out_file, one line per row, with the
    row's values converted to str and joined by commas.

    :param df_in:
        DataFrame to write
    :param out_file:
        destination path; an existing file is overwritten
    :return:
        None
    '''
    # The with-block closes the file even if a write fails part-way.
    with open(out_file, "w", encoding="utf-8") as fw:
        # Iterate over the row arrays directly; this avoids a label lookup
        # per row and does not depend on the index labels being unique.
        for row_values in df_in.values:
            fw.write(",".join([str(ele) for ele in row_values]) + "\n")

In [15]:
def ana_train_data(input_train_data, input_test_data, out_train_file, out_test_file, feature_num_file,
                   out_test_no_label_file="F:/db/tmp/data/lr/output_test_no_label.txt"):
    '''
    Run the whole preprocessing pipeline: load both CSVs, encode the label,
    one-hot encode the discrete and continuous features, add two crossed
    features, and write the results out. The training and test sets go
    through the same steps.

    :param input_train_data:
        path of the training CSV
    :param input_test_data:
        path of the test CSV
    :param out_train_file:
        destination of the encoded training rows
    :param out_test_file:
        destination of the encoded test rows (label included)
    :param feature_num_file:
        destination of the total feature count, written as "feature_num=<n>"
    :param out_test_no_label_file:
        destination of the encoded test rows without the label column;
        defaults to the location this function has always written to
    :return:
        None
    '''
    # Load both files.
    train_data_df, test_data_df = get_input(input_train_data, input_test_data)

    label_feature_str = "label"

    # Discrete features
    dis_feature_list = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

    # Continuous features
    con_feature_list = ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]

    # Output order of the original features
    index_list = ["age", "workclass", "education", "education-num", "marital-status", "occupation", "relationship",
                  "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country"]

    # Encode the label in both sets.
    process_label_feature(label_feature_str, train_data_df)
    process_label_feature(label_feature_str, test_data_df)

    # Total encoded width of the discrete features
    dis_feature_num = 0
    # Total encoded width of the continuous features
    con_feature_num = 0

    # Encoded width of every individual feature; needed later because a
    # crossed feature's width is the product of its two inputs' widths.
    feature_num_dict = {}

    # Discrete features
    for dis_feature in dis_feature_list:
        tmp_feature_num = process_dis_feature(dis_feature, train_data_df, test_data_df)
        dis_feature_num += tmp_feature_num
        feature_num_dict[dis_feature] = tmp_feature_num

    # Continuous features
    for con_feature in con_feature_list:
        tmp_feature_num = process_con_feature(con_feature, train_data_df, test_data_df)
        con_feature_num += tmp_feature_num
        feature_num_dict[con_feature] = tmp_feature_num

    # Crossed features
    # 1) age x capital-gain
    new_feature_len = combine_feature("age", "capital-gain", "age_gain", train_data_df, test_data_df, feature_num_dict)
    # 2) capital-gain x capital-loss
    new_feature_len_two = combine_feature("capital-gain", "capital-loss", "loss_gain", train_data_df, test_data_df, feature_num_dict)

    # Put the crossed features and the label at the end.
    feature_columns = index_list + ["age_gain", "loss_gain"]
    train_data_df = train_data_df.reindex(columns=feature_columns + ["label"])
    test_data_df = test_data_df.reindex(columns=feature_columns + ["label"])

    # Simulates real-world data, which has no label.
    test_data_df_no_label = test_data_df.reindex(columns=feature_columns)

    # Training rows
    output_file(train_data_df, out_train_file)
    # Test rows
    output_file(test_data_df, out_test_file)
    # Test rows with the label removed
    output_file(test_data_df_no_label, out_test_no_label_file)

    # Total feature count (discrete + continuous + crossed). Training and
    # prediction can read it from this file, so they keep working when the
    # count changes.
    total_feature_num = dis_feature_num + con_feature_num + new_feature_len + new_feature_len_two
    with open(feature_num_file, "w", encoding="utf-8") as fw:
        fw.write("feature_num=" + str(total_feature_num))

    # Widths of the discrete, continuous and the two crossed feature groups
    print(dis_feature_num)
    print(con_feature_num)
    print(new_feature_len)
    print(new_feature_len_two)

In [16]:
# Run the preprocessing pipeline on the configured input and output paths.
ana_train_data(train_file, test_file, output_train_file, output_test_file, output_feature_num_file)

[2020-01-06 17:27:43,071] INFO in <ipython-input-8-f41217cdb90b>: This is an info message 0    0,0,0,1,0,0,0
1    0,1,0,0,0,0,0
2    1,0,0,0,0,0,0
3    1,0,0,0,0,0,0
Name: workclass, dtype: object, onehot
[2020-01-06 17:27:43,267] INFO in <ipython-input-8-f41217cdb90b>: This is an info message 0    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2    1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3    0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Name: education, dtype: object, onehot
[2020-01-06 17:27:43,374] INFO in <ipython-input-8-f41217cdb90b>: This is an info message 0    0,1,0,0,0,0,0
1    1,0,0,0,0,0,0
2    0,0,1,0,0,0,0
3    1,0,0,0,0,0,0
Name: marital-status, dtype: object, onehot
[2020-01-06 17:27:43,539] INFO in <ipython-input-8-f41217cdb90b>: This is an info message 0    0,0,0,1,0,0,0,0,0,0,0,0,0,0
1    0,0,1,0,0,0,0,0,0,0,0,0,0,0
2    0,0,0,0,0,0,0,0,1,0,0,0,0,0
3    0,0,0,0,0,0,0,0,1,0,0,0,0,0
Name: occupation, dtype: object, onehot
[2020-01-06 17:27:43,635] INFO in <ipython-inp

98
20
16
16
