In [1]:
'''
树模型样本选择与特征选择
GBDT不需要对连续特征作处理，只需要将离散特征向量化(01编码)即可
'''

import pandas as pd
import numpy as np
import operator
import sys

In [2]:
# Input CSV files: the training set and the test set.
train_file = "F:/db/tmp/data/lr/train.csv"
test_file = "F:/db/tmp/data/lr/test.csv"
# Output files now go to the gbdt_test directory:
# the encoded training samples, the encoded test samples, and the feature-count file.
output_train_file = "F:/db/tmp/data/gbdt_test/output_train.txt"
output_test_file = "F:/db/tmp/data/gbdt_test/output_test.txt"
output_feature_num_file = "F:/db/tmp/data/gbdt_test/feature_num.txt"

In [3]:
def get_input(input_train_file, input_test_file):
    '''
    Load the training and test CSV files and drop every incomplete sample.

    Both files are read with exactly the same settings so that the two sets
    are cleaned consistently.

    :param input_train_file:
        path of the training CSV file (first row is the header)
    :param input_test_file:
        path of the test CSV file (first row is the header)
    :return:
        pd.DataFrame.train_data
        pd.DataFrame.test_data
    '''
    # Only the integer columns are declared; every other column is read as a string.
    dtype_dict = {
        "age": np.int32,
        "education-num": np.int32, # years of education
        "capital-gain": np.int32,
        "capital-loss": np.int32,
        "hours-per-week": np.int32
    }

    # Keep columns 0..14 except the third one (index 2).
    use_list = [col_index for col_index in range(15) if col_index != 2]

    train_data_df = _read_and_drop_missing(input_train_file, dtype_dict, use_list)
    test_data_df = _read_and_drop_missing(input_test_file, dtype_dict, use_list)

    return train_data_df, test_data_df


def _read_and_drop_missing(csv_path, dtype_dict, use_list):
    '''
    Read one CSV file and drop every row that contains a missing value.

    :param csv_path:
        path of the CSV file (first row is the header)
    :param dtype_dict:
        column name -> dtype mapping handed to pandas
    :param use_list:
        positional indices of the columns to keep
    :return:
        pd.DataFrame without NaN in any kept column
    '''
    data_df = pd.read_csv(csv_path,
                          sep = ",",
                          header = 0,
                          dtype = dtype_dict,
                          na_values = ["?", "NaN", "？"],
                          usecols = use_list)

    # Sample selection: na_values does not catch the ' ?' marker (note the
    # leading space), so it is mapped to NaN explicitly before rows are dropped.
    return data_df.replace(' ?', np.nan).dropna(axis = 0, how = "any")

In [4]:
# label处理
def label_trans(x):
    '''
    Map a raw label value onto the string "0" or "1".

    :param x:
        raw label text; surrounding whitespace is ignored
    :return:
        "1" when the label is ">50K" or ">50K.",
        "0" for "<=50K", "<=50K." and any other value
    '''
    cleaned = x.strip()
    return "1" if cleaned in (">50K", ">50K.") else "0"

In [5]:
# 处理label特征
def process_label_feature(label_feature_str, df_in):
    '''
    Convert the label column of a DataFrame in place with label_trans.

    :param label_feature_str:
        name of the label column
    :param df_in:
        DataFrame whose label column is overwritten
    :return:
        None
    '''
    raw_labels = df_in.loc[:, label_feature_str]
    # apply runs label_trans on every element of the column
    df_in.loc[:, label_feature_str] = raw_labels.apply(label_trans)

In [6]:
# 排序并记录位置
def dict_trans(dict_in):
    '''
    Rank the keys of a dict by descending value.

    :param dict_in:
        key: str
        value: int
    :return:
        dict
            key: str
            value: position of the key after sorting (0 for the largest value);
                   keys with equal values keep their original relative order
    '''
    ranked_keys = sorted(dict_in, key = dict_in.get, reverse = True)
    return {name: rank for rank, name in enumerate(ranked_keys)}

In [7]:
def dis_to_feature(x, feature_dict):
    '''
    One-hot encode a single discrete value as a comma-separated string.

    :param x:
        the raw category value
    :param feature_dict:
        category -> position mapping; its size is the width of the encoding
    :return:
        str such as "1,0,0"; all zeros when x is not a key of feature_dict
    '''
    # One slot per known category.
    slots = ["0"] * len(feature_dict)
    if x in feature_dict:
        slots[feature_dict[x]] = "1"
    return ",".join(slots)

In [8]:
# 处理离散特征
def process_dis_feature(feature_str, df_train, df_test):
    '''
    One-hot encode one discrete column of the training and test frames in place.

    The categories and their positions are taken from the training frame only
    (most frequent category first), so both frames share the same encoding.
    A test value that never occurs in the training frame becomes all zeros.

    :param feature_str:
        name of the discrete column
    :param df_train:
        training DataFrame, modified in place
    :param df_test:
        test DataFrame, modified in place
    :return:
        number of dimensions of the encoding for this column
    '''
    # Count how often each category occurs in the training set.
    category_counts = df_train.loc[:, feature_str].value_counts().to_dict()
    feature_dict = dict_trans(category_counts)

    # Training frame first, then test frame, both with the same mapping.
    for frame in (df_train, df_test):
        encoded = frame.loc[:, feature_str].apply(dis_to_feature, args = (feature_dict, ))
        frame.loc[:, feature_str] = encoded

    return len(feature_dict)

In [9]:
# 将dataframe的数据写入文件当中
def output_file(df_in, out_file):
    '''
    Write a DataFrame to a text file, one comma-joined line per row.

    :param df_in:
        DataFrame to write; every cell is converted with str()
    :param out_file:
        path of the output file; it is created or truncated and written as UTF-8
    :return:
        None
    '''
    # The context manager closes the file even when writing a row raises.
    with open(out_file, "w", encoding = "utf-8") as fw:
        # Walk the rows by position instead of looking them up by index label,
        # so repeated index labels still produce exactly one line per row.
        for row_values in df_in.values:
            fw.write(",".join([str(ele) for ele in row_values]) + "\n")

In [10]:
def ana_train_data(input_train_data, input_test_data, out_train_file, out_test_file, feature_num_file):
    '''
    Preprocess the training and test sets identically and write the results.

    The label column is converted with process_label_feature, every discrete
    feature is one-hot encoded with process_dis_feature, and the continuous
    features are left untouched.

    :param input_train_data:
        path of the training CSV file
    :param input_test_data:
        path of the test CSV file
    :param out_train_file:
        path of the encoded training output file
    :param out_test_file:
        path of the encoded test output file
    :param feature_num_file:
        path of the file that receives the total feature count
    :return:
        None
    '''
    # Read both files.
    train_data_df, test_data_df = get_input(input_train_data, input_test_data)
    label_feature_str = "label"
    # Discrete features that need one-hot encoding.
    dis_feature_list = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
    # Continuous features.
    con_feature_list = ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]

    process_label_feature(label_feature_str, train_data_df)
    process_label_feature(label_feature_str, test_data_df)

    # Total number of dimensions produced by encoding the discrete features.
    dis_feature_num = 0
    for dis_feature in dis_feature_list:
        dis_feature_num += process_dis_feature(dis_feature, train_data_df, test_data_df)

    # Continuous features are not processed, so each contributes one dimension.
    con_feature_num = len(con_feature_list)
    print(train_data_df.shape)
    print(test_data_df.shape)

    # Write the training samples, then the test samples.
    output_file(train_data_df, out_train_file)
    output_file(test_data_df, out_test_file)

    # Persist the feature count (discrete + continuous) so later training and
    # prediction steps can read it instead of hard-coding it.
    # The context manager closes the file even if the write fails.
    with open(feature_num_file, "w") as fw:
        fw.write("feature_num=" + str(dis_feature_num + con_feature_num))

    # Dimensions of the discrete and of the continuous features.
    print(dis_feature_num)
    print(con_feature_num)

In [11]:
if __name__ == "__main__":
    # Run the whole preprocessing pipeline with the module-level input and output paths.
    ana_train_data(train_file, test_file, output_train_file, output_test_file, output_feature_num_file)

(30162, 14)
(15060, 14)
98
5
