In [1]:
import os
import time
import pandas as pd
import numpy as np
import functools

from multiprocessing import Pool
from functools import reduce

In [2]:
def time_pass(func):
    """Decorator that reports the wall-clock duration of every call to ``func``.

    After the wrapped call returns, one line of the form ``<name>: H:M:S`` is
    printed, where H, M and S are the whole hours, minutes and seconds that
    elapsed (truncated, not zero-padded). The wrapped function's return value
    is passed through unchanged. Nothing is printed if the call raises.
    """
    @functools.wraps(func)
    def wrapper(*args, **kw):
        started_at = time.time()
        outcome = func(*args, **kw)
        elapsed = time.time() - started_at

        # Peel off seconds first, then split the remaining minutes into hours.
        whole_minutes, secs = divmod(elapsed, 60)
        hrs, mins = divmod(whole_minutes, 60)
        print('%s: %s:%s:%s' % (func.__name__, int(hrs), int(mins), int(secs)))
        return outcome

    return wrapper


@time_pass
def read_big_csv(path):
    """Load a CSV file in 10000-row chunks and return it as one DataFrame.

    path: location of the CSV, handed to ``pandas.read_csv`` as-is.

    Returns the row-wise concatenation of all chunks with a fresh
    0..n-1 index. The call is timed by the ``time_pass`` decorator.
    """
    chunk_iter = pd.read_csv(path, chunksize=10000)
    return pd.concat(chunk_iter, axis=0, ignore_index=True)


def cal_dup_ratio(series, n):
    """
    Measure how often one person transfers to another across different months
    (translated from the original note), as a distribution of repeat counts.

    series: one entry per observed edge; each distinct value is one counterparty.
    n: denominator used to turn counts into ratios.

    Returns a length-4 float array ``r`` where, for k in 1..3, ``r[k]`` is
    (number of distinct values occurring exactly k times) / n, and ``r[0]``
    is ``1 - (r[1] + r[2] + r[3])``.
    """
    occurrences_per_value = series.value_counts().values
    # How many distinct values share each occurrence count.
    histogram = pd.Series(occurrences_per_value).value_counts()

    ratios = np.zeros(4)
    for repeat in histogram.index:
        if repeat <= 3:
            ratios[repeat] = histogram[repeat] / n

    # Slot 0 is still zero here, so this stores whatever mass slots 1..3 left over.
    ratios[0] = 1 - np.sum(ratios)
    return ratios

In [59]:
def _weight_quantiles(weights, percentiles):
    """Return [min, <each requested percentile>, max] of ``weights``; all NaN when it is empty."""
    if len(weights) == 0:
        return [np.nan] * (len(percentiles) + 2)
    return ([np.min(weights)]
            + [np.percentile(weights, q) for q in percentiles]
            + [np.max(weights)])


def cal_both(id_loc):
    """
    Compute the edge-statistics feature row for a single id.

    id_loc: sequence ``[id, start_from, stop_from, start_to, stop_to]``.
        ``start_from:stop_from`` is the positional row range of this id in the
        module-level ``one_step_son`` frame (its outgoing edges) and
        ``start_to:stop_to`` the range in ``one_step_father`` (its incoming
        edges). Bounds are cast with ``int``; an empty range (e.g. 0:0) means
        "no edges on that side". ``id_loc[0]`` itself is not read here.

    Returns a flat list of 26 values, in this order:
        both_dup_ratio_3, in_dup_ratio_3, length, unique_count, multi_ratio,
        weight_0, weight_25, weight_50, weight_100,
        value_counts_ratio_0 .. value_counts_ratio_9,
        in_weight_0, in_weight_25, in_weight_50, in_weight_100,
        out_weight_0, out_weight_50, out_weight_100
    Statistics that cannot be computed because a side has no rows are NaN.
    """
    out_edges = one_step_son.iloc[int(id_loc[1]):int(id_loc[2])][['to_id', 'times_sum', 'weight_sum']].copy()
    in_edges = one_step_father.iloc[int(id_loc[3]):int(id_loc[4])][['from_id', 'times_sum', 'weight_sum']].copy()

    # NOTE(review): eval() runs arbitrary expressions taken from the CSV data.
    # Kept because the exact string format of 'times_sum' is not visible here;
    # switch to a strict numeric parser once the format is confirmed.
    if out_edges['times_sum'].dtype == 'object':
        out_edges['times_sum'] = list(map(eval, out_edges['times_sum']))

    if in_edges['times_sum'].dtype == 'object':
        in_edges['times_sum'] = list(map(eval, in_edges['times_sum']))

    # Column subsets must be selected with a list: tuple-style selection
    # (``[...]['a', 'b']``) is rejected by pandas >= 2.0.
    agg_out = out_edges.groupby('to_id')[['times_sum', 'weight_sum']].sum()
    agg_in = in_edges.groupby('from_id')[['times_sum', 'weight_sum']].sum()

    # NOTE(review): a counterparty that appears in both directions contributes
    # two rows here (one per direction), so len(agg_dat) can exceed the number
    # of distinct counterparties - confirm this is intended.
    agg_dat = pd.concat([agg_out, agg_in], axis=0)

    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    counterparties = pd.concat([out_edges['to_id'], in_edges['from_id']])
    if len(counterparties) > 0:
        both_dup_ratio_3 = cal_dup_ratio(counterparties, len(agg_dat))[3]
    else:
        both_dup_ratio_3 = np.nan

    if len(in_edges['from_id']) > 0:
        in_dup_ratio_3 = cal_dup_ratio(in_edges['from_id'], len(agg_in))[3]
    else:
        in_dup_ratio_3 = np.nan

    # Start computing the features.
    length = len(counterparties)
    unique_count = len(agg_dat)
    if length > 0:
        multi_ratio = (length - unique_count) / length
    else:
        multi_ratio = np.nan

    # Distribution of summed 'times_sum' over buckets 0..9. Bucket 0 is
    # overwritten with the number of rows that fell outside buckets 0..9;
    # buckets 1..9 with no rows stay NaN.
    times = agg_dat['times_sum']
    value_counts_0_9 = times.value_counts().reindex(list(range(10)))
    value_counts_0_9[0] = len(times) - np.nansum(value_counts_0_9)
    value_counts_ratio = value_counts_0_9 / len(times)

    result = ([both_dup_ratio_3, in_dup_ratio_3, length, unique_count, multi_ratio]
              + _weight_quantiles(agg_dat['weight_sum'], (25, 50))
              + list(value_counts_ratio)
              + _weight_quantiles(agg_in['weight_sum'], (25, 50))
              # The outgoing side reports no 25th percentile (kept as in the original layout).
              + _weight_quantiles(agg_out['weight_sum'], (50,)))

    return result

In [3]:
input_path = './'
# Id tables read with pandas.read_table; the notes on each line are
# translated from the original comments.
sample_train = pd.read_table(os.path.join(input_path, "open_data/sample_train.txt"))  # training set, about 19k
valid_id = pd.read_table(os.path.join(input_path, "open_data/valid_id.txt"))  # validation set
test_id = pd.read_table(os.path.join(input_path, "open_data/test_id.txt"))  # test set

file_names = ['dat_edge_feature_%s.csv' % str(x) for x in range(1, 12)]
# Read every shard first and concatenate once. The previous pairwise
# DataFrame.append fold re-copied the accumulated frame for each shard, and
# DataFrame.append no longer exists in pandas >= 2.0. As before, each shard
# keeps its own row index in the combined frame.
dat_edge_feature = pd.concat([read_big_csv('./output/dat_edge_feature/%s' % z) for z in file_names])

son = pd.read_csv('./output/son.csv')
father = pd.read_csv('./output/father.csv')

read_big_csv: 0:0:18
read_big_csv: 0:0:16
read_big_csv: 0:0:20
read_big_csv: 0:0:20
read_big_csv: 0:0:17
read_big_csv: 0:0:17
read_big_csv: 0:0:17
read_big_csv: 0:0:17
read_big_csv: 0:0:17
read_big_csv: 0:0:17
read_big_csv: 0:0:1


In [4]:
%%time
# Ids of interest: the union of son.to_id and father.from_id.
one_step_ids = set(son['to_id']) | set(father['from_id'])
one_step = pd.DataFrame({'id': list(one_step_ids)})

# Inner joins: edges whose source (from_id) is one of those ids, and edges
# whose target (to_id) is one of those ids.
one_step_son = one_step.merge(dat_edge_feature, left_on='id', right_on='from_id')
one_step_father = one_step.merge(dat_edge_feature, left_on='id', right_on='to_id')

CPU times: user 2min 43s, sys: 1min 14s, total: 3min 58s
Wall time: 3min 58s


In [5]:
# Unbind the full edge table; it is not referenced again below, so its memory
# can be reclaimed once no other reference remains.
del dat_edge_feature

In [6]:
%%time
# Order both join results by id and renumber rows 0..n-1, so that every id
# occupies one contiguous positional row range.
one_step_son = one_step_son.sort_values(by='id').reset_index(drop=True)
one_step_father = one_step_father.sort_values(by='id').reset_index(drop=True)

CPU times: user 20.6 s, sys: 42 s, total: 1min 2s
Wall time: 1min 2s


In [7]:
%%time
# Rows per id, ordered by id, then accumulated: the running total for an id is
# the (exclusive) end offset of that id's rows in the id-sorted frame.
son_rows_per_id = one_step_son['id'].value_counts(sort=False)
one_step_son_id_counts = son_rows_per_id.sort_index().cumsum()

father_rows_per_id = one_step_father['id'].value_counts(sort=False)
one_step_father_id_counts = father_rows_per_id.sort_index().cumsum()

CPU times: user 4.41 s, sys: 1.4 s, total: 5.8 s
Wall time: 5.8 s


In [8]:
%%time
def _slice_bounds(cum_counts, start_col, stop_col):
    """Turn cumulative per-id row counts into a table of [start, stop) row offsets.

    cum_counts: Series indexed by id whose values are running row totals.
    start_col / stop_col: names of the two offset columns in the result.

    Works on NumPy arrays directly (no Python-list round trip) and returns an
    empty table for an empty input instead of raising a length mismatch.
    """
    stops = cum_counts.values
    # Each id starts where the previous id stopped; the first id starts at row 0.
    starts = np.zeros_like(stops)
    starts[1:] = stops[:-1]
    return pd.DataFrame({'id': cum_counts.index.values,
                         start_col: starts,
                         stop_col: stops})


id_loc_info_from = _slice_bounds(one_step_son_id_counts, 'start_from', 'stop_from')
id_loc_info_to = _slice_bounds(one_step_father_id_counts, 'start_to', 'stop_to')

CPU times: user 32.8 s, sys: 88 ms, total: 32.9 s
Wall time: 32.9 s


In [9]:
%%time
# Outer join keeps ids that have rows on only one side; their missing bounds
# come back as NaN and are replaced by 0, i.e. an empty 0:0 row range.
bound_columns = ['id', 'start_from', 'stop_from', 'start_to', 'stop_to']
id_loc_info = (id_loc_info_from
               .merge(id_loc_info_to, on='id', how='outer')[bound_columns]
               .fillna(0))

CPU times: user 1.57 s, sys: 0 ns, total: 1.57 s
Wall time: 1.59 s


In [10]:
# One NumPy row per id, laid out as [id, start_from, stop_from, start_to, stop_to];
# this list is the iterable handed to Pool.map below.
id_loc_s = list(id_loc_info.values)  # build a list in preparation for the parallel step

In [74]:
%%time
# Fan the per-id feature computation out over 20 worker processes; map()
# blocks until done and returns the results in the order of id_loc_s.
with Pool(processes=20) as pool:
    feature_7 = pool.map(cal_both, id_loc_s)

CPU times: user 1min 9s, sys: 9.2 s, total: 1min 18s
Wall time: 53min 2s


In [75]:
%%time

# Column names in the exact order in which cal_both lays out each result row.
summary_names = ['both_dup_ratio_3', 'in_dup_ratio_3', 'length', 'unique_count', 'multi_ratio']
weight_names = ['weight_0', 'weight_25', 'weight_50', 'weight_100']
bucket_names = ['value_counts_ratio_%s' % str(bucket) for bucket in range(10)]
in_weight_names = ['in_weight_0', 'in_weight_25', 'in_weight_50', 'in_weight_100']
out_weight_names = ['out_weight_0', 'out_weight_50', 'out_weight_100']
columns = summary_names + weight_names + bucket_names + in_weight_names + out_weight_names

# Rows of feature_7 follow the row order of id_loc_info, so the ids line up
# by position (both frames carry a default 0..n-1 index).
feature_7_df = pd.DataFrame(feature_7, columns=columns).assign(id=id_loc_info['id'])

CPU times: user 1min 7s, sys: 388 ms, total: 1min 8s
Wall time: 1min 7s


In [76]:
# Persist the feature table without the row index.
feature_7_df.to_csv(path_or_buf='./output/feature_7_df.csv', index=False)