In [None]:
import time
import datetime
import gc
from collections import Counter

import pandas as pd
import numpy as np

In [None]:
# Load the day-7 data produced by Step 2.
data = pd.read_csv('../Temp/train_day_7_test_step_2.csv')

### 特征组合与统计函数

#### 以下四个函数实现的功能说明，以feats=['user_id', 'item_id']为例
- count_size()计算某个user看过多少个item，比如某个user看了代号分别为1,2,2,3,3的item，则计算出该user看了5个item;
- all_unique_count_size()计算某个user看过的所有item中，有多少个不同的item，上例中只有1,2,3这三个不同的item，故计算出为3；
- single_count_size()计算某个user看某个item看了多少次，比如上例中该user看了代号为3的item两次，故该user该item对应的数值为2
- count_funcs_compute()把上述三个函数融合起来计算，periods参数指明需要统计的时间段，比如是一天还是一小时等；
- count_funcs_compute()还会用后两个统计量分别除以count_size()的结果，得到其占总个数的比例（新列名为“统计量列名/size列名”）；在上面的例子中，all_unique_count_size/count_size = 3/5，代号为3的item对应的single_count_size/count_size = 2/5

In [None]:
def count_size(feats, df):
    """Attach the number of rows in each group keyed by ``feats[:-1]``.

    Example with ``feats=['user_id', 'item_id']``: a user whose rows carry
    the items 1, 2, 2, 3, 3 gets the value 5 on each of those rows.

    Args:
        feats: Column names to combine. Every column except the last is a
            grouping key; the last one is the column being counted.
        df: Input DataFrame containing all columns in ``feats``.

    Returns:
        A ``(df, name)`` tuple: ``df`` left-merged with the new count column,
        and ``name``, the new column's name (``'_'.join(feats) + '_size'``).
    """
    name = '_'.join(feats) + '_size'
    keys = list(feats[:-1])
    # Aggregate from the ``df`` argument, not from a module-level frame, so
    # the result always describes the data the caller passed in.
    temp = df.groupby(keys)[feats[-1]].size().reset_index(name=name)
    df = pd.merge(df, temp, how='left', on=keys)
    return df, name

def all_unique_count_size(feats, df):
    """Attach the number of distinct ``feats[-1]`` values per group.

    Groups are keyed by ``feats[:-1]``. Example with
    ``feats=['user_id', 'item_id']``: a user whose rows carry the items
    1, 2, 2, 3, 3 gets the value 3 on each of those rows.

    Args:
        feats: Column names to combine; the last one is the column whose
            distinct values are counted, the others are grouping keys.
        df: Input DataFrame containing all columns in ``feats``.

    Returns:
        A ``(df, name)`` tuple: ``df`` left-merged with the new column, and
        ``name`` (``'_'.join(feats) + '_all_unique_size'``).
    """
    group_keys = feats[:-1]
    target = feats[-1]
    name = '_'.join(feats) + '_all_unique_size'

    # Distinct-value count of the target column within each key group.
    distinct = df[feats].groupby(group_keys)[target].nunique()
    lookup = distinct.rename(name).reset_index()

    merged = pd.merge(df, lookup, how='left', on=group_keys)
    return merged, name

def single_count_size(feats, df):
    """Attach the number of rows sharing the exact ``feats`` combination.

    Example with ``feats=['user_id', 'item_id']``: a user whose rows carry
    the items 1, 2, 2, 3, 3 gets the value 2 on the rows for item 3.

    Args:
        feats: Column names to combine; all of them are grouping keys.
        df: Input DataFrame containing all columns in ``feats``.

    Returns:
        A ``(df, name)`` tuple: ``df`` left-merged with the new count column,
        and ``name`` (``'_'.join(feats) + '_single_size'``).
    """
    name = '_'.join(feats) + '_single_size'
    keys = list(feats)
    # Aggregate from the ``df`` argument, not from a module-level frame, so
    # the result always describes the data the caller passed in.
    temp = df.groupby(keys).size().reset_index(name=name)
    df = pd.merge(df, temp, how='left', on=keys)
    return df, name

In [None]:
def count_funcs_compute(funcs, periods, feats, df):
    """Run every statistic in ``funcs`` at each time granularity.

    For the i-th period the grouping columns are ``periods[:i+1] + feats``,
    i.e. the time key becomes progressively finer (for example day, then
    day+hour, ...). After the statistics of one granularity are computed,
    each result after the first is divided by the first one and stored in a
    column named ``'<stat name>/<first stat name>'``.

    Args:
        funcs: Callables taking ``(columns, df)`` and returning
            ``(df, new_column_name)``. The first one provides the ratio
            denominator, so the size-style count should come first.
        periods: Sequence of time column names, coarsest first.
        feats: Sequence of feature column names to combine.
        df: Input DataFrame.

    Returns:
        The DataFrame with all statistic and ratio columns added.
    """
    # Copy into lists so tuples (or other sequences) are accepted and the
    # caller's sequences are never mutated.
    periods = list(periods)
    feats = list(feats)
    for depth, unit in enumerate(periods, start=1):
        print('Start computing in one {0}...'.format(unit))
        group_cols = periods[:depth] + feats
        names = []
        for func in funcs:
            df, name = func(group_cols, df)
            names.append(name)
        # Ratio of each later statistic to the first (the total count).
        for name in names[1:]:
            df[name + '/' + names[0]] = df[name] / df[names[0]]
        print('Computing in one {0} is over...'.format(unit))
    return df

### 进行特征组合与统计

In [None]:
# Feature groups to combine and compute statistics for. Each inner list is
# passed as ``feats`` to count_funcs_compute; the '#' rows only separate
# thematically related groups.
feats_list = [
    ['user_id'],
    ['item_id'],
    ['user_id', 'item_id'],
    ['shop_id', 'item_id'],
    ['user_id', 'shop_id', 'item_id'],
    ###################################
    ['item_brand_id'],
    ['item_city_id'],
    ['item_brand_id', 'item_id'],
    ['item_city_id', 'item_id'],
    ['item_brand_id', 'item_city_id',  'item_id'],
    ####################################
    ['user_age_level', 'item_id'],
    ['user_gender_id', 'item_id'],
    ['user_gender_id', 'user_age_level',  'item_id'],
    ####################################
    ['user_id', 'item_brand_id'],
    ['user_id', 'item_city_id'],
    ['user_id', 'item_city_id', 'item_brand_id'],
    ####################################
    ['item_id', 'user_gender_id'],
    ['item_id', 'user_occupation_id'],
    ['item_id', 'user_gender_id', 'user_occupation_id'],
    ####################################
    ['item_id', 'context_id'],
    ['item_id', 'context_page_id'],
    ['item_id', 'context_page_id', 'context_id'],
    ####################################
    ['item_category_list', 'item_id'],
    ['item_property_list', 'item_id'],
    ['item_category_list',  'item_property_list', 'item_id'],
    ['predict_category_property', 'item_id'],
    ['item_category_list', 'predict_category_property', 'item_id'],
    ['item_property_list', 'predict_category_property', 'item_id']
]

In [None]:
# Statistic functions applied to every feature group. count_size is listed
# first because count_funcs_compute divides the later results by the first.
funcs = [count_size, all_unique_count_size, single_count_size]
# Time columns used as cumulative grouping keys by count_funcs_compute
# (first 'day', then 'day' + 'hour', and so on).
periods = ['day', 'hour', 'minute', 'second']

In [None]:
# Compute the statistics for every feature group with the functions above;
# `data` is rebound each iteration so the new columns accumulate.
for num, feats in enumerate(feats_list):
    print('######  Features {} begin  ######'.format(num))
    data = count_funcs_compute(funcs, periods, feats, data)
    print('######  Features {} end  ######'.format(num))

In [None]:
# Save the data produced by this step (row index is not written).
data.to_csv('../Temp/train_day_7_test_step_3.csv', index=False)