In [None]:
import time
import datetime
import gc
from collections import Counter

import pandas as pd
import numpy as np

In [None]:
# Load the data produced by Step_3
data = pd.read_csv('../Temp/train_day_7_test_step_3.csv')

### Click-count and time-difference functions

In [None]:
# Count how many earlier rows share the same key (cumulative feature)
def before_click_count(feats, df):
    """Add a per-row count of preceding rows in the same ``feats`` group.

    Rows are first ordered by ``context_timestamp``, so "preceding" means
    earlier in that ordering.

    Args:
        feats: Column names to group by; they are also joined with ``_`` to
            build the name of the new column.
        df: DataFrame holding ``context_timestamp`` and every column in
            ``feats``. The frame passed in is not modified.

    Returns:
        A new DataFrame sorted by ``context_timestamp`` with a fresh
        0..n-1 index and an extra ``<feats>_before_click_count`` column
        (0 for the first row of each group).
    """
    column = '_'.join(feats) + '_before_click_count'
    ordered = df.sort_values(by='context_timestamp').reset_index(drop=True)
    # cumcount numbers the rows of each group 0, 1, 2, ... in row order
    ordered[column] = ordered.groupby(feats).cumcount()
    return ordered

# Count how many later rows share the same key (cumulative feature)
def after_click_count(feats, df):
    """Add a per-row count of following rows in the same ``feats`` group.

    Rows are first ordered by ``context_timestamp``, so "following" means
    later in that ordering.

    Args:
        feats: Column names to group by; they are also joined with ``_`` to
            build the name of the new column.
        df: DataFrame holding ``context_timestamp`` and every column in
            ``feats``. The frame passed in is not modified.

    Returns:
        A new DataFrame sorted by ``context_timestamp`` with a fresh
        0..n-1 index and an extra ``<feats>_after_click_count`` column
        (0 for the last row of each group).
    """
    column = '_'.join(feats) + '_after_click_count'
    ordered = df.sort_values(by='context_timestamp').reset_index(drop=True)
    # Descending cumcount numbers each group n-1, ..., 1, 0, i.e. the number
    # of rows still to come after the current one.
    ordered[column] = ordered.groupby(feats).cumcount(ascending=False)
    return ordered

# Time elapsed since the previous row with the same key
def prev_time_diff(feats, df):
    """Add the gap between each row and the previous row of its group.

    Rows are first ordered by ``context_timestamp``; within each ``feats``
    group the previous row's timestamp is subtracted from the current one.

    Args:
        feats: Column names to group by; they are also joined with ``_`` to
            build the name of the new column.
        df: DataFrame holding ``context_timestamp`` and every column in
            ``feats``. The frame passed in is not modified.

    Returns:
        A new DataFrame sorted by ``context_timestamp`` with a fresh
        0..n-1 index and an extra ``<feats>_prev_time_diff`` column. The
        first row of each group has no predecessor and gets NaN.
    """
    df = df.sort_values(by='context_timestamp').reset_index(drop=True)
    # Use a `_prev_` suffix: sharing the `_next_time_diff` name with
    # next_time_diff would let that function overwrite this column.
    name = '_'.join(feats) + '_prev_time_diff'
    previous = df.groupby(feats)['context_timestamp'].shift(1)
    df[name] = df['context_timestamp'] - previous
    return df

# Time remaining until the next row with the same key
def next_time_diff(feats, df):
    """Add the gap between each row and the next row of its group.

    Rows are first ordered by ``context_timestamp``; within each ``feats``
    group the current timestamp is subtracted from the next row's one.

    Args:
        feats: Column names to group by; they are also joined with ``_`` to
            build the name of the new column.
        df: DataFrame holding ``context_timestamp`` and every column in
            ``feats``. The frame passed in is not modified.

    Returns:
        A new DataFrame sorted by ``context_timestamp`` with a fresh
        0..n-1 index and an extra ``<feats>_next_time_diff`` column. The
        last row of each group has no successor and gets NaN.
    """
    column = '_'.join(feats) + '_next_time_diff'
    ordered = df.sort_values(by='context_timestamp').reset_index(drop=True)
    # shift(-1) pulls the following row's timestamp up within each group
    upcoming = ordered.groupby(feats)['context_timestamp'].shift(-1)
    ordered[column] = upcoming - ordered['context_timestamp']
    return ordered

In [6]:
# Key combinations for which the click-count / time-difference features are
# computed (same combinations as the grouped statistics in Step_3).
# Each entry is a list of column names used together as one group-by key.
feats_list = [
    # user / item / shop identifiers
    ['user_id'],
    ['item_id'],
    ['user_id', 'item_id'],
    ['shop_id', 'item_id'],
    ['user_id', 'shop_id', 'item_id'],

    # item brand and city
    ['item_brand_id'],
    ['item_city_id'],
    ['item_brand_id', 'item_id'],
    ['item_city_id', 'item_id'],
    ['item_brand_id', 'item_city_id', 'item_id'],

    # user age level / gender crossed with item
    ['user_age_level', 'item_id'],
    ['user_gender_id', 'item_id'],
    ['user_gender_id', 'user_age_level', 'item_id'],

    # user crossed with item brand / city
    ['user_id', 'item_brand_id'],
    ['user_id', 'item_city_id'],
    ['user_id', 'item_city_id', 'item_brand_id'],

    # item crossed with user gender / occupation
    # NOTE(review): ['item_id', 'user_gender_id'] uses the same columns as
    # ['user_gender_id', 'item_id'] above, only in a different order --
    # confirm whether the duplicate grouping is intended.
    ['item_id', 'user_gender_id'],
    ['item_id', 'user_occupation_id'],
    ['item_id', 'user_gender_id', 'user_occupation_id'],

    # item crossed with context id / page
    ['item_id', 'context_id'],
    ['item_id', 'context_page_id'],
    ['item_id', 'context_page_id', 'context_id'],

    # item crossed with category / property / predicted category-property
    ['item_category_list', 'item_id'],
    ['item_property_list', 'item_id'],
    ['item_category_list', 'item_property_list', 'item_id'],
    ['predict_category_property', 'item_id'],
    ['item_category_list', 'predict_category_property', 'item_id'],
    ['item_property_list', 'predict_category_property', 'item_id'],
]

In [None]:
# Compute the click-count and time-difference features for every key combination
for num, feats in enumerate(feats_list):
    print('######  Features {} begin  ######'.format(num))
    # Each helper returns a new frame re-sorted by context_timestamp, so the
    # result is reassigned to `data`. The loop variable is `feats`; passing an
    # undefined `feat` raises NameError on a fresh kernel.
    data = before_click_count(feats, data)
    data = after_click_count(feats, data)
    data = prev_time_diff(feats, data)
    data = next_time_diff(feats, data)
    print('######  Features {} end  ######'.format(num))

In [None]:
# Save the result of this step (row index is not written)
data.to_csv('../Temp/train_day_7_test_step_4.csv', index=False)