In [7]:
import os
import pickle
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import load_pickle, dump_pickle, raw_data_path, feature_data_path

In [8]:
pre_user_id = None
pre_feature = None
continue_cnt = 0 

def get_user_feature_pre_click(row, feature):
    
    global pre_user_id
    global pre_feature
    global continue_cnt
    
    if row['user_id'] == pre_user_id: 
        if row[feature] == pre_feature:
            # 该用户当前点击与上次一样的feature
            continue_cnt += 1
            return 1
        else:
            # 记录用户当前点击的物品
            pre_feature = row[feature]
            return 0
    
    else:
        # 上一个用户已经计算完成
        pre_user_id = row['user_id']
        pre_feature = row[feature]
        return 0
    
def get_user_feature_continue_click(row, feature):
    
    global pre_user_id
    global pre_feature
    global continue_cnt
    
    if row['user_id'] == pre_user_id: 
        if row[feature] == pre_feature:
            # 该用户当前点击与上次一样的feature
            continue_cnt += 1
            return continue_cnt
        else:
            # 记录用户当前点击的物品
            pre_feature = row[feature]
            continue_cnt = 1
            return continue_cnt
    
    else:
        # 上一个用户已经计算完成
        pre_user_id = row['user_id']
        pre_feature = row[feature]
        continue_cnt = 1
        return continue_cnt


def gen_user_feature_pre_click(update=True):
    '''用户当前点击与上次一样的feature

    file_name: user_feature_pre_click.pkl

    features:
        'user_item_id_pre_click', 'user_item_brand_id_pre_click',
        'user_shop_id_pre_click', 'user_category2_label_pre_click',

    '''

    all_data = load_pickle(raw_data_path + 'all_data.pkl')

    feature_list = ['item_id', 'item_brand_id', 'shop_id', 'category2_label',]

    for feature in tqdm(feature_list):

        feature_path = feature_data_path + 'user_'+feature+'_pre_click.pkl'

        if os.path.exists(feature_path) and update == False:
            print('found '+feature_path)
        else:
            print('generating '+feature_path)

            pre_click_feature_name = 'user_' + feature + '_pre_click'
            continue_click_feature_name = 'user_' + feature + '_continue_click'
            

            # 用户点击时间戳排序
            sorted_data = all_data.sort_values(
                by=['user_id', 'context_timestamp'], ascending=True)[['user_id', feature, 'context_timestamp']]
            
            sorted_data[pre_click_feature_name] = sorted_data.apply(lambda row: get_user_feature_pre_click(row, feature), axis=1)
            sorted_data[continue_click_feature_name] = sorted_data.apply(lambda row: get_user_feature_continue_click(row, feature), axis=1)
            
            sorted_data = sorted_data[[pre_click_feature_name, continue_click_feature_name]]
            
            dump_pickle(sorted_data, feature_path)


def add_user_feature_pre_click(data):


    feature_list = ['item_id', 'item_brand_id', 'shop_id', 'category2_label',]

    for feature in tqdm(feature_list):
        feature_path = feature_data_path + 'user_'+feature+'_pre_click.pkl'
        if not os.path.exists(feature_path):
            gen_user_feature_pre_click()
        user_feature_pre_click = load_pickle(feature_path)
        data = data.join(user_feature_pre_click)

    return data


# user click interval

In [9]:
if __name__ =='__main__':
    all_data = load_pickle(raw_data_path + 'all_data.pkl')
    gen_user_feature_pre_click()
    all_data = add_user_feature_pre_click(all_data)

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

generating ../features/user_item_id_pre_click.pkl


 25%|█████████████████████                                                               | 1/4 [00:23<01:11, 23.67s/it]

generating ../features/user_item_brand_id_pre_click.pkl


 50%|██████████████████████████████████████████                                          | 2/4 [00:46<00:46, 23.43s/it]

generating ../features/user_shop_id_pre_click.pkl


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [01:09<00:23, 23.33s/it]

generating ../features/user_category2_label_pre_click.pkl


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:31<00:00, 22.79s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  7.03it/s]


In [6]:
sorted_data = all_data.sort_values(by=['user_id', 'context_timestamp'], ascending=True)[
    ['user_id', 'item_id', 'shop_id', 'category2_label',  'context_timestamp', 'user_item_id_pre_click', 'user_shop_id_pre_click']]
# sorted_data[sorted_data['user_item_id_pre_click'] == 1]
sorted_data[sorted_data['user_id'] == 10452258869178394]

Unnamed: 0,user_id,item_id,shop_id,category2_label,context_timestamp,user_item_id_pre_click,user_shop_id_pre_click
59830,10452258869178394,6677310098456620917,1075328862710772224,7,1537258651,0,0
13444,10452258869178394,7905777711208821810,4215742678121737516,10,1537258715,0,0
4887,10452258869178394,6625137068692942664,8953967780295706657,7,1537258878,0,0
5228,10452258869178394,6625137068692942664,8953967780295706657,7,1537258921,1,1
