# 用户当天及当前小时对各特征的点击次数

In [1]:
import os
import pickle
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import load_pickle, dump_pickle, raw_data_path, feature_data_path
from utils import extract_ctr

In [2]:
def gen_user_feature_click_day():
    """Generate and cache per-user, per-day row counts for each categorical feature.

    For every feature in ``feature_list`` the rows of ``all_data_4567.pkl``
    are counted per ``(user_id, day, <feature>)`` combination (each row is
    treated as one click) and the result is written with ``dump_pickle`` to
    ``feature_data_path + '_2_1_user_<feature>_click_day.pkl'``.

    Each cached frame has the columns ``user_id``, ``day``, ``<feature>`` and
    ``user_<feature>_click_day``.

    Features whose cache file already exists are skipped. The raw data is
    loaded lazily, on the first missing cache file, so a call where every
    file is already present never loads the raw pickle.

    Returns:
        None. The results are only written to disk.
    """
    feature_list = ['category2_label', 'category3_label',
                    'shop_id', 'item_id', 'item_brand_id',
                    'context_page_id',
                    'item_price_level_bin', 'item_sales_level_bin',
                    'item_property_topic_k_15',
                    ]

    # Loaded on demand: the raw frame is large and is not needed at all when
    # every feature file is already cached.
    data = None

    for feature in tqdm(feature_list):
        count_col = 'user_' + feature + '_click_day'
        feature_path = feature_data_path + '_2_1_' + count_col + '.pkl'

        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue

        print('generating ' + feature_path)

        if data is None:
            data = load_pickle(raw_data_path + 'all_data_4567.pkl')

        # size() counts the rows in each group; reset_index(name=...) turns
        # the group keys back into columns and names the count column.
        user_feature_click_day = (
            data.groupby(['user_id', 'day', feature])
            .size()
            .reset_index(name=count_col)
        )
        dump_pickle(user_feature_click_day, feature_path)


def add_user_feature_click_day(data):
    """Left-merge the cached per-user daily click counts of every feature onto ``data``.

    join_key: ['user_id', 'feature_id', 'day']

    For each feature the cached frame
    ``feature_data_path + '_2_1_user_<feature>_click_day.pkl'`` is loaded; if
    that file does not exist yet, ``gen_user_feature_click_day()`` is called
    first to create it.

    Args:
        data: DataFrame holding ``user_id``, ``day`` and every feature column
            listed below.

    Returns:
        The merged DataFrame. Rows of ``data`` without a matching key in a
        cached frame get NaN in the columns coming from that frame.
    """
    feature_list = [
        'category2_label', 'category3_label',
        'shop_id', 'item_id', 'item_brand_id',
        'context_page_id',
        'item_price_level_bin', 'item_sales_level_bin',
        'item_property_topic_k_15',
    ]

    for feature in tqdm(feature_list):
        cache_file = feature_data_path + '_2_1_' + 'user_' + feature + '_click_day.pkl'

        # Build the cache lazily the first time a file turns out to be missing.
        cache_missing = not os.path.exists(cache_file)
        if cache_missing:
            gen_user_feature_click_day()

        daily_counts = load_pickle(cache_file)
        join_keys = [feature, 'day', 'user_id']
        data = pd.merge(data, daily_counts, how='left', on=join_keys)

    return data

In [4]:
def gen_user_feature_click_hour():
    """Generate and cache per-user, per-hour row counts for each categorical feature.

    For every feature in ``feature_list`` the rows of ``all_data_4567.pkl``
    are counted per ``(user_id, day, hour, <feature>)`` combination (each row
    is treated as one click) and the result is written with ``dump_pickle``
    to ``feature_data_path + '_2_1_user_<feature>_click_hour.pkl'``.

    Each cached frame has the columns ``user_id``, ``day``, ``hour``,
    ``<feature>`` and ``user_<feature>_click_hour``.

    Features whose cache file already exists are skipped. The raw data is
    loaded lazily, on the first missing cache file, so a call where every
    file is already present never loads the raw pickle.

    Returns:
        None. The results are only written to disk.
    """
    feature_list = ['category2_label', 'category3_label',
                    'shop_id', 'item_id', 'item_brand_id',
                    'context_page_id',
                    'item_price_level_bin', 'item_sales_level_bin',
                    'item_property_topic_k_15',
                    ]

    # Loaded on demand: the raw frame is large and is not needed at all when
    # every feature file is already cached.
    data = None

    for feature in tqdm(feature_list):
        count_col = 'user_' + feature + '_click_hour'
        feature_path = feature_data_path + '_2_1_' + count_col + '.pkl'

        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue

        print('generating ' + feature_path)

        if data is None:
            data = load_pickle(raw_data_path + 'all_data_4567.pkl')

        # size() counts the rows in each group; reset_index(name=...) turns
        # the group keys back into columns and names the count column.
        user_feature_click_hour = (
            data.groupby(['user_id', 'day', 'hour', feature])
            .size()
            .reset_index(name=count_col)
        )
        dump_pickle(user_feature_click_hour, feature_path)


def add_user_feature_click_hour(data):
    """Left-merge the cached per-user hourly click counts of every feature onto ``data``.

    join_key: ['user_id', 'feature_id', 'day', 'hour']

    For each feature the cached frame
    ``feature_data_path + '_2_1_user_<feature>_click_hour.pkl'`` is loaded;
    if that file does not exist yet, ``gen_user_feature_click_hour()`` is
    called first to create it.

    Args:
        data: DataFrame holding ``user_id``, ``day``, ``hour`` and every
            feature column listed below.

    Returns:
        The merged DataFrame. Rows of ``data`` without a matching key in a
        cached frame get NaN in the columns coming from that frame.
    """
    feature_list = [
        'category2_label', 'category3_label',
        'shop_id', 'item_id', 'item_brand_id',
        'context_page_id',
        'item_price_level_bin', 'item_sales_level_bin',
        'item_property_topic_k_15',
    ]

    for feature in tqdm(feature_list):
        cache_file = feature_data_path + '_2_1_' + 'user_' + feature + '_click_hour.pkl'

        # Build the cache lazily the first time a file turns out to be missing.
        cache_missing = not os.path.exists(cache_file)
        if cache_missing:
            gen_user_feature_click_hour()

        hourly_counts = load_pickle(cache_file)
        join_keys = [feature, 'day', 'hour', 'user_id']
        data = pd.merge(data, hourly_counts, how='left', on=join_keys)

    return data

In [5]:
if __name__ =='__main__':
    
    # Start from the full raw dataset (the same pickle the gen_* functions read).
    data = load_pickle(raw_data_path + 'all_data_4567.pkl')
    
    # Left-merge the cached per-day counts, then the per-hour counts; any
    # missing cache file is generated on the fly inside the add_* functions.
    data = add_user_feature_click_day(data)
    data = add_user_feature_click_hour(data)

    # Sanity check: the user_*_click_day / user_*_click_hour columns should be listed.
    print(data.columns)

 89%|████████▉ | 8/9 [01:14<00:09,  9.33s/it]
  0%|          | 0/9 [00:00<?, ?it/s][A

found ../features/_2_1_user_category2_label_click_day.pkl
found ../features/_2_1_user_category3_label_click_day.pkl
found ../features/_2_1_user_shop_id_click_day.pkl
found ../features/_2_1_user_item_id_click_day.pkl
found ../features/_2_1_user_item_brand_id_click_day.pkl
found ../features/_2_1_user_context_page_id_click_day.pkl
found ../features/_2_1_user_item_price_level_bin_click_day.pkl
found ../features/_2_1_user_item_sales_level_bin_click_day.pkl
generating ../features/_2_1_user_item_property_topic_k_15_click_day.pkl



100%|██████████| 9/9 [00:04<00:00,  2.23it/s][A
100%|██████████| 9/9 [01:36<00:00, 10.69s/it]
 89%|████████▉ | 8/9 [01:15<00:09,  9.41s/it]
  0%|          | 0/9 [00:00<?, ?it/s][A

found ../features/_2_1_user_category2_label_click_hour.pkl
found ../features/_2_1_user_category3_label_click_hour.pkl
found ../features/_2_1_user_shop_id_click_hour.pkl
found ../features/_2_1_user_item_id_click_hour.pkl
found ../features/_2_1_user_item_brand_id_click_hour.pkl
found ../features/_2_1_user_context_page_id_click_hour.pkl
found ../features/_2_1_user_item_price_level_bin_click_hour.pkl
found ../features/_2_1_user_item_sales_level_bin_click_hour.pkl
generating ../features/_2_1_user_item_property_topic_k_15_click_hour.pkl



100%|██████████| 9/9 [00:18<00:00,  2.06s/it][A
100%|██████████| 9/9 [02:08<00:00, 14.28s/it]

Index(['index', 'instance_id', 'item_id', 'item_category_list',
       'item_property_list', 'item_brand_id', 'item_city_id',
       'item_price_level', 'item_sales_level', 'item_collected_level',
       ...
       'user_item_property_topic_k_15_click_day',
       'user_category2_label_click_hour', 'user_category3_label_click_hour',
       'user_shop_id_click_hour', 'user_item_id_click_hour',
       'user_item_brand_id_click_hour', 'user_context_page_id_click_hour',
       'user_item_price_level_bin_click_hour',
       'user_item_sales_level_bin_click_hour',
       'user_item_property_topic_k_15_click_hour'],
      dtype='object', length=105)





In [8]:
# Number of missing values per column of the merged frame.
data.isna().sum()

index                                   0
instance_id                             0
item_id                                 0
item_category_list                      0
item_property_list                      0
item_brand_id                           0
item_city_id                            0
item_price_level                        0
item_sales_level                        0
item_collected_level                    0
item_pv_level                           0
user_id                                 0
user_gender_id                          0
user_age_level                          0
user_occupation_id                      0
user_star_level                         0
context_id                              0
context_timestamp                       0
context_page_id                         0
predict_category_property               0
shop_id                                 0
shop_review_num_level                   0
shop_review_positive_rate               0
shop_star_level                   

In [9]:
# Column dtypes of the merged frame.
# NOTE(review): a left merge leaves NaN for unmatched keys, which would turn
# an integer count column into float — worth confirming in this output.
data.dtypes

index                                      int64
instance_id                                int64
item_id                                    int64
item_category_list                        object
item_property_list                        object
item_brand_id                              int64
item_city_id                               int64
item_price_level                           int64
item_sales_level                           int64
item_collected_level                       int64
item_pv_level                              int64
user_id                                    int64
user_gender_id                             int64
user_age_level                             int64
user_occupation_id                         int64
user_star_level                            int64
context_id                                 int64
context_timestamp                          int64
context_page_id                            int64
predict_category_property                 object
shop_id             

In [10]:
# Number of rows for each value of the `day` column.
data['day'].value_counts()

7    2806831
6    1934443
5    1200219
4    1157641
Name: day, dtype: int64

In [10]:
# Run extract_ctr on the user's same-day click count for the item, using only
# day-7 rows whose is_trade is not -1.
# NOTE(review): -1 presumably marks unlabeled rows — confirm against the data prep.
extract_ctr(
    data[(data['is_trade'] != -1) & (data['day'] == 7)],
    feature='user_item_id_click_day',
    alias='feature_ctr',
)

Unnamed: 0,user_item_id_click_day,query_cnt,conversion_cnt,feature_ctr
0,1,986185,45460.0,0.046097
1,2,76781,3603.0,0.046926
2,3,10647,457.0,0.042923
3,4,2360,85.0,0.036017
4,5,685,25.0,0.036496
5,6,271,11.0,0.04059
6,7,104,1.0,0.009615
7,8,51,1.0,0.019608
8,9,45,3.0,0.066667
9,10,25,0.0,0.0
