# 特征的平均日点击量

In [1]:
import os
import pickle
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import load_pickle, dump_pickle, raw_data_path, feature_data_path

In [2]:
def gen_feature_click_day_stats(data, feature):
    """Compute the mean daily click count for each value of a categorical feature.

    Rows of ``data`` are counted per ``(day, feature value)`` pair, and those
    per-day counts are then averaged for every feature value.

    Args:
        data: DataFrame containing the ``feature`` column and a ``day``
            column. It is not modified.
        feature: Name of the categorical column to aggregate.

    Returns:
        DataFrame with one row per distinct feature value (in order of first
        appearance in ``data``) and two columns: ``feature`` and
        ``feature + '_click_day_mean'``.
    """
    # Column selection already yields a new frame, so the caller's data stays
    # untouched without first deep-copying every other column.
    subset = data[[feature, 'day']]

    # Number of rows (clicks) per day for every feature value.
    daily_counts = (
        subset.groupby(['day', feature])
        .size()
        .reset_index(name='feature_click_day')
    )

    # Average the daily counts of each feature value.
    mean_column = feature + '_click_day_mean'
    daily_mean = (
        daily_counts.groupby(feature)['feature_click_day']
        .mean()
        .rename(mean_column)
        .reset_index()
    )

    # Keep a single record per feature value, then attach its mean.
    distinct_values = subset[[feature]].drop_duplicates()
    return pd.merge(distinct_values, daily_mean, how='left', on=feature)


def gen_feature_click_stats():
    """Build and cache the mean-daily-click table of every categorical feature.

    Loads ``all_data.pkl`` from ``raw_data_path`` and, for each feature column
    listed below, computes its table with ``gen_feature_click_day_stats`` and
    pickles it to::

        feature_data_path + '_2_2_' + <feature> + '_click_day_mean.pkl'

    Example:
        ``user_id_click_day_mean`` is how many times a user clicks per day on
        average.

    Generated feature columns:
        'user_id_click_day_mean',
        'category2_label_click_day_mean',
        'category3_label_click_day_mean',
        'shop_id_click_day_mean',
        'item_id_click_day_mean',
        'item_brand_id_click_day_mean',
        'context_page_id_click_day_mean'
    """
    source = load_pickle(raw_data_path + 'all_data.pkl')

    columns = [
        'user_id',
        'category2_label',
        'category3_label',
        'shop_id',
        'item_id',
        'item_brand_id',
        'context_page_id',
    ]

    for column in tqdm(columns):
        target = '{}_2_2_{}_click_day_mean.pkl'.format(feature_data_path, column)
        print('generating {}'.format(target))

        table = gen_feature_click_day_stats(source, column)

        # Show the resulting column names as a quick sanity check.
        print(table.columns)
        dump_pickle(table, target)


def add_feature_click_stats(data):
    """Left-join the cached mean-daily-click columns onto ``data``.

    For every categorical feature listed below, the matching pickle
    (``feature_data_path + '_2_2_' + <feature> + '_click_day_mean.pkl'``) is
    loaded and merged into ``data`` using the feature column itself as the
    join key.

    Args:
        data: DataFrame that contains all of the feature columns listed below.

    Returns:
        The merged DataFrame, with one ``<feature>_click_day_mean`` column
        added per feature.
    """
    columns = [
        'user_id',
        'category2_label',
        'category3_label',
        'shop_id',
        'item_id',
        'item_brand_id',
        'context_page_id',
    ]
    suffix = '_click_day_mean.pkl'

    for column in tqdm(columns):
        cached = feature_data_path + '_2_2_' + column + suffix

        # A missing cache file triggers a rebuild of the tables for all
        # features before loading.
        if not os.path.exists(cached):
            gen_feature_click_stats()

        stats = load_pickle(cached)
        data = pd.merge(data, stats, how='left', on=[column])

    return data

In [None]:
if __name__ == '__main__':
    # Load the 'all_data_4567.pkl' frame and attach the click statistics.
    # The result stays in the global ``data`` for the inspection cells below.
    source_path = raw_data_path + 'all_data_4567.pkl'
    data = add_feature_click_stats(load_pickle(source_path))
    print(data.columns)


  0%|          | 0/7 [00:00<?, ?it/s][A

generating ../features/_2_2_user_id_click_day_mean.pkl



 14%|█▍        | 1/7 [00:26<02:37, 26.32s/it][A

Index(['user_id', 'user_id_click_day_mean'], dtype='object')
generating ../features/_2_2_category2_label_click_day_mean.pkl



 29%|██▊       | 2/7 [00:40<01:40, 20.03s/it][A

Index(['category2_label', 'category2_label_click_day_mean'], dtype='object')
generating ../features/_2_2_category3_label_click_day_mean.pkl



 43%|████▎     | 3/7 [00:52<01:09, 17.45s/it][A

Index(['category3_label', 'category3_label_click_day_mean'], dtype='object')
generating ../features/_2_2_shop_id_click_day_mean.pkl



 57%|█████▋    | 4/7 [01:52<01:24, 28.21s/it][A

Index(['shop_id', 'shop_id_click_day_mean'], dtype='object')
generating ../features/_2_2_item_id_click_day_mean.pkl



 71%|███████▏  | 5/7 [02:58<01:11, 35.79s/it][A

Index(['item_id', 'item_id_click_day_mean'], dtype='object')
generating ../features/_2_2_item_brand_id_click_day_mean.pkl



 86%|████████▌ | 6/7 [03:16<00:32, 32.73s/it][A

Index(['item_brand_id', 'item_brand_id_click_day_mean'], dtype='object')
generating ../features/_2_2_context_page_id_click_day_mean.pkl



100%|██████████| 7/7 [04:08<00:00, 35.45s/it][A
[A

Index(['context_page_id', 'context_page_id_click_day_mean'], dtype='object')


 57%|█████▋    | 4/7 [06:47<05:05, 101.98s/it]

In [4]:
# Count the missing values in every column of the merged frame.
data.isnull().sum()

index                                        0
instance_id                                  0
item_id                                      0
item_category_list                           0
item_property_list                           0
item_brand_id                                0
item_city_id                                 0
item_price_level                             0
item_sales_level                             0
item_collected_level                         0
item_pv_level                                0
user_id                                      0
user_gender_id                               0
user_age_level                               0
user_occupation_id                           0
user_star_level                              0
context_id                                   0
context_timestamp                            0
context_page_id                              0
predict_category_property                    0
shop_id                                      0
shop_review_n

In [5]:
# Show the dtype of every column of the merged frame.
data.dtypes

index                                           int64
instance_id                                     int64
item_id                                         int64
item_category_list                             object
item_property_list                             object
item_brand_id                                   int64
item_city_id                                    int64
item_price_level                                int64
item_sales_level                                int64
item_collected_level                            int64
item_pv_level                                   int64
user_id                                         int64
user_gender_id                                  int64
user_age_level                                  int64
user_occupation_id                              int64
user_star_level                                 int64
context_id                                      int64
context_timestamp                               int64
context_page_id             

In [6]:
# Number of rows for each distinct value of the `day` column.
data.day.value_counts()

6    1934443
7    1597063
5    1200219
4    1157641
Name: day, dtype: int64