In [2]:
import os
import pickle
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from utils import raw_data_path, feature_data_path,result_path,cache_pkl_path,dump_pickle,load_pickle
from smooth import BayesianSmoothing

# 2-9历史CTR

In [3]:
def gen_29_smooth_ctr():
    """Build and cache historical trade-rate tables for crossed feature columns.

    Loads ``all_data.pkl`` from ``raw_data_path``. For each crossed column named
    ``<feature_1>_<feature_2>`` (every user feature paired with every
    item/shop/category feature, then every unordered pair of user features) a
    table is written to ``feature_data_path + '_2_5_' + <column> + '_CTR.pkl'``.
    Columns whose file already exists are skipped.

    Each table has the columns ``['day', <column>, <column> + '_smooth_CTR']``.
    For every target ``day`` in 4..7 the rate is computed only from rows whose
    ``day`` is strictly smaller than the target: the number of rows with
    ``is_trade == 1`` divided by the number of rows, per value of the column.

    Note: despite the ``_smooth_`` naming, the stored value is the plain ratio;
    no smoothing is applied in this function.

    The crossed columns must already exist in the loaded frame.
    """
    all_data = load_pickle(raw_data_path+'all_data.pkl')

    user_features = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
    other_features = ['item_price_level', 'item_sales_level',
                      'shop_star_level', 'shop_review_num_level', 'shop_review_positive_rate',
                      'category2_label', 'category3_label']

    # user x item/shop/category crosses first, then user x user crosses
    # (each unordered pair exactly once), in the same order as before.
    feature_pairs = []
    for feature_1 in user_features:
        for feature_2 in other_features:
            feature_pairs.append((feature_1, feature_2))
    for i, feature_1 in enumerate(user_features):
        for feature_2 in user_features[i + 1:]:
            feature_pairs.append((feature_1, feature_2))

    for feature_1, feature_2 in tqdm(feature_pairs):
        feature = feature_1 + '_' + feature_2
        feature_path = feature_data_path+'_2_5_'+feature+'_CTR.pkl'  # where the table is cached
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue

        print('generating ' + feature_path)
        I_alias = feature+'_smooth_I'  # number of rows (impressions)
        C_alias = feature+'_smooth_C'  # number of rows with is_trade == 1
        CTR_alias = feature+'_smooth_CTR'

        # Collect the per-day tables and concatenate once: DataFrame.append was
        # removed in pandas 2.0 and re-copied the whole frame on every call.
        daily_tables = []
        for day in range(4, 8):
            history_data = all_data[all_data['day'] < day]
            I = history_data.groupby([feature]).size().reset_index().rename(columns={0: I_alias})
            C = history_data[history_data['is_trade'] == 1].groupby([feature]).size().reset_index().rename(columns={0: C_alias})
            CTR = pd.merge(I, C, how='left', on=[feature])
            # Values that were never traded are missing from C after the left join.
            CTR[C_alias] = CTR[C_alias].fillna(0)
            CTR[CTR_alias] = (CTR[C_alias]) / (CTR[I_alias])
            CTR['day'] = day
            daily_tables.append(CTR)
        history_ctr = pd.concat(daily_tables)

        dump_pickle(history_ctr[['day', feature, CTR_alias]], feature_path)  # store
                
                
                

def add_29_smooth_ctr(all_data):
    """Left-join the cached crossed-feature trade-rate tables onto ``all_data``.

    For every crossed column ``<feature_1>_<feature_2>`` (user feature x
    item/shop/category feature, then each unordered pair of user features) the
    table stored at ``feature_data_path + '_2_5_' + <column> + '_CTR.pkl'`` is
    merged on ``[<column>, 'day']``. When a table file is missing,
    ``gen_29_smooth_ctr()`` is called before loading it.

    Returns the merged frame; the frame passed in is not modified.
    """
    user_columns = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
    target_columns = ['item_price_level', 'item_sales_level',
                      'shop_star_level', 'shop_review_num_level', 'shop_review_positive_rate',
                      'category2_label', 'category3_label']

    def _join_cached_table(frame, crossed_column):
        # Path of the cached table for this crossed column.
        table_path = feature_data_path + '_2_5_' + crossed_column + '_CTR.pkl'
        if not os.path.exists(table_path):
            gen_29_smooth_ctr()
        cached_table = load_pickle(table_path)
        return pd.merge(frame, cached_table, how='left', on=[crossed_column, 'day'])

    # user feature x item/shop/category feature
    for user_column in user_columns:
        for target_column in tqdm(target_columns):
            all_data = _join_cached_table(all_data, user_column + '_' + target_column)

    # user feature x user feature (each unordered pair once)
    for position, left_column in enumerate(user_columns):
        for right_column in user_columns[position + 1:]:
            all_data = _join_cached_table(all_data, left_column + '_' + right_column)

    return all_data

# 2-9hour CTR

In [4]:
def gen_29_hour_ctr():
    """Build and cache per-``hour_bin`` historical trade-rate tables for crossed columns.

    Loads ``all_data.pkl`` from ``raw_data_path``. For each crossed column named
    ``<feature_1>_<feature_2>`` (every user feature paired with every
    item/shop/category feature, then every unordered pair of user features) a
    table is written to ``feature_data_path + '_2_5_' + <column> + '_hour_CTR.pkl'``.
    Columns whose file already exists are skipped.

    Each table has the columns
    ``['day', 'hour_bin', <column>, <column> + '_hour_CTR']``. For every target
    ``day`` in 4..7 the rate is computed only from rows whose ``day`` is strictly
    smaller than the target: the number of rows with ``is_trade == 1`` divided
    by the number of rows, per ``(<column>, hour_bin)`` combination.

    The crossed columns and ``hour_bin`` must already exist in the loaded frame.
    """
    all_data = load_pickle(raw_data_path+'all_data.pkl')

    user_features = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
    other_features = ['item_price_level', 'item_sales_level',
                      'shop_star_level', 'shop_review_num_level', 'shop_review_positive_rate',
                      'category2_label', 'category3_label']

    # user x item/shop/category crosses first, then user x user crosses
    # (each unordered pair exactly once), in the same order as before.
    feature_pairs = []
    for feature_1 in user_features:
        for feature_2 in other_features:
            feature_pairs.append((feature_1, feature_2))
    for i, feature_1 in enumerate(user_features):
        for feature_2 in user_features[i + 1:]:
            feature_pairs.append((feature_1, feature_2))

    for feature_1, feature_2 in tqdm(feature_pairs):
        feature = feature_1 + '_' + feature_2
        feature_path = feature_data_path+'_2_5_'+feature+'_hour_CTR.pkl'  # where the table is cached
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue

        print('generating ' + feature_path)
        I_alias = feature+'_hour_I'  # number of rows (impressions)
        C_alias = feature+'_hour_C'  # number of rows with is_trade == 1
        CTR_alias = feature+'_hour_CTR'
        group_keys = [feature, 'hour_bin']

        # Collect the per-day tables and concatenate once: DataFrame.append was
        # removed in pandas 2.0 and re-copied the whole frame on every call.
        daily_tables = []
        for day in range(4, 8):
            history_data = all_data[all_data['day'] < day]
            I = history_data.groupby(group_keys).size().reset_index().rename(columns={0: I_alias})
            C = history_data[history_data['is_trade'] == 1].groupby(group_keys).size().reset_index().rename(columns={0: C_alias})
            CTR = pd.merge(I, C, how='left', on=group_keys)
            # Combinations that were never traded are missing from C after the left join.
            CTR[C_alias] = CTR[C_alias].fillna(0)
            CTR[CTR_alias] = (CTR[C_alias]) / (CTR[I_alias])
            CTR['day'] = day
            daily_tables.append(CTR)
        history_ctr = pd.concat(daily_tables)

        dump_pickle(history_ctr[['day', 'hour_bin', feature, CTR_alias]], feature_path)  # store
                
                
                

def add_29_hour_ctr(all_data):
    """Left-join the cached per-``hour_bin`` crossed-feature tables onto ``all_data``.

    For every crossed column ``<feature_1>_<feature_2>`` (user feature x
    item/shop/category feature, then each unordered pair of user features) the
    table stored at ``feature_data_path + '_2_5_' + <column> + '_hour_CTR.pkl'``
    is merged on ``[<column>, 'day', 'hour_bin']``. When a table file is
    missing, ``gen_29_hour_ctr()`` is called before loading it.

    Returns the merged frame; the frame passed in is not modified.
    """
    user_columns = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
    target_columns = ['item_price_level', 'item_sales_level',
                      'shop_star_level', 'shop_review_num_level', 'shop_review_positive_rate',
                      'category2_label', 'category3_label']

    def _join_cached_table(frame, crossed_column):
        # Path of the cached table for this crossed column.
        table_path = feature_data_path + '_2_5_' + crossed_column + '_hour_CTR.pkl'
        if not os.path.exists(table_path):
            gen_29_hour_ctr()
        cached_table = load_pickle(table_path)
        return pd.merge(frame, cached_table, how='left', on=[crossed_column, 'day', 'hour_bin'])

    # user feature x item/shop/category feature
    for user_column in user_columns:
        for target_column in tqdm(target_columns):
            all_data = _join_cached_table(all_data, user_column + '_' + target_column)

    # user feature x user feature (each unordered pair once)
    for position, left_column in enumerate(user_columns):
        for right_column in user_columns[position + 1:]:
            all_data = _join_cached_table(all_data, left_column + '_' + right_column)

    return all_data

# 单特征历史CTR

In [5]:
def gen_features_smooth_ctr():
    """Build and cache historical count / trade-rate tables for single columns.

    Loads ``all_data.pkl`` from ``raw_data_path``. For each column in the list
    below, a table is written to
    ``feature_data_path + '_2_5_' + <column> + '_smooth_CTR.pkl'``; columns
    whose file already exists are skipped.

    Each table has the columns ``['day', <column>, <column> + '_smooth_I',
    <column> + '_smooth_C', <column> + '_smooth_CTR']``. For every target
    ``day`` in 4..7 only rows whose ``day`` is strictly smaller than the target
    are used: ``_I`` is the number of rows, ``_C`` the number of rows with
    ``is_trade == 1`` and ``_CTR`` is ``_C / _I``, per value of the column.

    Note: despite the ``_smooth_`` naming, no smoothing is applied here.
    """
    all_data = load_pickle(raw_data_path+'all_data.pkl')
    for feature in tqdm(['user_id',
                         'item_id', 'item_brand_id',
                         'category2_label', 'category3_label',
                         'context_page_id',
                         'shop_id',
                         'item_sales_level_bin', 'item_price_level_bin','item_collected_level_bin','item_pv_level_bin',
                         'shop_review_num_level_bin', 'shop_review_positive_rate_bin', 'shop_star_level_bin',
                         'shop_score_service_bin', 'shop_score_delivery_bin', 'shop_score_description_bin',
                         'hour'
                        ]):
        feature_path = feature_data_path+'_2_5_'+feature+'_smooth_CTR.pkl'  # where the table is cached
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue

        print('generating ' + feature_path)
        I_alias = feature+'_smooth_I'  # number of rows (impressions)
        C_alias = feature+'_smooth_C'  # number of rows with is_trade == 1
        CTR_alias = feature+'_smooth_CTR'

        # Collect the per-day tables and concatenate once: DataFrame.append was
        # removed in pandas 2.0 and re-copied the whole frame on every call.
        daily_tables = []
        for day in range(4, 8):
            history_data = all_data[all_data['day'] < day]
            I = history_data.groupby([feature]).size().reset_index().rename(columns={0: I_alias})
            C = history_data[history_data['is_trade'] == 1].groupby([feature]).size().reset_index().rename(columns={0: C_alias})
            CTR = pd.merge(I, C, how='left', on=[feature])
            # Values that were never traded are missing from C after the left join.
            CTR[C_alias] = CTR[C_alias].fillna(0)
            CTR[CTR_alias] = (CTR[C_alias]) / (CTR[I_alias])
            CTR['day'] = day
            daily_tables.append(CTR)
        history_ctr = pd.concat(daily_tables)

        dump_pickle(history_ctr[['day', feature, I_alias, C_alias, CTR_alias]], feature_path)  # store

def add_features_smooth_ctr(all_data):
    """Left-join the cached single-column count / trade-rate tables onto ``all_data``.

    For each column in the list below, the table stored at
    ``feature_data_path + '_2_5_' + <column> + '_smooth_CTR.pkl'`` is merged on
    ``[<column>, 'day']``; ``gen_features_smooth_ctr()`` is called first when
    the file is missing. After each merge the ``_smooth_I`` and ``_smooth_C``
    columns have their missing values replaced by 0.

    Returns the merged frame; the frame passed in is not modified.
    """
    single_columns = [
        'user_id',
        'item_id', 'item_brand_id',
        'category2_label', 'category3_label',
        'context_page_id',
        'shop_id',
        'item_sales_level_bin', 'item_price_level_bin', 'item_collected_level_bin', 'item_pv_level_bin',
        'shop_review_num_level_bin', 'shop_review_positive_rate_bin', 'shop_star_level_bin',
        'shop_score_service_bin', 'shop_score_delivery_bin', 'shop_score_description_bin',
        'hour',
    ]
    for column in tqdm(single_columns):
        table_path = '{}_2_5_{}_smooth_CTR.pkl'.format(feature_data_path, column)
        if not os.path.exists(table_path):
            gen_features_smooth_ctr()
        cached_table = load_pickle(table_path)
        all_data = pd.merge(all_data, cached_table, how='left', on=[column, 'day'])
        # Keys without history get zero counts; the rate column is left as NaN.
        for count_column in (column + '_smooth_I', column + '_smooth_C'):
            all_data[count_column] = all_data[count_column].fillna(0)
    return all_data

# 单特征与hour组合

In [6]:
def gen_features_hour_ctr():
    """Build and cache per-``hour_bin`` historical trade-rate tables for single columns.

    Loads ``all_data.pkl`` from ``raw_data_path``. For each column in the list
    below, a table is written to
    ``feature_data_path + '_2_5_' + <column> + '_hour_CTR.pkl'``; columns whose
    file already exists are skipped.

    Each table has the columns
    ``['day', 'hour_bin', <column>, <column> + '_hour_CTR']``. For every target
    ``day`` in 4..7 only rows whose ``day`` is strictly smaller than the target
    are used: the rate is the number of rows with ``is_trade == 1`` divided by
    the number of rows, per ``(<column>, hour_bin)`` combination.
    """
    all_data = load_pickle(raw_data_path+'all_data.pkl')
    for feature in tqdm(['user_id',
                         'item_id', 'item_brand_id',
                         'category2_label', 'category3_label',
                         'context_page_id',
                         'shop_id',
                         'item_sales_level_bin', 'item_price_level_bin','item_collected_level_bin','item_pv_level_bin',
                         'shop_review_num_level_bin', 'shop_review_positive_rate_bin', 'shop_star_level_bin',
                         'shop_score_service_bin', 'shop_score_delivery_bin', 'shop_score_description_bin',
                        ]):
        feature_path = feature_data_path+'_2_5_'+feature+'_hour_CTR.pkl'  # where the table is cached
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue

        print('generating ' + feature_path)
        I_alias = feature+'_hour_I'  # number of rows (impressions)
        C_alias = feature+'_hour_C'  # number of rows with is_trade == 1
        CTR_alias = feature+'_hour_CTR'
        group_keys = [feature, 'hour_bin']

        # Collect the per-day tables and concatenate once: DataFrame.append was
        # removed in pandas 2.0 and re-copied the whole frame on every call.
        daily_tables = []
        for day in range(4, 8):
            history_data = all_data[all_data['day'] < day]
            I = history_data.groupby(group_keys).size().reset_index().rename(columns={0: I_alias})
            C = history_data[history_data['is_trade'] == 1].groupby(group_keys).size().reset_index().rename(columns={0: C_alias})
            CTR = pd.merge(I, C, how='left', on=group_keys)
            # Combinations that were never traded are missing from C after the left join.
            CTR[C_alias] = CTR[C_alias].fillna(0)
            CTR[CTR_alias] = (CTR[C_alias]) / (CTR[I_alias])
            CTR['day'] = day
            daily_tables.append(CTR)
        history_ctr = pd.concat(daily_tables)

        dump_pickle(history_ctr[['day', 'hour_bin', feature, CTR_alias]], feature_path)  # store

def add_features_hour_ctr(all_data):
    """Left-join the cached per-``hour_bin`` single-column tables onto ``all_data``.

    For each column in the list below, the table stored at
    ``feature_data_path + '_2_5_' + <column> + '_hour_CTR.pkl'`` is merged on
    ``[<column>, 'day', 'hour_bin']``; ``gen_features_hour_ctr()`` is called
    first when the file is missing.

    Returns the merged frame; the frame passed in is not modified.
    """
    single_columns = [
        'user_id',
        'item_id', 'item_brand_id',
        'category2_label', 'category3_label',
        'context_page_id',
        'shop_id',
        'item_sales_level_bin', 'item_price_level_bin', 'item_collected_level_bin', 'item_pv_level_bin',
        'shop_review_num_level_bin', 'shop_review_positive_rate_bin', 'shop_star_level_bin',
        'shop_score_service_bin', 'shop_score_delivery_bin', 'shop_score_description_bin',
    ]
    for column in tqdm(single_columns):
        table_path = '{}_2_5_{}_hour_CTR.pkl'.format(feature_data_path, column)
        if not os.path.exists(table_path):
            gen_features_hour_ctr()
        cached_table = load_pickle(table_path)
        all_data = pd.merge(all_data, cached_table, how='left', on=[column, 'day', 'hour_bin'])
    return all_data

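# user_id 与各特征交叉的历史点击次数、购买次数及转化率 (historical click count, purchase count and trade rate for each user_id × feature pair)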

In [7]:
def gen_features_cross_history_ctr():
    """Build and cache historical count / trade-rate tables for ``user_id`` x feature pairs.

    Loads ``all_data.pkl`` from ``raw_data_path``. For ``user_id`` paired with
    each column in the inner list, a table is written to ``feature_data_path +
    '_2_5_user_id_' + <column> + '_before_history_CTR.pkl'``; pairs whose file
    already exists are skipped.

    Each table has the columns ``['day', 'user_id', <column>,
    'user_id_<column>_history_I', 'user_id_<column>_history_C',
    'user_id_<column>_history_CTR']``. For every target ``day`` in 4..7 only
    rows with ``day <= target - 1`` are used: ``_I`` is the number of rows,
    ``_C`` the number of rows with ``is_trade == 1`` and ``_CTR`` is
    ``_C / _I``, per ``(user_id, <column>)`` combination.
    """
    all_data = load_pickle(raw_data_path+'all_data.pkl')
    for feature in tqdm(['user_id']):
        for feature2 in tqdm(['item_id', 'item_brand_id',
                             'category2_label', 'category3_label',
                             'shop_id', 'item_sales_level_bin', 'item_price_level_bin']):
            feature_path = feature_data_path+'_2_5_'+feature+'_'+feature2+'_before_history_CTR.pkl'  # where the table is cached
            if os.path.exists(feature_path):
                print('found ' + feature_path)
                continue

            print('generating ' + feature_path)
            I_alias = feature+'_'+feature2+'_history_I'  # number of rows (impressions)
            C_alias = feature+'_'+feature2+'_history_C'  # number of rows with is_trade == 1
            CTR_alias = feature+'_'+feature2+'_history_CTR'
            group_keys = [feature, feature2]

            # Collect the per-day tables and concatenate once: DataFrame.append was
            # removed in pandas 2.0 and re-copied the whole frame on every call.
            daily_tables = []
            for day in range(4, 8):
                history_data = all_data[all_data['day'] <= day-1]
                I = history_data.groupby(group_keys).size().reset_index().rename(columns={0: I_alias})
                C = history_data[history_data['is_trade'] == 1].groupby(group_keys).size().reset_index().rename(columns={0: C_alias})
                CTR = pd.merge(I, C, how='left', on=group_keys)
                # Pairs that were never traded are missing from C after the left join.
                CTR[C_alias] = CTR[C_alias].fillna(0)
                CTR[CTR_alias] = CTR[C_alias] / CTR[I_alias]
                CTR['day'] = day
                daily_tables.append(CTR)
            history_ctr = pd.concat(daily_tables)

            dump_pickle(history_ctr[['day', feature, feature2, I_alias, C_alias, CTR_alias]], feature_path)  # store

def add_features_cross_history_ctr(all_data):
    """Left-join the cached ``user_id`` x feature history tables onto ``all_data``.

    For ``user_id`` paired with each column in ``paired_columns``, the table
    stored at ``feature_data_path + '_2_5_user_id_' + <column> +
    '_before_history_CTR.pkl'`` is merged on ``['user_id', <column>, 'day']``;
    ``gen_features_cross_history_ctr()`` is called first when the file is
    missing. After each merge the ``_history_I`` and ``_history_C`` columns
    have their missing values replaced by 0.

    Returns the merged frame; the frame passed in is not modified.
    """
    paired_columns = ['item_id', 'item_brand_id',
                      'category2_label', 'category3_label',
                      'shop_id', 'item_sales_level_bin', 'item_price_level_bin']
    for base_column in tqdm(['user_id',]):
        for paired_column in tqdm(paired_columns):
            pair_name = base_column + '_' + paired_column
            table_path = feature_data_path + '_2_5_' + pair_name + '_before_history_CTR.pkl'
            if not os.path.exists(table_path):
                gen_features_cross_history_ctr()
            cached_table = load_pickle(table_path)
            all_data = pd.merge(all_data, cached_table, how='left',
                                on=[base_column, paired_column, 'day'])
            # Pairs without history get zero counts; the rate column is left as NaN.
            for count_column in (pair_name + '_history_I', pair_name + '_history_C'):
                all_data[count_column] = all_data[count_column].fillna(0)
    return all_data

# 特征交叉

In [None]:
def gen_features_cross_smooth_ctr():
    """Build and cache historical trade-rate tables for user-attribute x id pairs.

    Loads ``all_data.pkl`` from ``raw_data_path``. For every pair of a user
    attribute (``user_gender_id``, ``user_age_level``, ``user_occupation_id``,
    ``user_star_level``) and an id column (``shop_id``, ``item_id``,
    ``item_brand_id``), a table is written to ``feature_data_path + <attr> +
    '_' + <id> + '_smooth_CTR.pkl'``; pairs whose file already exists are
    skipped.

    Each table has the columns
    ``['day', <attr>, <id>, '<attr>_<id>_smooth_CTR']``. For every target
    ``day`` in 4..7 only rows whose ``day`` is strictly smaller than the target
    are used: the rate is the number of rows with ``is_trade == 1`` divided by
    the number of rows, per ``(<attr>, <id>)`` combination.

    Note: despite the ``_smooth_`` naming, the stored value is the plain ratio;
    no Bayesian (or other) smoothing is applied in this function, and the
    intermediate count columns are not stored.
    """
    all_data = load_pickle(raw_data_path+'all_data.pkl')

    for feature in tqdm(['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']):
        for feature2 in tqdm(['shop_id', 'item_id', 'item_brand_id',]):
            feature_path = feature_data_path+feature+'_'+feature2+'_smooth_CTR.pkl'  # where the table is cached
            if os.path.exists(feature_path):
                print('found ' + feature_path)
                continue

            print('generating ' + feature_path)
            I_alias = feature+'_'+feature2+'_smooth_I'  # number of rows (impressions)
            C_alias = feature+'_'+feature2+'_smooth_C'  # number of rows with is_trade == 1
            CTR_alias = feature+'_'+feature2+'_smooth_CTR'
            group_keys = [feature, feature2]

            # Collect the per-day tables and concatenate once: DataFrame.append was
            # removed in pandas 2.0 and re-copied the whole frame on every call.
            daily_tables = []
            for day in range(4, 8):
                history_data = all_data[all_data['day'] < day]
                I = history_data.groupby(group_keys).size().reset_index().rename(columns={0: I_alias})
                C = history_data[history_data['is_trade'] == 1].groupby(group_keys).size().reset_index().rename(columns={0: C_alias})
                CTR = pd.merge(I, C, how='left', on=group_keys)
                # Pairs that were never traded are missing from C after the left join.
                CTR[C_alias] = CTR[C_alias].fillna(0)
                CTR[CTR_alias] = (CTR[C_alias]) / (CTR[I_alias])
                CTR['day'] = day
                daily_tables.append(CTR)
            history_ctr = pd.concat(daily_tables)

            dump_pickle(history_ctr[['day', feature, feature2, CTR_alias]], feature_path)  # store

def add_features_cross_smooth_ctr(all_data):
    """Left-join the cached user-attribute x id trade-rate tables onto ``all_data``.

    For every pair of a user attribute (``user_gender_id``, ``user_age_level``,
    ``user_occupation_id``, ``user_star_level``) and an id column (``shop_id``,
    ``item_id``, ``item_brand_id``), the table stored at ``feature_data_path +
    <attr> + '_' + <id> + '_smooth_CTR.pkl'`` is merged on
    ``[<attr>, <id>, 'day']``; ``gen_features_cross_smooth_ctr()`` is called
    first when the file is missing.

    Returns the merged frame; the frame passed in is not modified.
    """
    attribute_columns = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
    id_columns = ['shop_id', 'item_id', 'item_brand_id',]
    for attribute_column in tqdm(attribute_columns):
        for id_column in tqdm(id_columns):
            table_path = '{}{}_{}_smooth_CTR.pkl'.format(feature_data_path, attribute_column, id_column)
            if not os.path.exists(table_path):
                gen_features_cross_smooth_ctr()
            cached_table = load_pickle(table_path)
            all_data = pd.merge(all_data, cached_table, how='left',
                                on=[attribute_column, id_column, 'day'])
    return all_data

# 测试

In [None]:
if __name__ =='__main__':
    # Load the base frame, then let each add_* step left-join its cached
    # feature tables onto the result of the previous step (same order as the
    # step-by-step reassignment this replaces).
    all_data = (
        load_pickle(raw_data_path + 'all_data_4567.pkl')
        .pipe(add_29_smooth_ctr)
        .pipe(add_29_hour_ctr)
        .pipe(add_features_smooth_ctr)
        .pipe(add_features_hour_ctr)
        .pipe(add_features_cross_history_ctr)
        .pipe(add_features_cross_smooth_ctr)
    )

    # Show which columns the joins produced.
    print(all_data.columns)


  0%|          | 0/17 [00:00<?, ?it/s][A

generating ../features/_2_5_user_id_hour_CTR.pkl



  6%|▌         | 1/17 [00:47<12:32, 47.04s/it][A

generating ../features/_2_5_item_id_hour_CTR.pkl



 12%|█▏        | 2/17 [01:02<07:46, 31.10s/it][A

generating ../features/_2_5_item_brand_id_hour_CTR.pkl



 18%|█▊        | 3/17 [01:14<05:49, 24.96s/it][A

generating ../features/_2_5_category2_label_hour_CTR.pkl



 24%|██▎       | 4/17 [01:28<04:48, 22.18s/it][A

generating ../features/_2_5_category3_label_hour_CTR.pkl



 29%|██▉       | 5/17 [01:42<04:05, 20.44s/it][A

generating ../features/_2_5_context_page_id_hour_CTR.pkl



 35%|███▌      | 6/17 [01:55<03:32, 19.33s/it][A

generating ../features/_2_5_shop_id_hour_CTR.pkl



 41%|████      | 7/17 [02:10<03:06, 18.69s/it][A

generating ../features/_2_5_item_sales_level_bin_hour_CTR.pkl



 47%|████▋     | 8/17 [02:22<02:39, 17.76s/it][A

generating ../features/_2_5_item_price_level_bin_hour_CTR.pkl



 53%|█████▎    | 9/17 [02:31<02:14, 16.87s/it][A

generating ../features/_2_5_item_collected_level_bin_hour_CTR.pkl



 59%|█████▉    | 10/17 [02:41<01:53, 16.15s/it][A

generating ../features/_2_5_item_pv_level_bin_hour_CTR.pkl



 65%|██████▍   | 11/17 [02:51<01:33, 15.56s/it][A

generating ../features/_2_5_shop_review_num_level_bin_hour_CTR.pkl



 71%|███████   | 12/17 [03:00<01:15, 15.06s/it][A

generating ../features/_2_5_shop_review_positive_rate_bin_hour_CTR.pkl



 76%|███████▋  | 13/17 [03:09<00:58, 14.59s/it][A

generating ../features/_2_5_shop_star_level_bin_hour_CTR.pkl



 82%|████████▏ | 14/17 [03:19<00:42, 14.23s/it][A

generating ../features/_2_5_shop_score_service_bin_hour_CTR.pkl



 88%|████████▊ | 15/17 [03:28<00:27, 13.88s/it][A

generating ../features/_2_5_shop_score_delivery_bin_hour_CTR.pkl



 94%|█████████▍| 16/17 [03:37<00:13, 13.59s/it][A

generating ../features/_2_5_shop_score_description_bin_hour_CTR.pkl



100%|██████████| 17/17 [03:46<00:00, 13.33s/it][A
100%|██████████| 17/17 [05:52<00:00, 20.74s/it]  
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/7 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A


  0%|          | 0/7 [00:00<?, ?it/s][A[A[A

generating ../features/_2_5_user_id_item_id_before_history_CTR.pkl





 14%|█▍        | 1/7 [00:45<04:30, 45.15s/it][A[A[A

generating ../features/_2_5_user_id_item_brand_id_before_history_CTR.pkl





 29%|██▊       | 2/7 [01:21<03:23, 40.71s/it][A[A[A

generating ../features/_2_5_user_id_category2_label_before_history_CTR.pkl





 43%|████▎     | 3/7 [01:55<02:34, 38.60s/it][A[A[A

generating ../features/_2_5_user_id_category3_label_before_history_CTR.pkl





 57%|█████▋    | 4/7 [02:25<01:48, 36.31s/it][A[A[A

generating ../features/_2_5_user_id_shop_id_before_history_CTR.pkl





 71%|███████▏  | 5/7 [03:09<01:15, 37.91s/it][A[A[A

generating ../features/_2_5_user_id_item_sales_level_bin_before_history_CTR.pkl





 86%|████████▌ | 6/7 [03:44<00:37, 37.35s/it][A[A[A

generating ../features/_2_5_user_id_item_price_level_bin_before_history_CTR.pkl





100%|██████████| 7/7 [04:16<00:00, 36.61s/it][A[A[A


[A[A[A

100%|██████████| 1/1 [04:16<00:00, 256.31s/it][A[A

[A[A
 14%|█▍        | 1/7 [04:54<29:24, 294.11s/it][A
 29%|██▊       | 2/7 [05:10<12:57, 155.49s/it][A
 43%|████▎     | 3/7 [05:24<07:13, 108.33s/it][A

In [11]:
# Show every row of the summary below instead of a truncated view.
pd.set_option('display.max_rows', None)
# Restrict to the rows of days 6 and 7, then count missing values per column.
data = all_data[all_data.day.isin([6, 7])]
data.isna().sum()

index                                                            0
instance_id                                                      0
item_id                                                          0
item_category_list                                               0
item_property_list                                               0
item_brand_id                                                    0
item_city_id                                                     0
item_price_level                                                 0
item_sales_level                                                 0
item_collected_level                                             0
item_pv_level                                                    0
user_id                                                          0
user_gender_id                                                   0
user_age_level                                                   0
user_occupation_id                                            