In [1]:
import os
import pickle
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import load_pickle, dump_pickle, raw_data_path, feature_data_path
from utils import extract_ctr

In [3]:
def get_before_2min(s):
    """Count the timestamps that fall in the 120 units before the current one.

    ``s`` has the form ``"<now>-<t1>:<t2>:..."``: the current timestamp, a
    dash, then a colon-separated list of integer timestamps.

    Returns the number of listed timestamps ``t`` with ``now - 120 <= t < now``
    (lower bound inclusive, ``now`` itself excluded).
    """
    now_text, joined = s.split('-')
    now = int(now_text)
    earliest = now - 120
    return sum(1 for stamp in joined.split(':') if earliest <= int(stamp) < now)
def get_before_15min(s):
    """Count the timestamps that fall in the 1000 units before the current one.

    ``s`` has the form ``"<now>-<t1>:<t2>:..."``: the current timestamp, a
    dash, then a colon-separated list of integer timestamps.

    Returns the number of listed timestamps ``t`` with ``now - 1000 <= t < now``.
    Note that the window is 1000 units wide, not 900, despite the "15min" in
    the name.
    """
    now_text, joined = s.split('-')
    now = int(now_text)
    earliest = now - 1000
    hits = [stamp for stamp in joined.split(':') if earliest <= int(stamp) < now]
    return len(hits)

def get_before_1hour(s):
    """Count the timestamps that fall in the 3600 units before the current one.

    ``s`` has the form ``"<now>-<t1>:<t2>:..."``: the current timestamp, a
    dash, then a colon-separated list of integer timestamps.

    Returns the number of listed timestamps ``t`` with ``now - 3600 <= t < now``
    (lower bound inclusive, ``now`` itself excluded).
    """
    now_text, joined = s.split('-')
    now = int(now_text)
    earliest = now - 3600

    matches = 0
    for stamp in map(int, joined.split(':')):
        if earliest <= stamp < now:
            matches += 1
    return matches

def gen_user_feature_before():
    """Build and cache, per feature, counts of a user's recent matching rows.

    The raw data set is loaded from ``raw_data_path + 'all_data_4567.pkl'``.
    For every feature in ``feature_list`` each row receives three counts: the
    number of rows sharing its ``user_id`` and feature value whose
    ``context_timestamp`` lies in the 120, 1000 and 3600 timestamp units
    before the row's own timestamp (see ``get_before_2min``,
    ``get_before_15min`` and ``get_before_1hour``).

    The three columns are pickled to
    ``feature_data_path + '_2_10_user_' + feature + '_before.pkl'``. Features
    whose cache file already exists are skipped. Returns ``None``.
    """
    all_data = load_pickle(raw_data_path + 'all_data_4567.pkl')

    feature_list = ['category2_label', 'category3_label',
                    'shop_id', 'item_id', 'item_brand_id',
                    'item_sales_level_bin', 'item_price_level_bin',
                    'item_property_topic_k_15',
                    ]

    for feature in tqdm(feature_list):

        feature_path = feature_data_path + '_2_10_user_' + feature + '_before.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)

        before_2min = 'user_' + feature + '_before_2min'
        before_15min = 'user_' + feature + '_before_15min'
        before_1hour = 'user_' + feature + '_before_1hour'

        group_keys = ['user_id', feature]
        # Read from the frame loaded above. The previous code referenced a
        # global named `data`, which only exists if another cell created it.
        events = all_data[group_keys + ['context_timestamp']]

        # One row per (user, feature value) holding every timestamp of that
        # group joined with ':'. `assign` builds a new frame, so nothing is
        # written into a slice of `all_data` (no SettingWithCopyWarning).
        t1 = (
            events
            .assign(context_timestamp=events['context_timestamp'].astype('str'))
            .groupby(group_keys)['context_timestamp']
            .agg(lambda x: ':'.join(x))
            .reset_index()
            .rename(columns={'context_timestamp': 'times'})
        )

        # Attach the group's timestamp list back onto every row. A left merge
        # against unique keys keeps the row count and order of `events`.
        t2 = pd.merge(events, t1, on=group_keys, how='left')
        # The merge resets the index to 0..n-1; restore the source index so
        # the index-aligned `DataFrame.join` in the consumer matches rows.
        t2.index = all_data.index
        t2['time_now'] = t2['context_timestamp'].astype('str') + '-' + t2['times']

        t2[before_2min] = t2['time_now'].apply(get_before_2min)
        t2[before_15min] = t2['time_now'].apply(get_before_15min)
        t2[before_1hour] = t2['time_now'].apply(get_before_1hour)

        t3 = t2[[before_2min, before_15min, before_1hour]]

        dump_pickle(t3, feature_path)


def add_user_feature_before(data):
    """Join the cached "before" window counts onto ``data`` and return the result.

    For each feature the cache file
    ``feature_data_path + '_2_10_user_' + feature + '_before.pkl'`` is loaded
    and joined onto ``data`` by index. If the file does not exist,
    ``gen_user_feature_before()`` is called first to create it.
    """
    feature_list = [
        'category2_label', 'category3_label',
        'shop_id', 'item_id', 'item_brand_id',
        'item_sales_level_bin', 'item_price_level_bin',
        'item_property_topic_k_15',
    ]

    for name in tqdm(feature_list):
        cache_file = feature_data_path + '_2_10_user_' + name + '_before.pkl'

        # Build the cache on demand before trying to read it.
        cache_missing = not os.path.exists(cache_file)
        if cache_missing:
            gen_user_feature_before()

        window_counts = load_pickle(cache_file)
        data = data.join(window_counts)

    return data

In [2]:
def get_feature_2min(s):
    """Count the timestamps that fall in the 120 units after the current one.

    ``s`` has the form ``"<now>-<t1>:<t2>:..."``: the current timestamp, a
    dash, then a colon-separated list of integer timestamps.

    Returns the number of listed timestamps ``t`` with ``now < t <= now + 120``
    (``now`` itself excluded, upper bound inclusive).
    """
    now_text, joined = s.split('-')
    now = int(now_text)
    latest = now + 120
    return sum(1 for stamp in joined.split(':') if now < int(stamp) <= latest)
def get_feature_15min(s):
    time_now,times = s.split('-')
    time_one_hour_after = int(time_now) + 1000
    times = times.split(':')
    
    count = 0
    for t in times:
        if (int(t)>int(time_now))&(int(t)<=int(time_one_hour_after)):
            count = count + 1
    return count


In [4]:
def gen_user_feature_future():
    """Build and cache, per feature, counts of a user's upcoming matching rows.

    The raw data set is loaded from ``raw_data_path + 'all_data_4567.pkl'``.
    For every feature in ``feature_list`` each row receives two counts: the
    number of rows sharing its ``user_id`` and feature value whose
    ``context_timestamp`` lies in the 120 and 1000 timestamp units after the
    row's own timestamp (see ``get_feature_2min`` and ``get_feature_15min``).

    The two columns are pickled to
    ``feature_data_path + '_2_10_user_' + feature + '_future.pkl'``. Features
    whose cache file already exists are skipped. Returns ``None``.
    """
    all_data = load_pickle(raw_data_path + 'all_data_4567.pkl')

    feature_list = ['category2_label', 'category3_label',
                    'shop_id', 'item_id', 'item_brand_id',
                    'item_sales_level_bin', 'item_price_level_bin',
                    'item_property_topic_k_15',
                    ]

    for feature in tqdm(feature_list):

        feature_path = feature_data_path + '_2_10_user_' + feature + '_future.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)

        future_2min = 'user_' + feature + '_future_2min'
        future_15min = 'user_' + feature + '_future_15min'

        group_keys = ['user_id', feature]
        # Read from the frame loaded above. The previous code referenced a
        # global named `data`, which only exists if another cell created it.
        events = all_data[group_keys + ['context_timestamp']]

        # One row per (user, feature value) holding every timestamp of that
        # group joined with ':'. `assign` builds a new frame, so nothing is
        # written into a slice of `all_data` (no SettingWithCopyWarning).
        t1 = (
            events
            .assign(context_timestamp=events['context_timestamp'].astype('str'))
            .groupby(group_keys)['context_timestamp']
            .agg(lambda x: ':'.join(x))
            .reset_index()
            .rename(columns={'context_timestamp': 'times'})
        )

        # Attach the group's timestamp list back onto every row. A left merge
        # against unique keys keeps the row count and order of `events`.
        t2 = pd.merge(events, t1, on=group_keys, how='left')
        # The merge resets the index to 0..n-1; restore the source index so
        # the index-aligned `DataFrame.join` in the consumer matches rows.
        t2.index = all_data.index
        t2['time_now'] = t2['context_timestamp'].astype('str') + '-' + t2['times']

        t2[future_2min] = t2['time_now'].apply(get_feature_2min)
        t2[future_15min] = t2['time_now'].apply(get_feature_15min)

        t3 = t2[[future_2min, future_15min]]

        dump_pickle(t3, feature_path)


def add_user_feature_future(data):
    """Join the cached "future" window counts onto ``data`` and return the result.

    For each feature the cache file
    ``feature_data_path + '_2_10_user_' + feature + '_future.pkl'`` is loaded
    and joined onto ``data`` by index. If the file does not exist,
    ``gen_user_feature_future()`` is called first to create it.
    """
    feature_list = [
        'category2_label', 'category3_label',
        'shop_id', 'item_id', 'item_brand_id',
        'item_sales_level_bin', 'item_price_level_bin',
        'item_property_topic_k_15',
    ]

    for name in tqdm(feature_list):
        cache_file = feature_data_path + '_2_10_user_' + name + '_future.pkl'

        # Build the cache on demand before trying to read it.
        cache_missing = not os.path.exists(cache_file)
        if cache_missing:
            gen_user_feature_future()

        window_counts = load_pickle(cache_file)
        data = data.join(window_counts)

    return data

In [4]:
if __name__ =='__main__':
    # Entry point: load the raw data set and attach the cached "before"
    # window-count columns to it. The result is kept in the global `data`.
    
    data = load_pickle(raw_data_path + 'all_data_4567.pkl')   
    # NOTE: the "future" window features are disabled here; the call below is
    # commented out, so no `*_future_*` columns are added to `data`.
#     data = add_user_feature_future(data)
    data = add_user_feature_before(data)
    
    # Show the resulting column names to confirm the new features were joined.
    print(data.columns)


  0%|          | 0/8 [00:00<?, ?it/s][A

generating ../features/_2_10_user_category2_label_before.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value

 12%|█▎        | 1/8 [02:59<20:55, 179.36s/it][A

generating ../features/_2_10_user_category3_label_before.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value

 25%|██▌       | 2/8 [05:58<17:54, 179.01s/it][A

generating ../features/_2_10_user_shop_id_before.pkl



 38%|███▊      | 3/8 [08:41<14:28, 173.69s/it][A

generating ../features/_2_10_user_item_id_before.pkl



 50%|█████     | 4/8 [11:26<11:26, 171.67s/it][A

generating ../features/_2_10_user_item_brand_id_before.pkl



 62%|██████▎   | 5/8 [14:10<08:30, 170.17s/it][A

generating ../features/_2_10_user_item_sales_level_bin_before.pkl



 75%|███████▌  | 6/8 [16:47<05:35, 167.84s/it][A

generating ../features/_2_10_user_item_price_level_bin_before.pkl



 88%|████████▊ | 7/8 [19:26<02:46, 166.60s/it][A

generating ../features/_2_10_user_item_property_topic_k_15_before.pkl



100%|██████████| 8/8 [22:04<00:00, 165.61s/it][A
100%|██████████| 8/8 [22:49<00:00, 171.19s/it]   

Index(['index', 'instance_id', 'item_id', 'item_category_list',
       'item_property_list', 'item_brand_id', 'item_city_id',
       'item_price_level', 'item_sales_level', 'item_collected_level',
       ...
       'user_item_brand_id_before_1hour',
       'user_item_sales_level_bin_before_2min',
       'user_item_sales_level_bin_before_15min',
       'user_item_sales_level_bin_before_1hour',
       'user_item_price_level_bin_before_2min',
       'user_item_price_level_bin_before_15min',
       'user_item_price_level_bin_before_1hour',
       'user_item_property_topic_k_15_before_2min',
       'user_item_property_topic_k_15_before_15min',
       'user_item_property_topic_k_15_before_1hour'],
      dtype='object', length=111)





In [14]:
# Names of the columns stored with dtype "object". Despite the variable name,
# this selects object columns, not float columns.
float_cols = [c for c in data if data[c].dtype == "object"]
# `np.int` was only an alias for the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
data['category2_label'] = data['category2_label'].astype(int)

In [5]:
# Spot check: one user's rows with the item-level "before 15min" count.
data.loc[
    data.user_id == 50153023443529,
    ['context_timestamp', 'item_brand_id', 'user_' + 'item_id' + '_before_15min']
]

Unnamed: 0,context_timestamp,item_brand_id,user_item_id_before_15min
190276,1536240332,5510998113992316938,0
265603,1536239858,8764113826665518348,1
475314,1536241480,7147251880558104750,0
543278,1536239177,7147251880558104750,0
683293,1536239046,8764113826665518348,0
832258,1536240799,6030170612722734811,0
1010442,1536239046,1326455302495217018,0
1369495,1536240561,4990550054026668580,0
1389183,1536239358,5399897164205922814,0
1429931,1536239269,190377194252611445,0


In [11]:
# Spot check: one user's rows with the item-level "future 15min" count.
# This column exists only after add_user_feature_future has been applied to `data`.
data.loc[
    data.user_id == 50153023443529,
    ['context_timestamp', 'item_brand_id', 'user_' + 'item_id' + '_future_15min']
]

Unnamed: 0,context_timestamp,item_brand_id,user_item_id_future_15min
190276,1536240332,5510998113992316938,0
265603,1536239858,8764113826665518348,0
475314,1536241480,7147251880558104750,0
543278,1536239177,7147251880558104750,0
683293,1536239046,8764113826665518348,1
832258,1536240799,6030170612722734811,0
1010442,1536239046,1326455302495217018,0
1369495,1536240561,4990550054026668580,0
1389183,1536239358,5399897164205922814,0
1429931,1536239269,190377194252611445,0


In [6]:
# Lift the row limit so the per-column summary below is not truncated.
# This changes the display option for the rest of the session.
pd.set_option('display.max_rows', None)
# Number of missing values in each column.
data.isna().sum()

index                                           0
instance_id                                     0
item_id                                         0
item_category_list                              0
item_property_list                              0
item_brand_id                                   0
item_city_id                                    0
item_price_level                                0
item_sales_level                                0
item_collected_level                            0
item_pv_level                                   0
user_id                                         0
user_gender_id                                  0
user_age_level                                  0
user_occupation_id                              0
user_star_level                                 0
context_id                                      0
context_timestamp                               0
context_page_id                                 0
predict_category_property                       0


In [13]:
# Number of rows per value of the `day` column.
data['day'].value_counts()

6    1934443
7    1597063
5    1200219
4    1157641
Name: day, dtype: int64