In [1]:
import os
import pickle
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import load_pickle, dump_pickle, raw_data_path, feature_data_path

In [2]:
all_data = load_pickle(raw_data_path + 'all_data.pkl')

In [3]:
all_data.day.value_counts()

7     2806831
6     1934443
1     1340697
2     1289476
3     1236828
5     1200219
31    1195557
4     1157641
Name: day, dtype: int64

In [5]:
all_data[all_data.day==4].hour.value_counts()

21    103995
20     91919
22     90633
19     70011
13     67330
14     66021
15     65538
10     62396
12     59831
11     58717
16     58113
23     54176
9      53518
18     50563
17     50180
8      41622
7      31768
0      26812
6      18040
1      11836
5       7203
2       6985
3       5349
4       5085
Name: hour, dtype: int64

In [3]:
all_data[all_data.day==6].hour.value_counts()

22    230139
21    211086
23    200513
20    171831
19    126378
15     90094
14     89363
13     89347
18     87657
16     84107
10     79817
12     78254
17     77445
11     75730
9      66615
8      50575
7      34736
0      28813
6      21230
1      12988
5       8483
2       7749
3       5801
4       5692
Name: hour, dtype: int64

In [4]:
all_data[all_data.day==7].hour.value_counts()

0     176798
10    142795
9     135256
11    126942
8     125187
7     106500
1      96070
21     61354
22     59170
20     55669
6      55359
19     43623
2      43315
23     43206
14     40184
13     39852
15     39372
16     37254
12     35608
18     32412
17     32184
3      25404
5      24391
4      19158
Name: hour, dtype: int64

In [6]:
def extract_ctr(data, feature, alias):
    '''统计给定数据的转化率
    
    '''
    query_cnt = data.groupby([feature]).size().reset_index().rename(columns={0: 'query_cnt'})
    conversion_cnt = data[data['is_trade'] == 1].groupby([feature]).size().reset_index().rename(columns={0: 'conversion_cnt'})
    ctr = pd.merge(query_cnt, conversion_cnt, how='left', on=[feature])
    ctr[['conversion_cnt']] = ctr[['conversion_cnt']].fillna(0)
    ctr[alias] = ctr['conversion_cnt'] / ctr['query_cnt']
    return ctr

In [7]:
extract_ctr(all_data[all_data.day==4], feature='hour', alias='feature_ctr')

Unnamed: 0,hour,query_cnt,conversion_cnt,feature_ctr
0,0,26812,279,0.010406
1,1,11836,113,0.009547
2,2,6985,76,0.01088
3,3,5349,51,0.009534
4,4,5085,50,0.009833
5,5,7203,73,0.010135
6,6,18040,239,0.013248
7,7,31768,483,0.015204
8,8,41622,690,0.016578
9,9,53518,818,0.015285


In [8]:
extract_ctr(all_data[all_data.day==6], feature='hour', alias='feature_ctr')

Unnamed: 0,hour,query_cnt,conversion_cnt,feature_ctr
0,0,28813,200,0.006941
1,1,12988,94,0.007237
2,2,7749,61,0.007872
3,3,5801,26,0.004482
4,4,5692,52,0.009136
5,5,8483,65,0.007662
6,6,21230,173,0.008149
7,7,34736,349,0.010047
8,8,50575,516,0.010203
9,9,66615,610,0.009157


In [9]:
extract_ctr(all_data[all_data.day==7], feature='hour', alias='feature_ctr')

Unnamed: 0,hour,query_cnt,conversion_cnt,feature_ctr
0,0,176798,8860.0,0.050114
1,1,96070,4331.0,0.045082
2,2,43315,1848.0,0.042664
3,3,25404,1098.0,0.043222
4,4,19158,881.0,0.045986
5,5,24391,1104.0,0.045263
6,6,55359,2458.0,0.044401
7,7,106500,4906.0,0.046066
8,8,125187,6001.0,0.047936
9,9,135256,6390.0,0.047244


In [3]:
all_data = load_pickle(raw_data_path + 'all_data.pkl')
extract_ctr(all_data, feature='day', alias='feature_ctr')

Unnamed: 0,day,query_cnt,conversion_cnt,feature_ctr
0,1,1340697,18246,0.013609
1,2,1289476,17137,0.01329
2,3,1236828,15309,0.012378
3,4,1157641,13159,0.011367
4,5,1200219,11729,0.009772
5,6,1934443,10635,0.005498
6,7,1597063,49646,0.031086
7,31,1195557,15349,0.012838


In [7]:
%time data = load_pickle(feature_data_path + 'all_data_all_features.pkl')

In [14]:
extract_ctr(data, feature='user_id_day_C', alias='feature_ctr')

Unnamed: 0,user_id_day_C,query_cnt,conversion_cnt,feature_ctr
0,0.0,3524309,60064.0,0.017043
1,1.0,6833,196.0,0.028684
2,2.0,339,20.0,0.058997
3,3.0,24,1.0,0.041667
4,4.0,1,0.0,0.0


In [17]:
extract_ctr(data, feature='user_id_click_day', alias='feature_ctr')

Unnamed: 0,user_id_click_day,query_cnt,conversion_cnt,feature_ctr
0,1,226743,14483.0,0.063874
1,2,209663,10926.0,0.052112
2,3,165010,7672.0,0.046494
3,4,121708,5174.0,0.042512
4,5,89350,3440.0,0.0385
5,6,65744,2238.0,0.034041
6,7,46971,1639.0,0.034894
7,8,36068,1127.0,0.031247
8,9,26335,725.0,0.02753
9,10,19585,545.0,0.027827


In [9]:
pd.set_option('display.max_rows', None)
data.isnull().sum()

index                                                      0
instance_id                                                0
item_id                                                    0
item_brand_id                                              0
item_city_id                                               0
item_price_level                                           0
item_sales_level                                           0
item_collected_level                                       0
item_pv_level                                              0
user_id                                                    0
user_gender_id                                             0
user_age_level                                             0
user_occupation_id                                         0
user_star_level                                            0
context_id                                                 0
context_timestamp                                          0
context_page_id         

In [3]:
all_data = load_pickle(raw_data_path + 'all_data.pkl')
data = all_data[(all_data.is_trade != -1) & (all_data.day == 7)]
all_data.shape, data.shape

((10951924, 32), (1077175, 32))

In [5]:
def unique(row):
    return len(row['item_price_level'].unique())

In [6]:
price_size = all_data.groupby(['item_id']).apply(unique).reset_index().rename(columns={0: 'price_size'})
data_price = pd.merge(data, price_size, 'left', on=['item_id'])

In [7]:
extract_ctr(data_price, feature='price_size', alias='feature_ctr')

Unnamed: 0,price_size,query_cnt,conversion_cnt,feature_ctr
0,1,1071466,49407.0,0.046112
1,2,5708,239.0,0.041871
2,3,1,0.0,0.0


In [8]:
def unique(row):
    return len(row['item_sales_level'].unique())
sales_size = all_data.groupby(['item_id']).apply(unique).reset_index().rename(columns={0: 'sales_size'})
data_sales = pd.merge(data, sales_size, 'left', on=['item_id'])
extract_ctr(data_sales, feature='sales_size', alias='feature_ctr')

Unnamed: 0,sales_size,query_cnt,conversion_cnt,feature_ctr
0,1,302618,15060,0.049766
1,2,559614,26754,0.047808
2,3,169076,6221,0.036794
3,4,33700,1227,0.036409
4,5,10165,327,0.032169
5,6,1851,53,0.028633
6,7,151,4,0.02649


In [9]:
def unique(row):
    return len(row['item_collected_level'].unique())
collected_size = all_data.groupby(['item_id']).apply(unique).reset_index().rename(columns={0: 'collected_size'})
data_collected = pd.merge(data, collected_size, 'left', on=['item_id'])
extract_ctr(data_collected, feature='collected_size', alias='feature_ctr')

Unnamed: 0,collected_size,query_cnt,conversion_cnt,feature_ctr
0,1,771516,37217,0.048239
1,2,247947,10341,0.041706
2,3,36663,1414,0.038567
3,4,11949,421,0.035233
4,5,4680,133,0.028419
5,6,4420,120,0.027149


In [None]:
all_data_path = feature_data_path + 'all_data_all_features.pkl'
data = load_pickle(all_data_path)
data = data[(data.is_trade != -1) & (data.day == 7)]
data.shape

In [22]:
a[a['size']!=1]

Unnamed: 0,item_id,size
51,5742694433481645,2
575,59598076344549667,2
1100,115983701730257574,2
1370,144619254694547538,2
1953,206220471182824436,2
2141,227216806795434267,2
2144,227333699869651275,2
2540,268077257919580225,2
3134,330996138776025063,2
3318,349172856305719432,2


In [8]:
extract_ctr(data[(data.is_trade != -1) & (data.day == 7)], feature='user_click_rank_day', alias='feature_ctr')

Unnamed: 0,user_click_rank_day,query_cnt,conversion_cnt,feature_ctr
0,0,226743,14483,0.063874
1,1,260921,9153,0.03508
2,2,378128,12306,0.032545
3,3,211383,13704,0.06483


In [11]:
extract_ctr(data[(data.is_trade != -1) & (data.day == 7)], feature='user_click_rank_global', alias='feature_ctr')

Unnamed: 0,user_click_rank_global,query_cnt,conversion_cnt,feature_ctr
0,0,188590,11812,0.062633
1,1,198549,7038,0.035447
2,2,440500,14421,0.032738
3,3,249536,16375,0.065622


In [13]:
data[(data.is_trade != -1) & (data.day == 7)]['user_occupation_id_category3_label_smooth_CTR']

1488847    0.014568
1488848    0.014724
1488849    0.010830
1488850    0.004695
1488851    0.010830
1488852    0.010830
1488853    0.010830
1488854    0.010830
1488855    0.021995
1488856    0.010830
1488857    0.010830
1488858    0.003278
1488859    0.010830
1488860    0.010272
1488861    0.010830
1488862    0.015002
1488863    0.006548
1488864    0.010272
1488865    0.002993
1488866    0.010830
1488867    0.010830
1488868    0.010272
1488869    0.010830
1488870    0.002993
1488871    0.010272
1488872    0.010830
1488873    0.010830
1488874    0.014448
1488875    0.003278
1488876    0.010272
             ...   
2565992    0.014704
2565993    0.010830
2565994    0.004695
2565995    0.012651
2565996    0.010830
2565997    0.000483
2565998    0.010272
2565999    0.014042
2566000    0.003145
2566001    0.010272
2566002    0.010830
2566003    0.005288
2566004    0.010830
2566005    0.010272
2566006    0.009669
2566007    0.005288
2566008    0.010830
2566009    0.009669
2566010    0.010830


In [12]:
extract_ctr(data, feature='user_click_rank_global', alias='feature_ctr')

Unnamed: 0,user_click_rank_global,query_cnt,conversion_cnt,feature_ctr
0,0,662053,14553,0.021982
1,1,734105,8978,0.01223
2,2,1401243,17341,0.012375
3,3,734105,19409,0.026439


In [11]:
extract_ctr(all_data[all_data.day==6], feature='hour', alias='feature_ctr')

Unnamed: 0,hour,query_cnt,conversion_cnt,feature_ctr
0,0,28813,200,0.006941
1,1,12988,94,0.007237
2,2,7749,61,0.007872
3,3,5801,26,0.004482
4,4,5692,52,0.009136
5,5,8483,65,0.007662
6,6,21230,173,0.008149
7,7,34736,349,0.010047
8,8,50575,516,0.010203
9,9,66615,610,0.009157


In [12]:
extract_ctr(all_data[all_data.day==7], feature='hour', alias='feature_ctr')

Unnamed: 0,hour,query_cnt,conversion_cnt,feature_ctr
0,0,176798,8860.0,0.050114
1,1,96070,4331.0,0.045082
2,2,43315,1848.0,0.042664
3,3,25404,1098.0,0.043222
4,4,19158,881.0,0.045986
5,5,24391,1104.0,0.045263
6,6,55359,2458.0,0.044401
7,7,106500,4906.0,0.046066
8,8,125187,6001.0,0.047936
9,9,135256,6390.0,0.047244


In [3]:
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

In [4]:
data = all_data[(all_data.is_trade != -1) & (all_data.day == 7)]

In [30]:

extract_ctr(data, feature='user_shop_id_click_rank_day', alias='feature_ctr')

Unnamed: 0,user_shop_id_click_rank_day,query_cnt,conversion_cnt,feature_ctr
0,0,940631,43791,0.046555
1,1,65345,986,0.015089
2,2,11974,264,0.022048
3,3,59225,4605,0.077754


In [5]:
extract_ctr(data, feature='item_property_topic', alias='feature_ctr')

Unnamed: 0,item_property_topic,query_cnt,conversion_cnt,feature_ctr
0,0,72666,4596,0.063248
1,1,68220,2647,0.038801
2,2,70129,2041,0.029104
3,3,66599,1987,0.029835
4,4,53940,2259,0.04188
5,5,58868,1629,0.027672
6,6,70015,2687,0.038377
7,7,40626,2407,0.059248
8,8,61678,3457,0.056049
9,9,46889,2771,0.059097


In [6]:
data = all_data[all_data.is_trade != -1]
data = data[data.day == 7]

extract_ctr(data, feature='hour', alias='feature_ctr')

Unnamed: 0,hour,query_cnt,conversion_cnt,feature_ctr
0,0,176798,8860,0.050114
1,1,96070,4331,0.045082
2,2,43315,1848,0.042664
3,3,25404,1098,0.043222
4,4,19158,881,0.045986
5,5,24391,1104,0.045263
6,6,55359,2458,0.044401
7,7,106500,4906,0.046066
8,8,125187,6001,0.047936
9,9,135256,6390,0.047244


In [9]:
test_data = all_data[all_data.is_trade == -1]
extract_ctr(test_data, feature='hour', alias='feature_ctr')

Unnamed: 0,hour,query_cnt,conversion_cnt,feature_ctr
0,12,35608,0.0,0.0
1,13,39852,0.0,0.0
2,14,40184,0.0,0.0
3,15,39372,0.0,0.0
4,16,37254,0.0,0.0
5,17,32184,0.0,0.0
6,18,32412,0.0,0.0
7,19,43623,0.0,0.0
8,20,55669,0.0,0.0
9,21,61354,0.0,0.0
