In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import time
from utils import raw_data_path, feature_data_path, result_path, cache_pkl_path, dump_pickle, load_pickle, model_path
from sklearn.preprocessing import LabelEncoder
import gc

# 时间戳转字符串


def timestamp_datetime(value):
    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(value)
    dt = time.strftime(format, value)
    return dt


def extract_date(data):
    data['time'] = data.context_timestamp.apply(timestamp_datetime)
    data['day'] = data.time.apply(lambda x: int(x[8:10]))
    data['hour'] = data.time.apply(lambda x: int(x[11:13]))
    data['minute'] = data.time.apply(lambda x: int(x[14:16]))
    del data['time']
    return data


def gen_all_data():

    df_train = pd.read_csv(raw_data_path + "round2_train.txt", sep=' ')
    df_test_a = pd.read_csv(
        raw_data_path + "round2_ijcai_18_test_a_20180425.txt", sep=' ')
    df_test_b = pd.read_csv(
        raw_data_path + "round2_ijcai_18_test_b_20180510.txt", sep=' ')

    df_train.drop_duplicates(inplace=True)
    df_test_a['is_trade'] = -1
    df_test_b['is_trade'] = -2

    all_data = pd.concat([df_train, df_test_a, df_test_b], ignore_index=True)
    all_data = extract_date(all_data)

    all_data['category2_label'] = all_data['item_category_list'].apply(lambda x: str(x).split(";")[1])
    all_data['category3_label'] = all_data['item_category_list'].apply(lambda x: str(str(x).split(';')[2]) if len(str(x).split(';')) > 2 else '-1')
 
    all_data[['shop_review_positive_rate','shop_score_service','shop_score_delivery','shop_score_description']]=all_data[['shop_review_positive_rate','shop_score_service','shop_score_delivery','shop_score_description']].astype(np.float32)
    
    dump_pickle(all_data, raw_data_path + 'all_data.pkl')
    
    print('gen all data finished.')


## 生成 all data

In [2]:
gen_all_data()

gen all data finished.


## all data 添加交叉特征 

In [5]:
from tqdm import tqdm


def add_cross_feature(data, feature_1, feature_2):
    comb_index = data[[feature_1, feature_2]].drop_duplicates()
    comb_index[feature_1 + '_' + feature_2] = np.arange(comb_index.shape[0])
    data = pd.merge(data, comb_index, 'left', on=[feature_1, feature_2])

    return data


def cut_features(data):

    features_to_cut = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level',
                       'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level',
                       'shop_score_service', 'shop_score_delivery', 'shop_score_description']

    for feature in features_to_cut:
        data[feature] = pd.qcut(data[feature], q=10, duplicates='drop')

    data['context_page_id'] = pd.qcut(
        data.context_page_id, q=5, duplicates='drop')
    
    data['hour'] = pd.cut(data.hour, bins=12)

    data.user_age_level.replace(
        to_replace=[-1, ], value=data.user_age_level.mean(), inplace=True)
    data['user_age_level'] = pd.cut(data.user_age_level, bins=5)

    data.user_star_level.replace(
        to_replace=[-1, ], value=data.user_star_level.mean(), inplace=True)
    data['user_star_level'] = pd.cut(data.user_star_level, bins=5)

    return data


def gen_feature_interaction_2_order():
    '''生成交叉特征，2 order

    文件名：feature_interaction_2_order.pkl

    '''
    data = load_pickle(raw_data_path + 'all_data.pkl')
    data = cut_features(data)

    cross_features = list()

    feature_path = feature_data_path + 'feature_interaction_2_order_all_data.pkl'
    print('generating '+feature_path)

#     user与各种特征交叉
    for feature_1 in (['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']):
        for feature_2 in tqdm(['item_price_level', 'item_sales_level',
                               'shop_star_level', 'shop_review_num_level', 'shop_review_positive_rate',
                               'category2_label', 'category3_label',
                               'context_page_id', 'hour'
                               ]):

            data = add_cross_feature(data, feature_1, feature_2)
            cross_features.append(feature_1 + '_' + feature_2)
            
    
    dump_pickle(data[cross_features], feature_path, protocol=4)
    

#     user自身特征交叉
    user_features = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
    for i, feature_1 in enumerate(user_features):
        for j, feature_2 in enumerate(user_features):
            if i < j:
                data = add_cross_feature(data, feature_1, feature_2)
                cross_features.append(feature_1 + '_' + feature_2)

    data = data[cross_features]

    dump_pickle(data, feature_path, protocol=4)


def add_feature_interaction_2_order(data):

    feature_path = feature_data_path + 'feature_interaction_2_order_all_data.pkl'
    if not os.path.exists(feature_path):
        gen_feature_interaction_2_order()

    cross_features = load_pickle(feature_path)
    data = pd.concat([data, cross_features], axis=1)

    return data

In [6]:
all_data = load_pickle(raw_data_path + 'all_data.pkl')
gen_feature_interaction_2_order()
all_data = add_feature_interaction_2_order(all_data)
dump_pickle(all_data, raw_data_path + 'all_data.pkl', protocol=4)

  0%|          | 0/9 [00:00<?, ?it/s]

generating ../features/feature_interaction_2_order_all_data.pkl


100%|██████████| 9/9 [02:50<00:00, 18.94s/it]
100%|██████████| 9/9 [03:46<00:00, 25.19s/it]
100%|██████████| 9/9 [04:32<00:00, 30.33s/it]
100%|██████████| 9/9 [05:20<00:00, 35.58s/it]


## 分箱

In [7]:
all_data = load_pickle(raw_data_path + 'all_data.pkl')

features_to_cut = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level',
                       'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level',
                       'shop_score_service', 'shop_score_delivery', 'shop_score_description',]
    
for feature in features_to_cut:
    all_data[feature + '_bin'] = pd.qcut(all_data[feature], q=10, duplicates='drop')
    
all_data['hour_bin'] = pd.cut(all_data.hour, bins=12)
    
dump_pickle(all_data, raw_data_path + 'all_data.pkl', protocol=4)

from utils import extract_ctr

extract_ctr(all_data[(all_data.is_trade != -1)], feature='hour_bin', alias='feature_ctr')

Unnamed: 0,hour_bin,query_cnt,conversion_cnt,feature_ctr
0,"(-0.023, 1.917]",551094,15925,0.028897
1,"(1.917, 3.833]",155950,3843,0.024643
2,"(3.833, 5.75]",133025,2947,0.022154
3,"(5.75, 7.667]",537133,12618,0.023491
4,"(7.667, 9.583]",966983,23266,0.02406
5,"(9.583, 11.5]",1180532,25009,0.021185
6,"(11.5, 13.417]",1131392,12298,0.01087
7,"(13.417, 15.333]",1211810,11728,0.009678
8,"(15.333, 17.25]",1040098,9703,0.009329
9,"(17.25, 19.167]",1199960,10969,0.009141


## 生成 all data 4567

In [10]:
def gen_4567_data():

    all_data = load_pickle(raw_data_path + 'all_data.pkl')
    data_4567 = all_data[(all_data.day >= 4) & (all_data.day <= 7)]
    data_4567 = data_4567.reset_index()
    dump_pickle(data_4567, raw_data_path + 'all_data_4567.pkl')


gen_4567_data()

## 添加lda topic

In [2]:
import lda
lda_model_k_15 = load_pickle(model_path + '4567_item_property_lda_model_k_15.pkl')

data = load_pickle(raw_data_path + 'all_data_4567.pkl')

topic_vector_k_15 = lda_model_k_15.doc_topic_
topic_k_15 = topic_vector_k_15.argmax(axis=1)
topic_k_15 = pd.Series(topic_k_15)

data = pd.concat([data, topic_k_15], axis=1)
data.rename(columns={0: 'item_property_topic_k_15'}, inplace=True)

dump_pickle(data, raw_data_path + 'all_data_4567.pkl')

NameError: name 'extract_ctr' is not defined

In [3]:
from utils import extract_ctr
extract_ctr(data[(data.is_trade != -1) & (data.day == 7)], feature='item_property_topic_k_15', alias='feature_ctr')

Unnamed: 0,item_property_topic_k_15,query_cnt,conversion_cnt,feature_ctr
0,0,224050,2887,0.012886
1,1,111052,4460,0.040161
2,2,84403,1165,0.013803
3,3,114595,3159,0.027567
4,4,243690,6020,0.024704
5,5,120794,2140,0.017716
6,6,153900,2271,0.014756
7,7,162165,3434,0.021176
8,8,94864,2201,0.023202
9,9,178232,4199,0.023559


In [4]:
data.columns

Index(['index', 'instance_id', 'item_id', 'item_category_list',
       'item_property_list', 'item_brand_id', 'item_city_id',
       'item_price_level', 'item_sales_level', 'item_collected_level',
       'item_pv_level', 'user_id', 'user_gender_id', 'user_age_level',
       'user_occupation_id', 'user_star_level', 'context_id',
       'context_timestamp', 'context_page_id', 'predict_category_property',
       'shop_id', 'shop_review_num_level', 'shop_review_positive_rate',
       'shop_star_level', 'shop_score_service', 'shop_score_delivery',
       'shop_score_description', 'is_trade', 'day', 'hour', 'minute',
       'category2_label', 'category3_label', 'user_gender_id_item_price_level',
       'user_gender_id_item_sales_level', 'user_gender_id_shop_star_level',
       'user_gender_id_shop_review_num_level',
       'user_gender_id_shop_review_positive_rate',
       'user_gender_id_category2_label', 'user_gender_id_category3_label',
       'user_gender_id_context_page_id', 'user_gende

In [14]:
all_data = load_pickle(raw_data_path + 'all_data.pkl')

all_data.day.value_counts()

6     1934443
7     1597063
1     1340697
2     1289476
3     1236828
5     1200219
31    1195557
4     1157641
Name: day, dtype: int64

In [15]:
data = load_pickle(raw_data_path + 'all_data_4567.pkl')
data

Unnamed: 0,index,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,...,item_collected_level_bin,item_pv_level_bin,shop_review_num_level_bin,shop_review_positive_rate_bin,shop_star_level_bin,shop_score_service_bin,shop_score_delivery_bin,shop_score_description_bin,hour_bin,item_property_topic_k_10
0,0,7118167540371,7595487976415452415,836752724084922533;8769426218101861255,7344985833148694227;6611726991947724280;783959...,144417789855323213,4228028106931716766,8,10,12,...,"(11.0, 12.0]","(-0.001, 14.0]","(-0.001, 12.0]","(0.985, 0.99]","(4998.999, 5010.0]","(0.973, 0.976]","(0.97, 0.973]","(0.971, 0.976]","(19.167, 21.083]",7
1,1,43508800019098,5340996313837249587,836752724084922533;6670526099037031245,6241534230954727302;367082587220462692;2072967...,1975299596879116255,6219110439660858399,8,15,15,...,"(14.0, 18.0]","(19.0, 22.0]","(16.0, 17.0]","(0.998, 1.0]","(5014.0, 5015.0]","(0.961, 0.965]","(-1.001, 0.96]","(0.979, 0.981]","(15.333, 17.25]",3
2,2,78067858668749,8363638779510498577,836752724084922533;6693726201323251689,2072967855524022579;4621934203383159480;807039...,9167910794067700637,196257267849351217,8,10,13,...,"(12.0, 13.0]","(19.0, 22.0]","(17.0, 18.0]","(0.998, 1.0]","(5014.0, 5015.0]","(0.984, 1.0]","(0.983, 1.0]","(0.988, 1.0]","(19.167, 21.083]",4
3,3,137525720365644,7121266959776152715,836752724084922533;6670526099037031245,6241534230954727302;367082587220462692;2072967...,9542632357298456,6219110439660858399,7,10,10,...,"(9.0, 10.0]","(16.0, 17.0]","(16.0, 17.0]","(0.998, 1.0]","(5013.0, 5014.0]","(0.961, 0.965]","(0.964, 0.968]","(0.976, 0.979]","(13.417, 15.333]",5
4,4,151193932520734,5738516817201102842,836752724084922533;3613783563199627217;6370392...,2072967855524022579;2636395404473730413;772325...,7923860398679336202,6219110439660858399,7,11,14,...,"(13.0, 14.0]","(17.0, 18.0]","(15.0, 16.0]","(0.998, 1.0]","(5012.0, 5013.0]","(-1.001, 0.961]","(-1.001, 0.96]","(0.976, 0.979]","(15.333, 17.25]",7
5,5,191034067765499,3165167092384011466,836752724084922533;1367177154073382718,3657871859501171040;6491818071284064879;528103...,1078956017020681090,2225710480551643517,7,9,11,...,"(10.0, 11.0]","(-0.001, 14.0]","(12.0, 14.0]","(0.993, 0.995]","(5010.0, 5012.0]","(0.976, 0.978]","(0.975, 0.978]","(0.971, 0.976]","(17.25, 19.167]",2
6,6,221786632908010,2157590763338091652,836752724084922533;7314150500379498593,7126426653086863522;2636395404473730413;914848...,-1,3948283326616421003,5,14,13,...,"(12.0, 13.0]","(17.0, 18.0]","(14.0, 15.0]","(0.985, 0.99]","(5010.0, 5012.0]","(-1.001, 0.961]","(-1.001, 0.96]","(-1.001, 0.956]","(19.167, 21.083]",7
7,7,233375766178961,7296864533820220362,836752724084922533;5685690139879409547;7497531...,6241534230954727302;367082587220462692;5131280...,848910589365677287,8762827044490678569,8,9,11,...,"(10.0, 11.0]","(16.0, 17.0]","(15.0, 16.0]","(0.998, 1.0]","(5012.0, 5013.0]","(0.984, 1.0]","(0.983, 1.0]","(0.988, 1.0]","(17.25, 19.167]",0
8,8,235542268576836,3948722777398180782,836752724084922533;1909641874861640857,6241534230954727302;367082587220462692;5131280...,2564374947022447094,6219110439660858399,8,11,16,...,"(14.0, 18.0]","(19.0, 22.0]","(17.0, 18.0]","(0.998, 1.0]","(5014.0, 5015.0]","(0.968, 0.971]","(0.968, 0.97]","(0.981, 0.984]","(13.417, 15.333]",3
9,9,271028956022181,4158769038163595899,836752724084922533;6693726201323251689,2072967855524022579;4067341101015777832;187732...,7756905285745693434,4899814843172066235,7,9,11,...,"(10.0, 11.0]","(15.0, 16.0]","(16.0, 17.0]","(0.998, 1.0]","(5013.0, 5014.0]","(0.961, 0.965]","(0.96, 0.964]","(0.971, 0.976]","(13.417, 15.333]",4
