In [23]:
import numpy as np
import pandas as pd
import datetime
import time
import numba
import os,sys
from sklearn.model_selection import StratifiedKFold

## load data
def LoadData(InputDir):
    """Read the raw train and test tables stored under ``InputDir``.

    Both files are parsed with ``pandas.read_csv`` using a single space as the
    field separator.

    Parameters
    ----------
    InputDir : str
        Directory that contains the two raw ``.txt`` files.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'test': DataFrame}``.
    """
    ## load raw data
    path_templates = (
        ('train', '%s/round1_ijcai_18_train_20180301.txt'),
        ('test', '%s/round1_ijcai_18_test_a_20180301.txt'),
    )
    return {
        split: pd.read_csv(template % InputDir, sep=' ')
        for split, template in path_templates
    }

# Root of the data tree; the raw files are read from its 'raw' sub-directory.
DataBaseDir = '../../data'
DataSet = LoadData('%s/raw' % DataBaseDir)
for mod in ['train', 'test']:
    # Turn the numeric context_timestamp into datetime objects.
    # NOTE(review): fromtimestamp() converts using the local timezone of the
    # machine running the notebook, so derived time features depend on it.
    converted = DataSet[mod]['context_timestamp'].apply(
        lambda stamp: datetime.datetime.fromtimestamp(stamp)
    )
    # Replace the column, then order the rows chronologically.
    DataSet[mod] = (
        DataSet[mod]
        .assign(context_timestamp=converted)
        .sort_values(by='context_timestamp')
    )
print('\n Loading data done.')
## extract item category/property features
# Running lists of feature names: one for columns treated as categorical and
# one for columns treated as numeric. Both start empty.
category_columns, numeric_columns = [], []
def SplitItemCategoryProperty(colvals):
    """Split ';'-separated strings into exactly three fields per row.

    Deliberately NOT decorated with ``numba.jit``: the function builds a
    ``dtype=object`` array and calls ``str.split``, which Numba cannot compile
    in nopython mode. Older Numba versions silently fell back to object mode
    (no speed-up); Numba >= 0.59 raises an error instead.

    Parameters
    ----------
    colvals : sequence of str
        One ';'-separated string per row (e.g. ``Series.values``).

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(colvals), 3)``. Column ``j`` holds the
        ``j``-th field of each row; rows with fewer than three fields are
        padded with the string ``'-1'``. Fields beyond the third are ignored.
    """
    n = len(colvals)
    result = np.empty((n, 3), dtype='object')
    for i in range(n):
        fields = colvals[i].split(';')
        for j in range(3):
            # A literal '-1' field and a missing field both end up as '-1'.
            result[i, j] = fields[j] if j < len(fields) else '-1'
    return result

# Source list-column -> names of the three columns it is expanded into.
item_category_property_columns = {
    'item_category_list': ['item_category_0', 'item_category_1', 'item_category_2'],
    'item_property_list': ['item_property_0', 'item_property_1', 'item_property_2'],
}
# Per source column: expanded columns that turned out to hold a single value
# in at least one of the two data sets.
drop_item_category_property_columns = {
    'item_category_list': set(),
    'item_property_list': set(),
}
for k, expanded_cols in item_category_property_columns.items():
    for mod in ['train', 'test']:
        pieces = SplitItemCategoryProperty(DataSet[mod][k].values)
        pieces_df = pd.DataFrame(pieces, index=DataSet[mod].index, columns=expanded_cols)
        DataSet[mod] = pd.concat([DataSet[mod], pieces_df], axis=1)
        # Remember every new column that carries only one distinct value.
        drop_item_category_property_columns[k].update(
            col for col in expanded_cols if DataSet[mod][col].nunique() == 1
        )
# drop unique-value columns
for k, constant_cols in drop_item_category_property_columns.items():
    if not constant_cols:
        continue
    dropped = list(constant_cols)
    for mod in ['train', 'test']:
        DataSet[mod] = DataSet[mod].drop(dropped, axis=1)
    print('Drop columns: ')
    print(dropped)
    # Keep the name registry in sync with what is left in the frames.
    for c in dropped:
        item_category_property_columns[k].remove(c)
# update category columns
for remaining_cols in item_category_property_columns.values():
    category_columns.extend(remaining_cols)
print('Current category columns: ')
print(category_columns)
print('\n Extracting item category/property done.')

## extract predict category/property features
def SplitPredictCategoryProperty(colvals):
    """Split 'category:property' pairs into three category and three property fields.

    Each input string is split on ';' into pairs; the first three pairs are
    split on ':' into a category part and a property part.

    Deliberately NOT decorated with ``numba.jit``: object arrays and
    ``str.split`` cannot be compiled in nopython mode, so the decorator gave no
    speed-up on old Numba versions and raises on Numba >= 0.59.

    Parameters
    ----------
    colvals : sequence of str
        One ';'-separated string of 'category:property' pairs per row.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(colvals), 6)``. Columns 0-2 hold the
        category part of pairs 0-2, columns 3-5 the matching property part.
        Missing pairs, pairs equal to ``'-1'`` or ``''``, and a missing
        property part are all stored as the string ``'-1'``.

    Notes
    -----
    A pair that does not split into exactly two parts on ':' is printed for
    inspection. Previously such a pair without any ':' raised ``IndexError``;
    now its property is recorded as ``'-1'``.
    """
    n = len(colvals)
    result = np.empty((n, 6), dtype='object')
    for i in range(n):
        pairs = colvals[i].split(';')
        for j in range(3):
            category, prop = '-1', '-1'
            if j < len(pairs) and pairs[j] not in ('', '-1'):
                parts = pairs[j].split(':')
                if len(parts) != 2:
                    # Diagnostic only: surface malformed entries to the user.
                    print(pairs[j])
                category = parts[0]
                if len(parts) > 1:
                    prop = parts[1]
            result[i, j] = category
            result[i, j + 3] = prop
    return result

# Names of the six columns produced from 'predict_category_property':
# three category parts followed by three property parts.
predict_category_property_columns = [
    'predict_category_0', 'predict_category_1', 'predict_category_2',
    'predict_property_0', 'predict_property_1', 'predict_property_2',
]
drop_predict_category_property_columns = set()
for mod in ['train', 'test']:
    split_values = SplitPredictCategoryProperty(DataSet[mod]['predict_category_property'].values)
    split_frame = pd.DataFrame(
        split_values, index=DataSet[mod].index, columns=predict_category_property_columns
    )
    DataSet[mod] = pd.concat([DataSet[mod], split_frame], axis=1)
    # Collect new columns that hold a single distinct value in this data set.
    drop_predict_category_property_columns.update(
        col for col in predict_category_property_columns
        if DataSet[mod][col].nunique() == 1
    )
# drop unique-value columns
if drop_predict_category_property_columns:
    dropped = list(drop_predict_category_property_columns)
    for mod in ['train', 'test']:
        DataSet[mod] = DataSet[mod].drop(dropped, axis=1)
    print('\n Drop columns: ')
    print(dropped)
    for c in dropped:
        predict_category_property_columns.remove(c)
# update category columns
category_columns.extend(predict_category_property_columns)
print('\n Current category columns: ')
print(category_columns)
print('\n Extracting predict category/property done.')

## hour
# Hour-of-day taken from the converted context timestamp (numeric feature).
for mod in ['train', 'test']:
    DataSet[mod] = DataSet[mod].assign(hour=DataSet[mod]['context_timestamp'].dt.hour)
numeric_columns.append('hour')
## category hit
# For each of the first three predicted-category columns, add a 0/1 flag that
# is 1 when the value equals the category id below.
for c in predict_category_property_columns[:3]:
    hit_col = '%s_hit_0' % c
    for mod in ['train', 'test']:
        is_hit = DataSet[mod][c] == '7908382889764677758'
        DataSet[mod][hit_col] = is_hit.astype('int32')
    category_columns.append(hit_col)
## count
# For every (id column, grouping column) pair below, add a numeric feature
# 'count_<grouping column>_<id column>': the number of distinct ids observed
# among the rows that share the same grouping-column value.
count_dict = {
    'item_id':  ['item_property_0', 'item_property_1', 'item_property_2', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level'],
    'user_id':  ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level'],
    'shop_id':  ['shop_review_num_level', 'shop_star_level']
}
for count_id, count_keys in count_dict.items():
    for count_key in count_keys:
        count_col = 'count_%s_%s' % (count_key, count_id)
        for mod in ['train', 'test']:
            # NOTE(review): counts are computed on each data set separately,
            # so train and test values come from different populations.
            # One vectorised aggregation replaces the per-group get_group()
            # loop; dropna=False keeps the semantics of Series.unique(), which
            # counts a missing id as a distinct value.
            counts = (
                DataSet[mod]
                .groupby(count_key)[count_id]
                .nunique(dropna=False)
                .rename(count_col)
                .reset_index()
            )
            # Left merge broadcasts the per-group count back to every row.
            DataSet[mod] = DataSet[mod].merge(counts, how='left', on=[count_key])
        numeric_columns.append(count_col)
print('\nAdd hour/category_hit/count features done.')
## encode for id features
# Only these six names are new to category_columns; the expanded item and
# predicted-category columns appended below were registered earlier.
id_cate_feat = ['item_id', 'user_id', 'shop_id', 'item_brand_id', 'item_city_id', 'item_category_list']
category_columns.extend(id_cate_feat)
id_cate_feat = (
    id_cate_feat
    + item_category_property_columns['item_category_list']
    + item_category_property_columns['item_property_list']
    + predict_category_property_columns[:3]
)
for col in id_cate_feat:
    # Code = position of the value in order of first appearance in train.
    encode_dict = {value: code for code, value in enumerate(DataSet['train'][col].unique())}
    for mod in ['train', 'test']:
        # Values never seen in train are encoded as -1. The raw column is
        # removed and the encoded one re-added under the same name, which
        # places it as the last column of the frame.
        encoded = DataSet[mod].pop(col).apply(lambda raw: encode_dict.get(raw, -1))
        DataSet[mod][col] = encoded

# Remaining raw columns, grouped by entity, that are treated as categorical.
item_cate_feat = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']
user_cate_feat = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
shop_cate_feat = ['shop_review_num_level', 'shop_star_level']
context_cate_feat = ['context_page_id']
for feat_group in (item_cate_feat, user_cate_feat, shop_cate_feat, context_cate_feat):
    category_columns.extend(feat_group)
# Raw shop columns that are treated as numeric.
shop_num_feat = ['shop_review_positive_rate', 'shop_score_service', 'shop_score_delivery', 'shop_score_description']
numeric_columns.extend(shop_num_feat)
# Columns removed from both frames and from the feature-name lists.
drop_columns = ['item_property_list', 'predict_category_property', 'predict_property_0', 'predict_property_1',
                'predict_property_2', 'context_id', 'context_timestamp']
for mod in ['train', 'test']:
    DataSet[mod] = DataSet[mod].drop(drop_columns, axis=1)
category_columns = list(filter(lambda name: name not in drop_columns, category_columns))
numeric_columns = list(filter(lambda name: name not in drop_columns, numeric_columns))
print('category columns: ')
print(category_columns)
print('numeric columns: ')
print(numeric_columns)
print(DataSet['train'].dtypes)
print('\n Type conversion done. ')
# tagging columns
# Prefix numeric feature names with 'n_' and categorical ones with 'c_'.
tagged_numeric_columns = ['n_%s' % name for name in numeric_columns]
tagged_category_columns = ['c_%s' % name for name in category_columns]
renamed_columns = {}
renamed_columns.update(zip(numeric_columns, tagged_numeric_columns))
# Applied second, so a name present in both lists ends up with the 'c_' tag.
renamed_columns.update(zip(category_columns, tagged_category_columns))
for mod in ['train', 'test']:
    DataSet[mod] = DataSet[mod].rename(columns=renamed_columns)
all_features = ['instance_id'] + tagged_numeric_columns + tagged_category_columns
## re-sort columns
# Final layout: id, target, numeric features, categorical features. Columns
# that are not listed are discarded by the selection below.
target = 'is_trade'
sorted_columns = ['instance_id', target] + tagged_numeric_columns + tagged_category_columns
# The target column is created on the test frame with a constant 0.0 so both
# frames can be selected with the same column list.
DataSet['test'][target] = 0.0
for mod in ['train', 'test']:
    DataSet[mod] = DataSet[mod][sorted_columns]
print('\n Re-sort columns done.')
## checking missing values
# Report, per feature column of the training frame, how many rows equal -1.
for col in all_features:
    null_size = int((DataSet['train'][col] == -1).sum())
    print(col, null_size)
print('\n Checking missing values done.')


 Loading data done.
Drop columns: 
['item_category_0']
Current category columns: 
['item_category_1', 'item_category_2', 'item_property_0', 'item_property_1', 'item_property_2']

 Extracting item category/property done.

 Current category columns: 
['item_category_1', 'item_category_2', 'item_property_0', 'item_property_1', 'item_property_2', 'predict_category_0', 'predict_category_1', 'predict_category_2', 'predict_property_0', 'predict_property_1', 'predict_property_2']

 Extracting predict category/property done.

Add hour/category_hit/count features done.
category columns: 
['item_category_1', 'item_category_2', 'item_property_0', 'item_property_1', 'item_property_2', 'predict_category_0', 'predict_category_1', 'predict_category_2', 'predict_category_0_hit_0', 'predict_category_1_hit_0', 'predict_category_2_hit_0', 'item_id', 'user_id', 'shop_id', 'item_brand_id', 'item_city_id', 'item_category_list', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level',

In [24]:
# ## check
# print(DataSet['train'].columns)
# print(len(DataSet['train']['c_item_category_list'].unique()), len(DataSet['train']))
# #print(len(DataSet['train']['c_predict_category'].unique()), len(DataSet['train']))
# for c in ['c_predict_category_0', 'c_predict_category_1', 'c_predict_category_2']:
#     print(c, len(DataSet['train'][DataSet['train'][c] == '7908382889764677758']), len(DataSet['train']))

In [25]:
# Number of stratified folds to export.
kfold = 5

OutputDir = '%s/l0' % DataBaseDir
splitter = StratifiedKFold(n_splits=kfold)
fold_splits = splitter.split(DataSet['train'][all_features], DataSet['train'][target])
fold = 0
for train_index, valid_index in fold_splits:
    print('fold %s, train %s, valid %s' % (fold, len(train_index), len(valid_index)))
    FoldOutput = '%s/kfold/%s' % (OutputDir, fold)
    if not os.path.exists(FoldOutput):
        os.makedirs(FoldOutput)
    # Each fold directory receives its validation rows and a full copy of the
    # test frame; the training rows of the fold are not written out.
    valid_rows = DataSet['train'].iloc[valid_index]
    valid_rows.to_csv('%s/valid.csv' % FoldOutput, index=False)
    DataSet['test'].to_csv('%s/test.csv' % FoldOutput, index=False)
    ## check
    # Share of positive targets among the training rows of this fold.
    train_rows = DataSet['train'].iloc[train_index]
    print('-----------------------------------------')
    print(train_rows[target].sum(axis=0) / len(train_index))
    print('-----------------------------------------\n')
    fold += 1

fold 0, train 382509, valid 95629
-----------------------------------------
0.0188649155968
-----------------------------------------

fold 1, train 382510, valid 95628
-----------------------------------------
0.0188674805887
-----------------------------------------

fold 2, train 382511, valid 95627
-----------------------------------------
0.0188674312634
-----------------------------------------

fold 3, train 382511, valid 95627
-----------------------------------------
0.0188674312634
-----------------------------------------

fold 4, train 382511, valid 95627
-----------------------------------------
0.0188674312634
-----------------------------------------

