In [1]:
###################################
# Time-decaying Weighted Average  #
###################################
import numpy as np
import pandas as pd
import datetime
import time
import numba
import os,sys
import gc

def LoadData(InputDir):
    """Read the raw CSV tables located directly under ``InputDir``.

    Parameters
    ----------
    InputDir : str
        Directory holding the CSV files; each path is built as
        ``'<InputDir>/<file name>'``.

    Returns
    -------
    dict
        DataFrames keyed by short alias, in this order:
        'tra' (air_visit_data), 'as' (air_store_info), 'hs' (hpg_store_info),
        'ar' (air_reserve), 'hr' (hpg_reserve), 'id' (store_id_relation),
        'tes' (sample_submission), 'hol' (date_info).
        Date columns of 'tra', 'ar', 'hr' and 'hol' are parsed as datetimes,
        and the 'calendar_date' column of 'hol' is renamed to 'visit_date'.
    """
    def _read(pattern, **options):
        # fill the directory into the path pattern and delegate to pandas
        return pd.read_csv(pattern % InputDir, **options)

    ## load raw data
    data = {}
    data['tra'] = _read('%s/air_visit_data.csv', parse_dates=['visit_date'])
    data['as'] = _read('%s/air_store_info.csv')
    data['hs'] = _read('%s/hpg_store_info.csv')
    data['ar'] = _read('%s/air_reserve.csv', parse_dates=['visit_datetime', 'reserve_datetime'])
    data['hr'] = _read('%s/hpg_reserve.csv', parse_dates=['visit_datetime', 'reserve_datetime'])
    data['id'] = _read('%s/store_id_relation.csv')
    data['tes'] = _read('%s/sample_submission.csv')
    # the calendar table is exposed under the same date column name as the visit table
    calendar = _read('%s/date_info.csv', parse_dates=['calendar_date'])
    data['hol'] = calendar.rename(columns={'calendar_date': 'visit_date'})
    return data

@numba.jit
def ApplyDayoff(VisitCols, ReserveCols):
    """Return the whole-day gap between each visit and its reservation.

    Parameters
    ----------
    VisitCols, ReserveCols : indexable sequences of equal length
        ``VisitCols[i] - ReserveCols[i]`` must yield an object exposing a
        ``.days`` attribute (e.g. ``datetime.timedelta`` / ``pandas.Timedelta``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 1)`` and dtype int32 holding the ``.days`` value
        of every element-wise difference.
    """
    # NOTE(review): attribute access on Python/pandas timedelta objects is presumably
    # not compilable in numba's nopython mode, so this likely relies on object-mode
    # execution — confirm against the installed numba version.
    n = len(VisitCols)
    # int32 instead of int8: an int8 cell only holds -128..127, so any gap longer
    # than about four months would wrap around (or raise on recent NumPy).
    result = np.zeros((n, 1), dtype= 'int32')
    for i in range(n):
        result[i] = (VisitCols[i]- ReserveCols[i]).days
    return result

# Lookup tables keyed by the reservation-table alias used in DataSet.
reserve2id = {'ar': 'air', 'hr': 'hpg'}
reserve2store = {'ar': 'as', 'hr': 'hs'}

# load data set
InputDir = '../../data/raw'
DataSet = LoadData(InputDir)

####
# date related features
print('============')
for mod in ('tra', 'tes'):
    start0 = time.time()
    mod_df = DataSet[mod]  # same object as DataSet[mod]; columns are added in place
    if mod == 'tes':
        # the submission 'id' is split on '_': the third piece is the visit date,
        # the first two pieces joined back together are the store id
        mod_df['visit_date'] = mod_df['id'].map(lambda x: str(x).split('_')[2])
        mod_df['air_store_id'] = mod_df['id'].map(lambda x: '_'.join(x.split('_')[:2]))
        mod_df['visit_date'] = pd.to_datetime(mod_df['visit_date'])
    visit_ts = mod_df['visit_date']
    mod_df['dow'] = visit_ts.dt.dayofweek
    mod_df['year'] = visit_ts.dt.year
    mod_df['month'] = visit_ts.dt.month
    # keep plain datetime.date objects in 'visit_date' from here on
    mod_df['visit_date'] = visit_ts.dt.date
    end0 = time.time()
    store_ids = mod_df['air_store_id']
    print('%s data: unique stores %s, total %s, time elased %.2fs.' %
            (mod, len(store_ids.unique()), len(store_ids), (end0 - start0)))
# the calendar table gets the same datetime.date representation
DataSet['hol']['visit_date'] = DataSet['hol']['visit_date'].dt.date
print('============= process date related done.\n')

tra data: unique stores 829, total 252108, time elased 0.76s.
tes data: unique stores 821, total 32019, time elased 0.14s.



In [2]:
#### weight averaging strategy
# visitors in the future equals weight factor multiple of expected visitors in the past
date_info = DataSet['hol']  # same object as DataSet['hol']: the columns added below persist on it
# flag calendar rows dated before 2016-07-01; the flag is one of the grouping keys below
date_info['is_up_corner'] = date_info['visit_date'] < datetime.date(2016, 7, 1)
# holidays that fall on a Saturday/Sunday are treated as ordinary weekend days
# (vectorized mask instead of a row-by-row DataFrame.apply)
wkend_holidays = date_info['day_of_week'].isin(['Saturday', 'Sunday']) & (date_info['holiday_flg'] == 1)
date_info.loc[wkend_holidays, 'holiday_flg'] = 0
# weight = ((row position + 1) / number of rows) ** 5, so rows further down the table weigh more
# NOTE(review): presumably date_info is in chronological order with a default 0..n-1 index — confirm
date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5
air_visit_data = DataSet['tra']
visit_data = air_visit_data.merge(date_info, on='visit_date', how='left')
# aggregate on the log1p scale
visit_data['visitors'] = np.log1p(visit_data['visitors'])


def wmean(x):
    """Weighted mean of ``x.visitors`` using ``x.weight`` as the weights."""
    return (x.weight * x.visitors).sum() / x.weight.sum()


gkeys = ['air_store_id', 'day_of_week', 'holiday_flg', 'is_up_corner']
# one weighted mean per key combination; name the value column directly
# rather than renaming the default column 0 afterwards
weighted_visitors = visit_data.groupby(gkeys).apply(wmean).reset_index(name='visitors')
print(weighted_visitors.head(10))
# first step: fill with the expected visitors same pattern in the past
# Build the frame from a non-inplace drop: dropping 'visitors' in place on
# DataSet['tes'] makes a second run of this cell fail with a KeyError because
# the column is already gone from the shared frame.
sample_submission = DataSet['tes'].drop(['visitors'], axis= 1)
# the submission 'id' is '<store id>_<visit date>': everything before the last '_' is the store id
sample_submission['air_store_id'] = sample_submission.id.map(lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['visit_date'] = pd.to_datetime(sample_submission.id.map(lambda x: x.split('_')[2])).dt.date
# attach calendar attributes (day_of_week, holiday_flg, is_up_corner, weight) ...
sample_submission = sample_submission.merge(date_info, on='visit_date', how='left')
# ... then look up the weighted mean for the exact same key combination
sample_submission = sample_submission.merge(weighted_visitors, on= gkeys, how='left')
print(sample_submission.head(20))
print('============================')
print('Filling with same pattern in the past, %s remaining to be filled up.' % sample_submission['visitors'].isnull().sum())
# second step: fill with expected visitors patterned by holiday ignored in the past
missings = sample_submission.visitors.isnull()
# Some days are holidays in the test data set (2017/4/22-2017/5/31) but have no
# matching holiday rows in the train data set (2016/1/1-2017/4/22); their 'visitors'
# is NaN after the first step, so fall back to the non-holiday value for the same
# store and day of week.
# holiday_flg == 0 marks normal days
normal_days = weighted_visitors[weighted_visitors['holiday_flg'] == 0]
unfilled_rows = sample_submission[missings]
holiday_ignored = unfilled_rows.merge(
    normal_days,
    on=['air_store_id', 'day_of_week', 'is_up_corner'],
    how='left',
)
# both inputs carry a 'visitors' column, so the looked-up value arrives as 'visitors_y';
# .values assigns by position, relying on the left merge keeping the row order
sample_submission.loc[missings, 'visitors'] = holiday_ignored['visitors_y'].values
print('============================')
print('Filling with holiday ignored, %s remaining to be filled up.' % sample_submission['visitors'].isnull().sum())
## then we can try to fill them with mean 'visitors' in the past days.
missings = sample_submission.visitors.isnull()
# plain mean of the per-pattern values for each (store, is_up_corner) pair
store_means = (
    weighted_visitors[['air_store_id', 'visitors', 'is_up_corner']]
    .groupby(['air_store_id', 'is_up_corner'])
    .mean()
    .reset_index()
)
store_level = sample_submission[missings].merge(store_means, on=['air_store_id','is_up_corner'], how='left')
# looked-up value arrives as 'visitors_y'; assigned by position onto the still-missing rows
sample_submission.loc[missings, 'visitors'] = store_level['visitors_y'].values
# expm1 is the inverse of log1p: convert the filled values to plain visitor counts
sample_submission['visitors'] = np.expm1(sample_submission['visitors'])
print('============================')
## unfortunately length not changed, it did nothing
print('Filling with holiday and DOW ignored, %s remaining to be filled up.' % sample_submission['visitors'].isnull().sum())
print(sample_submission.head(100))
# Create submission file
run_date = datetime.datetime.now().strftime("%Y-%m-%d")
OutputFileName = 'TDWA_submit_%s' % run_date
OutputFile = '../../data/submit/%s.csv' % OutputFileName
# only the 'id' and 'visitors' columns are written, with six decimals
submission_frame = sample_submission[['id', 'visitors']]
submission_frame.to_csv(OutputFile, float_format='%.6f', index=False)
# compress with the external `zip` command; as the last expression of the cell,
# the command's exit status is what the notebook displays
zip_command = 'zip ../../data/submit/%s.zip %s' % (OutputFileName, OutputFile)
os.system(zip_command)

           air_store_id day_of_week  holiday_flg  is_up_corner  visitors
0  air_00a91d42b08b08d9      Friday            0         False  3.583535
1  air_00a91d42b08b08d9      Monday            0         False  3.203625
2  air_00a91d42b08b08d9      Monday            1         False  3.091042
3  air_00a91d42b08b08d9    Saturday            0         False  2.524065
4  air_00a91d42b08b08d9      Sunday            0         False  1.098612
5  air_00a91d42b08b08d9    Thursday            0         False  3.475056
6  air_00a91d42b08b08d9     Tuesday            0         False  3.325868
7  air_00a91d42b08b08d9   Wednesday            0         False  3.353439
8  air_0164b9927d20bcc3      Friday            0         False  2.309302
9  air_0164b9927d20bcc3      Friday            1         False  2.761234
                                 id  visit_date          air_store_id  dow  \
0   air_00a91d42b08b08d9_2017-04-23  2017-04-23  air_00a91d42b08b08d9    6   
1   air_00a91d42b08b08d9_2017-04-24  2017

[100 rows x 11 columns]


0