In [23]:
import numpy as np
import pandas as pd
import datetime
import time
import numba
import os,sys
import gc
import math
from math import ceil
from sklearn import *
from scipy.stats import zscore

def week_of_month(dt):
    """ Returns the week of the month for the specified date.
    """

    first_day = dt.replace(day=1)

    dom = dt.day
    adjusted_dom = dom + first_day.weekday()

    return int(ceil(adjusted_dom/7.0))

def LoadData(InputDir):
    """"""
    ## load raw data
    data = {
        'tra': pd.read_csv('%s/air_visit_data.csv' % InputDir, parse_dates= ['visit_date']),
        'as': pd.read_csv('%s/air_store_info.csv' % InputDir),
        'hs': pd.read_csv('%s/hpg_store_info.csv' % InputDir),
        'ar': pd.read_csv('%s/air_reserve.csv' % InputDir, parse_dates= ['visit_datetime', 'reserve_datetime']),
        'hr': pd.read_csv('%s/hpg_reserve.csv' % InputDir, parse_dates= ['visit_datetime', 'reserve_datetime']),
        'id': pd.read_csv('%s/store_id_relation.csv' % InputDir),
        'tes': pd.read_csv('%s/sample_submission.csv' % InputDir),
        'hol': pd.read_csv('%s/date_info.csv' % InputDir, parse_dates=['calendar_date']).rename(columns={'calendar_date': 'visit_date'})
    }
    return data

@numba.jit
def ApplyDayoff(VisitCols, ReserveCols):
    """"""
    n = len(VisitCols)
    result = np.zeros((n, 1), dtype= 'int8')
    for i in range(n):
        d = (VisitCols[i]- ReserveCols[i]).days
        if(d > 0):
            result[i] = d
    return result

reserve2id = {'ar': 'air', 'hr': 'hpg'}
reserve2store = {'ar': 'as', 'hr': 'hs'}# load data set
InputDir = '../../data/raw'
DataSet = LoadData(InputDir)
###
# print('before missing filling, size %s ' % len(DataSet['tra']))
# #Fill in Nans where possible with average in cluster on that day adjusted by the size of the particular restaurants 
# def fill_nans_in_cluster(genre_name,area_name):
#     #get list of the same type of restaurants in the neighborhood
#     neighbors_bool = DataSet['as'].apply(lambda x:(x.air_genre_name==genre_name and x.air_area_name==area_name), axis=1)
#     neighbors_ids=pd.DataFrame((DataSet['as'][neighbors_bool]))
#     neighbors_restaurants= DataSet['tra'].merge(neighbors_ids,on='air_store_id',how='inner')[['air_store_id','visit_date','visitors']]
 
#     #pivot neighbors_restaurants to easy fill in possible missing dates.
#     neighbors_restaurants=neighbors_restaurants.pivot_table(index='visit_date',columns='air_store_id', values='visitors',aggfunc=sum)
    
#     #Fill in missing dates(if any) with Nans
#     idx = pd.date_range('2016-01-01', '2017-04-22')
#     neighbors_restaurants.index = pd.DatetimeIndex(neighbors_restaurants.index)
#     neighbors_restaurants = neighbors_restaurants.reindex(idx, fill_value=np.nan)

#     # Get visitors rate, normalized to the avarage number of visitors per day 
#     neighbors_restaurants_average= neighbors_restaurants.mean(axis=0).tolist()
#     normalized_neighbors_restaurants = neighbors_restaurants.div(neighbors_restaurants_average,axis=1)

#     # Fill in Nans with avarge number of visiotrs in nighbour restaurants 
#     #axis argument to fillna is Not Implemented, so have to use transpond
#     normalized_neighbors_restaurants_with_filled_nans=normalized_neighbors_restaurants.T.fillna(normalized_neighbors_restaurants.mean(axis=1))
    
#     #replace normalized values with real vistors by multipliyng back on average per restaurant
#     neighbors_restaurants_with_filled_nans = normalized_neighbors_restaurants_with_filled_nans.mul(neighbors_restaurants_average,axis=0).reset_index()

#     #return visit data in the original format 
#     df_columns = neighbors_restaurants_with_filled_nans.columns[1:]
#     return  pd.melt(neighbors_restaurants_with_filled_nans,id_vars=['air_store_id'], value_vars=df_columns)

# clusters_names= DataSet['as'].apply(lambda x:(x.air_genre_name + '_' + x.air_area_name), axis=1).unique().tolist()
# full_data = pd.DataFrame(columns= DataSet['tra'].columns)

# cnt = 0
# for cluster in clusters_names:
#     if(cnt % 200 == 0):
#         print('%s processing done.' % cnt)
#     cluster_data = fill_nans_in_cluster (cluster.split('_')[0],cluster.split('_')[1])
#     cluster_data.rename(columns={'variable':'visit_date','value':'visitors'},inplace=True )
#     full_data=full_data.append(cluster_data,ignore_index=True)
#     cnt += 1
# print('after missing filling, size %s' % len(full_data))
# print('%s still missing.' % full_data['visitors'].isnull().sum())
# full_data.dropna(inplace= True)
# DataSet['tra'] = full_data # reset train

In [24]:
###
# date related features
print('\n============')
start = time.time()
for mod in ['tra', 'tes']:
    start0 = time.time()
#     if(mod == 'tra'):
#         outliers = DataSet[mod].groupby(['air_store_id'])['visitors'].transform(zscore) > 3
#         DataSet[mod][outliers]= np.nan
#         print('%s outliers have been removed.' % DataSet[mod].isnull().sum()['air_store_id'])
#         DataSet[mod].dropna(inplace=True)       
    if (mod == 'tes'):
        DataSet[mod]['visit_date'] = DataSet[mod]['id'].map(lambda x: str(x).split('_')[2])
        DataSet[mod]['air_store_id'] = DataSet[mod]['id'].map(lambda x: '_'.join(x.split('_')[:2]))
        DataSet[mod]['visit_date'] = pd.to_datetime(DataSet[mod]['visit_date'])
    DataSet[mod]['dow'] = DataSet[mod]['visit_date'].dt.dayofweek
    DataSet[mod]['year'] = DataSet[mod]['visit_date'].dt.year
    DataSet[mod]['month'] = DataSet[mod]['visit_date'].dt.month
    DataSet[mod]['wom'] = DataSet[mod]['visit_date'].apply(week_of_month)
    ## remove noisy days, updated 2018/1/4 22:45
    print('before: ', len(DataSet[mod]))
    print('after: ', len(DataSet[mod]))
    DataSet[mod]['visit_date'] = DataSet[mod]['visit_date'].dt.date
    DataSet[mod]['woy'] = DataSet[mod]['visit_date'].apply(lambda x: datetime.date.isocalendar(x)[1])
    end0 = time.time()
    print('%s data: unique stores %s, total %s, time elased %.2fs.' %
            (mod, len(DataSet[mod]['air_store_id'].unique()), len(DataSet[mod]['air_store_id']), (end0 - start0)))
print('============= process date related done.\n')
######## store data
# add city feature
for mod in ['ar', 'hr']:
    DataSet[reserve2store[mod]]['%s_city' % reserve2id[mod]] = DataSet[reserve2store[mod]]['%s_area_name' % reserve2id[mod]].map(lambda x: str(x).split(' ')[0])
    DataSet[reserve2store[mod]]['%s_area_name' % reserve2id[mod]] = DataSet[reserve2store[mod]]['%s_area_name' % reserve2id[mod]].map(lambda x: '_'.join(x.split(' ')[1:]))
print('add city feature done.')
# area (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_area_name' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_area_name' % reserve2id[mod]] = g
        ac['%s_area_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_area_name' % reserve2id[mod]])
# genre (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_genre_name' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_genre_name' % reserve2id[mod]] = g
        ac['%s_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_genre_name' % reserve2id[mod]])
#  area_genre (store) count 
for mod in ['ar', 'hr']:
    rec = []
    groupby_keys = ['%s_area_name' % reserve2id[mod], '%s_genre_name' % reserve2id[mod]]
    groupped = DataSet[reserve2store[mod]].groupby(groupby_keys)
    for g in groupped.groups:
        ac = {}
        ac['%s_area_name' % reserve2id[mod]] = g[0]
        ac['%s_genre_name' % reserve2id[mod]] = g[1]
        ac['%s_area_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= groupby_keys)
# city (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_city' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_city' % reserve2id[mod]] = g
        ac['%s_city_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        #ac['%s_area_store_ratio' % reserve2id[mod]] = ac['%s_area_store_count' % reserve2id[mod]]/len(DataSet[reserve2store[mod]])
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_city' % reserve2id[mod]])
#  city_genre (store) count 
for mod in ['ar', 'hr']:
    rec = []
    groupby_keys = ['%s_city' % reserve2id[mod], '%s_genre_name' % reserve2id[mod]]
    groupped = DataSet[reserve2store[mod]].groupby(groupby_keys)
    for g in groupped.groups:
        ac = {}
        ac['%s_city' % reserve2id[mod]] = g[0]
        ac['%s_genre_name' % reserve2id[mod]] = g[1]
        ac['%s_city_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= groupby_keys)
print(' ================ add count features done.\n')
######### holiday data
data = DataSet['hol']
# updated 2018/1/4 21:45
data['day'] = data['visit_date'].dt.day # day of month
data['pom'] = data['day'].apply(lambda x: 'start' if(x < 10) else('mid' if(x < 20) else 'end')) # peroid of month
### add holiday days
data['visit_date'] = data['visit_date'].dt.date
data = data.sort_values(by= 'visit_date')
def TagHoliday(df):
    ''''''
    n = len(df)
    result = ['' for x in range(n)]
    for i in range(n):
        if(i == 0):
            result[i] = 'hid_%s' % 0
        elif((df[i] - df[i-1]).days == 1):
            result[i] = result[i - 1]
        else:
            result[i] = 'hid_%s' % (int(result[i - 1].split('_')[1]) + 1)
    return result
holidays = data[data['holiday_flg'] == 1][['visit_date']]
holidays['hol_l0'] = TagHoliday(holidays['visit_date'].values)
groupped = holidays.groupby(['hol_l0'])
recs = []
for g in groupped.groups:
    hol_days = {}
    hol_days['hol_l0'] = g
    hol_days['hol_days'] = len(groupped.get_group(g))
    recs.append(hol_days)
tmpdf = pd.DataFrame(data= recs, index= range(len(recs)))
holidays = holidays.merge(tmpdf, how= 'left', on= 'hol_l0')
data = data.merge(holidays, how= 'left', on= 'visit_date')
data.drop(['hol_l0'], axis= 1, inplace= True)
data['hol_days'].fillna(0, inplace= True)
print('add holiday type done.')
### reset holiday
wkend_holidays = data.apply((lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
data['is_weekends'] = (data['day_of_week'] == 'Sunday') | (data['day_of_week'] == 'Saturday')
data['is_weekends'] = data['is_weekends'].astype(int)
data.loc[wkend_holidays, 'holiday_flg'] = 0

# updated 2018/1/4 21:45
data['prevday'] = data['visit_date'] - datetime.timedelta(days= 1)
data['nextday'] = data['visit_date'] + datetime.timedelta(days= 1)
data.set_index('visit_date', inplace= True)
data['prev_is_holiday'] = 0
data['prev_is_holiday'] = data[data['prevday'] > datetime.datetime(2016, 1, 1).date()]['prevday'].apply(lambda x: data.loc[x, 'holiday_flg'])
data['next_is_holiday'] = 0
data['next_is_holiday'] = data[data['nextday'] < datetime.datetime(2017, 5, 31).date()]['nextday'].apply(lambda x: data.loc[x, 'holiday_flg'])
data.reset_index(inplace= True)
data.drop(['prevday', 'nextday'], axis= 1, inplace= True)

DataSet['hol'] = data
print('========== reset holiday done.\n')
######## join 
# join holiday data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    data = data.merge(DataSet['hol'], how='left', on=['visit_date'])
    data.drop(['day_of_week', 'year'], axis=1, inplace=True)
    DataSet[mod] = data
# join store data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    for rtype in ['ar', 'hr']: 
        if((rtype == 'hr') & (('%s_store_id' % reserve2id[rtype]) not in data.columns)):
            data = data.merge(DataSet['id'], how= 'left', on= ['air_store_id'])
        data = data.merge(DataSet[reserve2store[rtype]], how= 'left', on= ['%s_store_id' % reserve2id[rtype]])
    DataSet[mod] = data
print('================ join holiday, store data done.')
######### reservation data
for mod in ['hr', 'ar']:
    start1 = time.time()
    DataSet[mod]['visit_date'] = DataSet[mod]['visit_datetime'].dt.date
    DataSet[mod]['reserve_date'] = DataSet[mod]['reserve_datetime'].dt.date
    DataSet[mod].drop(['reserve_datetime', 'visit_datetime'], axis= 1, inplace= True)
    #### !!! delete dayoff while restain reserve visitors for other features later on, updated 2018/1/26 22:37
    #tmpdf = pd.DataFrame(data=ApplyDayoff(DataSet[mod]['visit_date'].values, DataSet[mod]['reserve_date'].values),index=DataSet[mod].index, columns=['reserve_date_diff'])
    #tmpdf = pd.concat([DataSet[mod], tmpdf], axis=1)
    #tmpdf = tmpdf.groupby(['%s_store_id' % reserve2id[mod], 'visit_date'], as_index=False).agg({'reserve_visitors': sum, 'reserve_date_diff': ['mean', 'median']})
    #tmpdf.columns = ['%s_store_id' % reserve2id[mod], 
#                    'visit_date', 
#                    '%s_reserved_visitors' % reserve2id[mod], 
#                    '%s_reserved_dayoff_mean' % reserve2id[mod], 
#                    '%s_reserved_dayoff_median' % reserve2id[mod]
#                   ]
    tmpdf = DataSet[mod].groupby(['%s_store_id' % reserve2id[mod], 'visit_date'], as_index=False).agg({'reserve_visitors': sum})
    tmpdf.columns = ['%s_store_id' % reserve2id[mod], 'visit_date', '%s_reserved_visitors' % reserve2id[mod]]
    end1 = time.time()
    DataSet[mod] = tmpdf
    DataSet[mod]['%s_reserved_visitors' % reserve2id[mod]] = np.log1p(DataSet[mod]['%s_reserved_visitors' % reserve2id[mod]])
print(' process reservation data done.\n')
# join reservation data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    # merge
    for rtype in ['ar', 'hr']: 
        if((rtype == 'hr') & (('%s_store_id' % reserve2id[rtype]) not in data.columns)):
            data = data.merge(DataSet['id'], how= 'left', on= ['air_store_id'])
        data = data.merge(DataSet[rtype], how= 'left', on= ['%s_store_id' % reserve2id[rtype], 'visit_date'])
#     data['air_reserved_visitors'] = np.log1p(data['air_reserved_visitors'])
#     data['hpg_reserved_visitors'] = np.log1p(data['hpg_reserved_visitors'])
    data['reserved_visitors'] = (data['air_reserved_visitors'] + data['hpg_reserved_visitors'])/2
#     data['reserved_dayoff_mean'] = (data['air_reserved_dayoff_mean'] + data['hpg_reserved_dayoff_mean'])/2
#     data['reserved_dayoff_median'] = (data['air_reserved_dayoff_median'] + data['hpg_reserved_dayoff_median'])/2
    for rtype in ['ar', 'hr']:
        # updated 2017/12/29 13:00
        data['%s_reserved_visitors' % reserve2id[rtype]].fillna(0, inplace= True)
#         data['%s_reserved_dayoff_mean' % reserve2id[rtype]].fillna(0, inplace= True)
#         data['%s_reserved_dayoff_median' % reserve2id[rtype]].fillna(0, inplace= True)
    data['reserved_visitors'].fillna(0, inplace= True)
#     data['reserved_dayoff_mean'].fillna(0, inplace= True)
#     data['reserved_dayoff_median'].fillna(0, inplace= True)
    DataSet[mod] = data
print('============= join reservation data done.\n')
####### time series related
s = time.time()
# mix train with test
DataSet['tra']['is_train'] = 1
DataSet['tes']['is_train'] = 0
AllData = pd.concat([DataSet['tra'], DataSet['tes']], axis= 0, ignore_index= True)
# !!! dividing into two pieces since 2016/7/1 is a corner point, update time 2017/12/22 15:45
DataParts = {
    '0': AllData[AllData['visit_date'] < datetime.date(2016, 7, 1)],
    '1': AllData[AllData['visit_date'] >= datetime.date(2016, 7, 1)]
}
## features for rolling average visitors
for pidx in DataParts.keys(): 
    groupped = DataParts[pidx].groupby(['air_store_id'])
    visitor_ticks = [39, 46, 53, 60, 67, 74]#, 81, 88, 95, 102, 109, 116, 123]  # for days
    print('total groups %s ' % len(groupped.groups))
    dfs = []
    for g in groupped.groups: 
        gdf = groupped.get_group(g).sort_values(by= ['visit_date'])
        for t in visitor_ticks:
            gdf['visitor_tick_sum_%s' % t] = np.log1p(gdf['visitors']).rolling(window= t).sum()
            gdf['visitor_tick_sum_%s' % t].fillna(0, inplace= True)
            gdf['reserve_visitor_tick_sum_%s' % t] = np.log1p(gdf['air_reserved_visitors']).rolling(window= t).sum()
            gdf['reserve_visitor_tick_sum_%s' % t].fillna(0, inplace= True)
        dfs.append(gdf)
    # concate
    tmpdf = pd.concat(dfs, axis= 0, ignore_index= True)
    join_cols = ['air_store_id', 'visit_date']
    rolling_visitors_cols = []
    rolling_reserve_visitors_cols = []
    for i in range(len(visitor_ticks)):
        if(i == 0):
            continue
        # rolling visitors mean for one week
        k_mean = 'rolling_visitors_%s_%s' % (visitor_ticks[i], visitor_ticks[i - 1])
        tmpdf[k_mean] = (tmpdf['visitor_tick_sum_%s' % visitor_ticks[i]] - tmpdf['visitor_tick_sum_%s' % visitor_ticks[i - 1]]) / (visitor_ticks[i] - visitor_ticks[i - 1])
        tmpdf.loc[tmpdf[k_mean] < 0, k_mean] = -1  ## negative values exists, need to be set zero, updated 2016/12/22 20:30
        join_cols.append(k_mean)
        rolling_visitors_cols.append(k_mean)
        # rolling reserve visitors mean for one week
        k_mean = 'rolling_reserve_visitors_%s_%s' % (visitor_ticks[i], visitor_ticks[i - 1])
        tmpdf[k_mean] = (tmpdf['reserve_visitor_tick_sum_%s' % visitor_ticks[i]] - tmpdf['reserve_visitor_tick_sum_%s' % visitor_ticks[i - 1]]) / (visitor_ticks[i] - visitor_ticks[i - 1])
        tmpdf.loc[tmpdf[k_mean] < 0, k_mean] = -1  ## negative values exists, need to be set zero, updated 2016/12/22 20:30
        join_cols.append(k_mean)
        rolling_reserve_visitors_cols.append(k_mean)
    tmpdf.drop(['visitor_tick_sum_%s' % col for col in visitor_ticks], axis= 1, inplace= True)
    tmpdf.drop(['reserve_visitor_tick_sum_%s' % col for col in visitor_ticks], axis= 1, inplace= True)
    #### !!! updated 2018/1/16 22:55
    ## gap for rolling_visitors/rolling_reserve_visitors respectively, grasp variance of rolling features
    for i in range(len(rolling_visitors_cols) - 1):
        rolling_gap = '%s_%s_var' % (rolling_visitors_cols[i], rolling_visitors_cols[i + 1])
        smooth_val = tmpdf[rolling_visitors_cols[i + 1]] + 1
        tmpdf[rolling_gap] = (tmpdf[rolling_visitors_cols[i]] - tmpdf[rolling_visitors_cols[i + 1]])/smooth_val
        join_cols.append(rolling_gap)
    for i in range(len(rolling_reserve_visitors_cols) - 1):
        rolling_gap = '%s_%s_var' % (rolling_reserve_visitors_cols[i], rolling_reserve_visitors_cols[i + 1])
        smooth_val = tmpdf[rolling_reserve_visitors_cols[i + 1]] + 1
        tmpdf[rolling_gap] = (tmpdf[rolling_reserve_visitors_cols[i]] - tmpdf[rolling_reserve_visitors_cols[i + 1]])/smooth_val
        join_cols.append(rolling_gap)
    ## difference between rolling_visitors and rolling_reserve_visitors, grasp difference between rolling visitors and reserve visitors
    rolling_diff_cols = []
    for i in range(len(rolling_visitors_cols)):
        rolling_diff = '%s_%s_diff' % (rolling_visitors_cols[i], rolling_reserve_visitors_cols[i])
        smooth_val = tmpdf[rolling_reserve_visitors_cols[i]] + 1
        tmpdf[rolling_diff] = (tmpdf[rolling_visitors_cols[i]] - tmpdf[rolling_reserve_visitors_cols[i]])/smooth_val
        join_cols.append(rolling_diff)
        rolling_diff_cols.append(rolling_diff)
    ## gap for rolling_visitors_reserve_visitors_gap, grasp variance of rolling diff
    for i in range(len(rolling_diff_cols) - 1):
        rolling_diff_gap = '%s_%s_var' % (rolling_diff_cols[i], rolling_diff_cols[i + 1])
        smooth_val = tmpdf[rolling_diff_cols[i + 1]] + 1
        tmpdf[rolling_diff_gap] = (tmpdf[rolling_diff_cols[i]] - tmpdf[rolling_diff_cols[i + 1]])/smooth_val
        join_cols.append(rolling_diff_gap)
    DataParts[pidx] = DataParts[pidx].merge(tmpdf[join_cols], how= 'left', on= ['air_store_id', 'visit_date'])
    print('part %s rolling done.' % pidx)
# concat after all is done
AllData = pd.concat([DataParts['0'], DataParts['1']], axis= 0, ignore_index= True)
# restore
DataSet['tra'] = AllData[AllData['is_train'] == 1]
DataSet['tes'] = AllData[AllData['is_train'] == 0]
DataSet['tra'].drop(['is_train'], axis= 1, inplace= True)
DataSet['tes'].drop(['is_train'], axis= 1, inplace= True)
del AllData
gc.collect()
end = time.time()
print('======== add time series features done. time elapsed %s' % (end - start))

#### add date_int
for mod in ['tra', 'tes']:
    DataSet[mod]['date_int'] = DataSet[mod]['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    DataSet[mod]['date_int'] = DataSet[mod]['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
print('add date int features done.')
### add var_max_lat/var_max_long
for mod in ['tra', 'tes']:
    DataSet[mod]['lon_plus_lat_x'] = DataSet[mod]['longitude_x'] + DataSet[mod]['latitude_x'] 
    DataSet[mod]['var_max_long_x'] = DataSet[mod]['longitude_x'].max() - DataSet[mod]['longitude_x']
    DataSet[mod]['var_max_lat_x'] = DataSet[mod]['latitude_x'].max() - DataSet[mod]['latitude_x']
e = time.time()
print(' ============= add time series related features done. time elapsed %s' % (end - start))

## !!! add corner tag since 2016/7/1 is a corner point, update time 2017/12/22 15:45
for mod in ['tra', 'tes']:
    DataSet[mod]['is_up_corner'] = DataSet[mod]['visit_date'] < datetime.date(2016, 7, 1)
    DataSet[mod]['is_up_corner'] = DataSet[mod]['is_up_corner'].astype(int)
print('============ add is_up_corner feature done.')

## interactions
pairs = [('air_area_genre_store_count', 'air_area_store_count'), 
         ('air_city_genre_store_count', 'air_city_store_count'), 
         ('air_genre_store_count', 'air_area_genre_store_count'), 
         ('air_genre_store_count', 'air_city_genre_store_count'), 
         ('air_genre_store_count', 'air_city_store_count'), 
         ('air_genre_store_count', 'air_area_store_count'), 
         ('air_area_store_count', 'air_city_store_count'), 
         ('hpg_area_genre_store_count', 'hpg_area_store_count'), 
         ('hpg_city_genre_store_count', 'hpg_city_store_count'), 
         ('hpg_genre_store_count', 'hpg_city_genre_store_count'), 
         ('hpg_genre_store_count', 'hpg_area_genre_store_count')]
for mod in ['tra', 'tes']:
    for pair in pairs:
        DataSet[mod]['inter_%s_%s_multiply' % (pair[0], pair[1])] = DataSet[mod][pair[0]] * DataSet[mod][pair[1]]
        DataSet[mod]['inter_%s_%s_divide' % (pair[0], pair[1])] = DataSet[mod][pair[0]] // (1 + DataSet[mod][pair[1]])
pairs = [('air_area_genre_store_count', 'hpg_area_genre_store_count'), 
         ('air_area_store_count', 'hpg_area_store_count'), 
         ('air_city_genre_store_count', 'hpg_city_genre_store_count'), 
         ('air_city_store_count', 'hpg_city_store_count'), 
         ('air_genre_store_count', 'hpg_genre_store_count')]
for mod in ['tra', 'tes']:
    for pair in pairs:
        DataSet[mod]['inter_%s_%s_plus' % (pair[0], pair[1])] = DataSet[mod][pair[0]] + DataSet[mod][pair[1]]
        DataSet[mod]['inter_%s_%s_divide' % (pair[0], pair[1])] = DataSet[mod][pair[0]] // (1 + DataSet[mod][pair[1]])
end = time.time()
print('=========== add interaction count features done. time elapsed %s.' % (end - start))
###  remove outliers
# data_dfs = []
# astores = DataSet['tra']['air_store_id'].unique()
# print('before removing outliers, size %s, unique stores %s' % (len(DataSet['tra']), len(astores)))
# low = .00
# high = .99
# delete_num = []
# groupped = DataSet['tra'].groupby(['air_store_id', 'dow', 'is_up_corner'])
# for g in groupped.groups:
#     gdata = groupped.get_group(g)
#     n1 = len(gdata)
#     filt_df = gdata[['visitors']]
#     quant_df = filt_df.quantile([low, high])
#     filt_df = filt_df.apply(lambda x: x[(x>= quant_df.loc[low,x.name]) & (x <= quant_df.loc[high,x.name])], axis=0)
#     gdata = pd.concat([gdata.loc[:,['air_store_id', 'visit_date']], filt_df], axis=1)
#     gdata.dropna(inplace=True)
#     data_dfs.append(gdata)
#     n2 = len(gdata)
#     delete_num.append(n1 - n2)
# DataSet['tra'] = pd.concat(data_dfs, axis= 0, ignore_index= True)
# print(DataSet['tra']['visitors'].isnull().sum())
# bstores = DataSet['tra']['air_store_id'].unique()
# print('delete mean %s, sum %s' % (np.mean(delete_num), np.sum(delete_num)))
# print('after removing outliers, size %s, unique stores %s' % (len(DataSet['tra']), len(bstores)))
# 
# def calc_shifted_ewm(series, alpha, adjust=True):
#     return series.shift().ewm(alpha=alpha, adjust=adjust).mean()
# 
# for mod in ['tra', 'tes']:
#     DataSet[mod]['ewm'] = DataSet[mod].groupby(['air_store_id', 'is_up_corner', 'dow']).apply(lambda g: calc_shifted_ewm(g['visitors'], 0.1)).sort_index(level=['air_store_id', 'is_up_corner', 'dow']).values
# 
for c in ['day', 'pom', 'wom', 'woy', 'prev_is_holiday', 'next_is_holiday']:
    print(c, len(DataSet['tra'][c].value_counts()))


before:  252108
after:  252108
tra data: unique stores 829, total 252108, time elased 2.32s.
before:  32019
after:  32019
tes data: unique stores 821, total 32019, time elased 0.31s.

add city feature done.

add holiday type done.

 process reservation data done.


total groups 316 
part 0 rolling done.
total groups 829 
part 1 rolling done.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


add date int features done.
day 31
pom 3
wom 6
woy 53
prev_is_holiday 2
next_is_holiday 2


In [25]:
### fill nulls, updated 2016/12/26 14:58
# from fancyimpute.knn import KNN
# from fancyimpute import MatrixFactorization
# import time
# 
# knn_filled_cols = ['latitude_y', 'longitude_y']
# start = time.time()
# for mod in ['tra', 'tes']:
#     solver = MatrixFactorization(
#         learning_rate=0.01,
#         rank=3,
#         l2_penalty=0,
#         min_improvement=1e-6)
#     DataSet[mod][knn_filled_cols] = solver.complete(DataSet[mod][knn_filled_cols])
#     end = time.time()
#     print('%s done. time elapsed %.2fs' % (mod, (end - start)))
# print(DataSet['tra'].isnull().sum())
# end = time.time()
# print('Fill missings with MF done, time elapsed %.2fs' % (end - start))

from sklearn import *
cate_feats = ['genre_name', 'area_name', 'city']
cate_cols = ['%s_%s' % (m, cf) for m in ['air', 'hpg'] for cf in cate_feats]
DataSet['tra']['air_store_id_encoded'] = DataSet['tra']['air_store_id']
DataSet['tra']['hpg_store_id_encoded'] = DataSet['tra']['hpg_store_id']
DataSet['tes']['air_store_id_encoded'] = DataSet['tes']['air_store_id']
DataSet['tes']['hpg_store_id_encoded'] = DataSet['tes']['hpg_store_id']
cate_cols.extend(['air_store_id_encoded', 'hpg_store_id_encoded', 'pom'])#, 'day', 'wom', 'woy'])
for mod in ['tra', 'tes']:
    for col in DataSet[mod].columns:
        if(col in cate_cols):
            DataSet[mod][col].fillna('unknown', inplace= True)
        elif(col == 'latitude_y'):
            DataSet[mod][col].fillna(DataSet[mod]['latitude_x'], inplace= True)
        elif(col == 'longitude_y'):
            DataSet[mod][col].fillna(DataSet[mod]['longitude_x'], inplace= True)
        else:
            DataSet[mod][col].fillna(0, inplace= True)
print('filling missings done.')

print(DataSet['tra'].isnull().sum())

filling missings done.
air_area_genre_store_count                                            0
air_area_name                                                         0
air_area_store_count                                                  0
air_city                                                              0
air_city_genre_store_count                                            0
air_city_store_count                                                  0
air_genre_name                                                        0
air_genre_store_count                                                 0
air_reserved_visitors                                                 0
air_store_id                                                          0
day                                                                   0
dow                                                                   0
hol_days                                                              0
holiday_flg                              

In [26]:
# ### transformat skewed features
# from scipy.stats import norm, skew
# from scipy.special import boxcox1p

# drop_cols = ['id', 'air_store_id', 'visit_date', 'visitors', 'hpg_store_id', 
#              'is_train', 'hol_days', 'holiday_flg', 'is_weekends', 'latitude_x', 
#              'latitude_y']

# DataSet['tra']['is_train'] = 1
# DataSet['tes']['is_train'] = 0
# all_cols = DataSet['tra'].columns
# all_data = pd.concat([DataSet['tra'], DataSet['tes'][all_cols]], axis= 0)
# tmp_feats = all_data.dtypes[all_data.dtypes != "object"].index
# numeric_feats = [col for col in tmp_feats if col not in drop_cols]
# # Check the skew of all numerical features
# skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
# print("\nSkew in numerical features: \n")
# skewness = pd.DataFrame({'Skew' :skewed_feats})
# print(skewness)

# skewness = skewness[abs(skewness) > 0.75]
# print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

# skewed_features = skewness.index
# lam = 0.15
# for feat in skewed_features:
#     all_data[feat] = boxcox1p(all_data[feat], lam)
# DataSet['tra'] = all_data[all_data['is_train'] == 1]
# DataSet['tra'].drop(['is_train'], axis= 1, inplace= True)
# DataSet['test'] = all_data[all_data['is_train'] == 0]
# DataSet['tes'].drop(['is_train'], axis= 1, inplace= True)

In [27]:
import sys,os
from sklearn import model_selection

pca_factor = 10

def PCAFitTransform(data):
    n = len(data)
    rec_dict = {}
    for i in data.index:
        rid = data.loc[i, 'air_store_id']
        vdate = str(data.loc[i, 'visit_date'])
        visitor = data.loc[i, 'visitors']
        if(rid not in rec_dict):
            rec_dict[rid] = {}
        rec_dict[rid][vdate] = visitor
    records = []
    for r in rec_dict:
        rec = {'air_store_id': r}
        for d in rec_dict[r]:
            rec[d] = rec_dict[r][d]
        records.append(rec)
    tmpdf = pd.DataFrame(data= records, index= range(len(records)))    
    #tmpdf = tmpdf.dropna(axis= 1, how= 'any')
    tmpdf.fillna(0, inplace= True)
    pca= decomposition.PCA(n_components= pca_factor)
    pca_cols = [c for c in tmpdf.columns if(c != 'air_store_id')]
    transformed = pca.fit_transform(tmpdf[pca_cols])
    pcadf = pd.DataFrame(data= transformed, index= range(len(transformed)), columns= ['pca_%s' % i for i in range(pca_factor)])
    pcadf['air_store_id'] = tmpdf['air_store_id']
    return pcadf
    
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

#### Label encoding for categorial features
TrainData = DataSet['tra']
TestData = DataSet['tes']
for col in cate_cols:
    print(col)
    lbl = preprocessing.LabelEncoder()
    TrainData[col] = lbl.fit_transform(TrainData[col])
    TestData[col] = lbl.transform(TestData[col])
    
print('encoding for categorial features done.')

# split TrainData into train and holdout with random strategy
np.random.seed(2017)
msk = np.random.rand(len(TrainData)) < 0.1
holdout = TrainData[msk]
train = TrainData[~msk]
test = TestData
# Set up folds
K = 5
low = 0.001
high = 0.999
kf = model_selection.KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(1)
OutputDir = '../../data/l0'
if(os.path.exists('%s/kfold' % OutputDir) == False):
    os.makedirs('%s/kfold' % OutputDir)
#cate_cols.append('air_store_id')
for i, (train_index, test_index) in enumerate(kf.split(train)):
    FoldTrain, FoldValid = train.iloc[train_index].copy(), train.iloc[test_index].copy()
    FoldHoldout = holdout.copy()
    FoldTest = test.copy()
    
    FoldTrain['visitors'] = np.log1p(FoldTrain['visitors'])
    FoldValid['visitors'] = np.log1p(FoldValid['visitors'])
    FoldHoldout['visitors'] = np.log1p(FoldHoldout['visitors'])
    FoldTest['visitors'] = np.log1p(FoldTest['visitors'])
    
    FoldTrain['air_reserved_visitors'] = np.log1p(FoldTrain['air_reserved_visitors'])
    FoldValid['air_reserved_visitors'] = np.log1p(FoldValid['air_reserved_visitors'])
    FoldHoldout['air_reserved_visitors'] = np.log1p(FoldHoldout['air_reserved_visitors'])
    FoldTest['air_reserved_visitors'] = np.log1p(FoldTest['air_reserved_visitors'])
    #### dependent features which is extreemly subtle to data-leak
#     # pca transform
#     pca_cols = ['air_store_id', 'visit_date', 'visitors']
#     sd = datetime.date(2016, 4, 23)
#     ed = datetime.date(2016, 5, 31)
#     pcadf = PCAFitTransform(FoldTrain[(FoldTrain['visit_date'] >= sd) & (FoldTrain['visit_date'] <= ed)][pca_cols])
#     FoldValid = FoldValid.merge(pcadf, how= 'left', on= ['air_store_id'])
#     FoldHoldout = FoldHoldout.merge(pcadf, how= 'left', on= ['air_store_id'])
#     FoldTest = FoldTest.merge(pcadf, how= 'left', on= ['air_store_id'])
    # percentiles features
    agg_visitors_cols = []
    agg_reserve_visitors_cols = []
    tickles = ['mean', 'median', 'max', 'min', 'count']
    for feat in ['air_store_id']:
        gkeys = [feat, 'dow', 'is_up_corner']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'visitors': tickles})
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_visitors_%s' % (gprefix, m) for m in tickles])
        agg_visitors_cols.extend(['%s_visitors_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        #FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys) #### data-leak, prone to be overfitted
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid.fillna(0, inplace= True)
        FoldHoldout.fillna(0, inplace= True)
        FoldTest.fillna(0, inplace= True)
    for feat in ['air_store_id']:
        gkeys = [feat, 'dow', 'is_up_corner']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'air_reserved_visitors': tickles})
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_reserve_visitors_%s' % (gprefix, m) for m in tickles])
        agg_reserve_visitors_cols.extend(['%s_reserve_visitors_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        #FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys) #### data-leak, prone to be overfitted
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid.fillna(0, inplace= True)
        FoldHoldout.fillna(0, inplace= True)
        FoldTest.fillna(0, inplace= True)
    for idx in range(len(agg_visitors_cols)):
        agg_var = '%s_%s_diff' % (agg_visitors_cols[idx], agg_reserve_visitors_cols[idx])
        FoldValid[agg_var] = (FoldValid[agg_visitors_cols[idx]] - FoldValid[agg_reserve_visitors_cols[idx]])/(FoldValid[agg_reserve_visitors_cols[idx]] + 1)
        FoldHoldout[agg_var] = (FoldHoldout[agg_visitors_cols[idx]] - FoldHoldout[agg_reserve_visitors_cols[idx]])/(FoldHoldout[agg_reserve_visitors_cols[idx]] + 1)
        FoldTest[agg_var] = (FoldTest[agg_visitors_cols[idx]] - FoldTest[agg_reserve_visitors_cols[idx]])/(FoldTest[agg_reserve_visitors_cols[idx]] + 1)
    agg_visitors_cols = []
    agg_reserve_visitors_cols = []
    for feat in ['air_store_id']:
        gkeys = [feat, 'is_weekends', 'is_up_corner']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'visitors': tickles})
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_visitors_%s' % (gprefix, m) for m in tickles])
        agg_visitors_cols.extend(['%s_visitors_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        #FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys) #### data-leak, prone to be overfitted
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid.fillna(0, inplace= True)
        FoldHoldout.fillna(0, inplace= True)
        FoldTest.fillna(0, inplace= True)
    for feat in ['air_store_id']:
        gkeys = [feat, 'is_weekends', 'is_up_corner']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'air_reserved_visitors': tickles})
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_reserve_visitors_%s' % (gprefix, m) for m in tickles])
        agg_reserve_visitors_cols.extend(['%s_reserve_visitors_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        #FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys) #### data-leak, prone to be overfitted
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid.fillna(0, inplace= True)
        FoldHoldout.fillna(0, inplace= True)
        FoldTest.fillna(0, inplace= True)
    for idx in range(len(agg_visitors_cols)):
        agg_var = '%s_%s_diff' % (agg_visitors_cols[idx], agg_reserve_visitors_cols[idx])
        FoldValid[agg_var] = (FoldValid[agg_visitors_cols[idx]] - FoldValid[agg_reserve_visitors_cols[idx]])/(FoldValid[agg_reserve_visitors_cols[idx]] + 1)
        FoldHoldout[agg_var] = (FoldHoldout[agg_visitors_cols[idx]] - FoldHoldout[agg_reserve_visitors_cols[idx]])/(FoldHoldout[agg_reserve_visitors_cols[idx]] + 1)
        FoldTest[agg_var] = (FoldTest[agg_visitors_cols[idx]] - FoldTest[agg_reserve_visitors_cols[idx]])/(FoldTest[agg_reserve_visitors_cols[idx]] + 1)
    agg_visitors_cols = []
    agg_reserve_visitors_cols = []
    for feat in ['air_store_id']:
        gkeys = [feat, 'hol_days', 'is_up_corner']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'visitors': tickles})
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_visitors_%s' % (gprefix, m) for m in tickles])
        agg_visitors_cols.extend(['%s_visitors_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        #FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys) #### data-leak, prone to be overfitted
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid.fillna(0, inplace= True)
        FoldHoldout.fillna(0, inplace= True)
        FoldTest.fillna(0, inplace= True)
    for feat in ['air_store_id']:
        gkeys = [feat, 'hol_days', 'is_up_corner']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'air_reserved_visitors': tickles})
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_reserve_visitors_%s' % (gprefix, m) for m in tickles])
        agg_reserve_visitors_cols.extend(['%s_reserve_visitors_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        #FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys) #### data-leak, prone to be overfitted
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid.fillna(0, inplace= True)
        FoldHoldout.fillna(0, inplace= True)
        FoldTest.fillna(0, inplace= True)
    for idx in range(len(agg_visitors_cols)):
        agg_var = '%s_%s_diff' % (agg_visitors_cols[idx], agg_reserve_visitors_cols[idx])
        FoldValid[agg_var] = (FoldValid[agg_visitors_cols[idx]] - FoldValid[agg_reserve_visitors_cols[idx]])/(FoldValid[agg_reserve_visitors_cols[idx]] + 1)
        FoldHoldout[agg_var] = (FoldHoldout[agg_visitors_cols[idx]] - FoldHoldout[agg_reserve_visitors_cols[idx]])/(FoldHoldout[agg_reserve_visitors_cols[idx]] + 1)
        FoldTest[agg_var] = (FoldTest[agg_visitors_cols[idx]] - FoldTest[agg_reserve_visitors_cols[idx]])/(FoldTest[agg_reserve_visitors_cols[idx]] + 1)
    if(i == 0):
        for c in FoldValid.columns:
            print(c)
        print('----------------------------------')
#         print(FoldValid.isnull().sum())
    FoldOutputDir = '%s/kfold/%s' % (OutputDir, i)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    FoldValid.to_csv('%s/valid.csv' % FoldOutputDir, index= False)
    FoldHoldout.to_csv('%s/holdout.csv' % FoldOutputDir, index= False)
    FoldTest.to_csv('%s/test.csv' % FoldOutputDir, index= False)
    print('Fold %s done.' % i)

air_genre_name
air_area_name
air_city
hpg_genre_name
hpg_area_name
hpg_city
air_store_id_encoded
hpg_store_id_encoded
pom
encoding for categorial features done.
air_area_genre_store_count
air_area_name
air_area_store_count
air_city
air_city_genre_store_count
air_city_store_count
air_genre_name
air_genre_store_count
air_reserved_visitors
air_store_id
day
dow
hol_days
holiday_flg
hpg_area_genre_store_count
hpg_area_name
hpg_area_store_count
hpg_city
hpg_city_genre_store_count
hpg_city_store_count
hpg_genre_name
hpg_genre_store_count
hpg_reserved_visitors
hpg_store_id
id
is_weekends
latitude_x
latitude_y
longitude_x
longitude_y
month
next_is_holiday
pom
prev_is_holiday
reserved_visitors
visit_date
visitors
wom
woy
rolling_visitors_46_39
rolling_reserve_visitors_46_39
rolling_visitors_53_46
rolling_reserve_visitors_53_46
rolling_visitors_60_53
rolling_reserve_visitors_60_53
rolling_visitors_67_60
rolling_reserve_visitors_67_60
rolling_visitors_74_67
rolling_reserve_visitors_74_67
rolling_v