In [69]:
import numpy as np
import pandas as pd
import datetime
import time
import numba
import os,sys
import gc

def LoadData(InputDir):
    """"""
    ## load raw data
    data = {
        'tra': pd.read_csv('%s/air_visit_data.csv' % InputDir, parse_dates= ['visit_date']),
        'as': pd.read_csv('%s/air_store_info.csv' % InputDir),
        'hs': pd.read_csv('%s/hpg_store_info.csv' % InputDir),
        'ar': pd.read_csv('%s/air_reserve.csv' % InputDir, parse_dates= ['visit_datetime', 'reserve_datetime']),
        'hr': pd.read_csv('%s/hpg_reserve.csv' % InputDir, parse_dates= ['visit_datetime', 'reserve_datetime']),
        'id': pd.read_csv('%s/store_id_relation.csv' % InputDir),
        'tes': pd.read_csv('%s/sample_submission.csv' % InputDir),
        'hol': pd.read_csv('%s/date_info.csv' % InputDir, parse_dates=['calendar_date']).rename(columns={'calendar_date': 'visit_date'})
    }
    return data

@numba.jit
def ApplyDayoff(VisitCols, ReserveCols):
    """"""
    n = len(VisitCols)
    result = np.zeros((n, 1), dtype= 'int8')
    for i in range(n):
        result[i] = (VisitCols[i]- ReserveCols[i]).days
    return result

reserve2id = {'ar': 'air', 'hr': 'hpg'}
reserve2store = {'ar': 'as', 'hr': 'hs'}# load data set
InputDir = '../../data/raw'
DataSet = LoadData(InputDir)
#### 
# date related features
print('\n============')
for mod in ['tra', 'tes']:
    start0 = time.time()
    if (mod == 'tes'):
        DataSet[mod]['visit_date'] = DataSet[mod]['id'].map(lambda x: str(x).split('_')[2])
        DataSet[mod]['air_store_id'] = DataSet[mod]['id'].map(lambda x: '_'.join(x.split('_')[:2]))
        DataSet[mod]['visit_date'] = pd.to_datetime(DataSet[mod]['visit_date'])
    DataSet[mod]['dow'] = DataSet[mod]['visit_date'].dt.dayofweek
    DataSet[mod]['year'] = DataSet[mod]['visit_date'].dt.year
    DataSet[mod]['month'] = DataSet[mod]['visit_date'].dt.month
    DataSet[mod]['visit_date'] = DataSet[mod]['visit_date'].dt.date
    end0 = time.time()
    print('%s data: unique stores %s, total %s, time elased %.2fs.' %
            (mod, len(DataSet[mod]['air_store_id'].unique()), len(DataSet[mod]['air_store_id']), (end0 - start0)))
print('============= process date related done.\n')
######## store data
# add city feature
for mod in ['ar', 'hr']:
    DataSet[reserve2store[mod]]['%s_city' % reserve2id[mod]] = DataSet[reserve2store[mod]]['%s_area_name' % reserve2id[mod]].str[:5]
# area (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_area_name' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_area_name' % reserve2id[mod]] = g
        ac['%s_area_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        #ac['%s_area_store_ratio' % reserve2id[mod]] = ac['%s_area_store_count' % reserve2id[mod]]/len(DataSet[reserve2store[mod]])
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_area_name' % reserve2id[mod]])
# genre (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_genre_name' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_genre_name' % reserve2id[mod]] = g
        ac['%s_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_genre_name' % reserve2id[mod]])
#  area_genre (store) count 
for mod in ['ar', 'hr']:
    rec = []
    groupby_keys = ['%s_area_name' % reserve2id[mod], '%s_genre_name' % reserve2id[mod]]
    groupped = DataSet[reserve2store[mod]].groupby(groupby_keys)
    for g in groupped.groups:
        ac = {}
        ac['%s_area_name' % reserve2id[mod]] = g[0]
        ac['%s_genre_name' % reserve2id[mod]] = g[1]
        ac['%s_area_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= groupby_keys)
# city (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_city' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_city' % reserve2id[mod]] = g
        ac['%s_city_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        #ac['%s_area_store_ratio' % reserve2id[mod]] = ac['%s_area_store_count' % reserve2id[mod]]/len(DataSet[reserve2store[mod]])
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_city' % reserve2id[mod]])
#  city_genre (store) count 
for mod in ['ar', 'hr']:
    rec = []
    groupby_keys = ['%s_city' % reserve2id[mod], '%s_genre_name' % reserve2id[mod]]
    groupped = DataSet[reserve2store[mod]].groupby(groupby_keys)
    for g in groupped.groups:
        ac = {}
        ac['%s_city' % reserve2id[mod]] = g[0]
        ac['%s_genre_name' % reserve2id[mod]] = g[1]
        ac['%s_city_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= groupby_keys)
print(' ============= process store data done.\n')
######### holiday data
data = DataSet['hol']
data['visit_date'] = data['visit_date'].dt.date
data = data.sort_values(by= 'visit_date')
def TagHoliday(df):
    ''''''
    n = len(df)
    result = ['' for x in range(n)]
    for i in range(n):
        if(i == 0):
            result[i] = 'hid_%s' % 0
        elif((df[i] - df[i-1]).days == 1):
            result[i] = result[i - 1]
        else:
            result[i] = 'hid_%s' % (int(result[i - 1].split('_')[1]) + 1)
    return result
holidays = data[data['holiday_flg'] == 1][['visit_date']]
holidays['hol_l0'] = TagHoliday(holidays['visit_date'].values)
groupped = holidays.groupby(['hol_l0'])
recs = []
for g in groupped.groups:
    hol_days = {}
    hol_days['hol_l0'] = g
    hol_days['hol_days'] = len(groupped.get_group(g))
    recs.append(hol_days)
tmpdf = pd.DataFrame(data= recs, index= range(len(recs)))
holidays = holidays.merge(tmpdf, how= 'left', on= 'hol_l0')
data = data.merge(holidays, how= 'left', on= 'visit_date')
data.drop(['hol_l0'], axis= 1, inplace= True)
data['hol_days'].fillna(0, inplace= True)
DataSet['hol'] = data
print(' ============= process holiday data done.\n')
######## join 
# join holiday data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    data = data.merge(DataSet['hol'], how='left', on=['visit_date'])
    data.drop(['day_of_week', 'year'], axis=1, inplace=True)
    DataSet[mod] = data
# join store data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    for rtype in ['ar', 'hr']: 
        if((rtype == 'hr') & (('%s_store_id' % reserve2id[rtype]) not in data.columns)):
            data = data.merge(DataSet['id'], how= 'left', on= ['air_store_id'])
        data = data.merge(DataSet[reserve2store[rtype]], how= 'left', on= ['%s_store_id' % reserve2id[rtype]])
    DataSet[mod] = data
print('================ join holiday, store data done.')
######### reservation data
for mod in ['hr', 'ar']:
    start1 = time.time()
    DataSet[mod]['visit_date'] = DataSet[mod]['visit_datetime'].dt.date
    DataSet[mod]['reserve_date'] = DataSet[mod]['reserve_datetime'].dt.date
    DataSet[mod].drop(['reserve_datetime', 'visit_datetime'], axis= 1, inplace= True)
    tmpdf = pd.DataFrame(data=ApplyDayoff(DataSet[mod]['visit_date'].values, DataSet[mod]['reserve_date'].values),index=DataSet[mod].index, columns=['reserve_date_diff'])
    tmpdf = pd.concat([DataSet[mod], tmpdf], axis=1)
    tmpdf = tmpdf.groupby(['%s_store_id' % reserve2id[mod], 'visit_date'], as_index=False).agg({'reserve_visitors': sum, 'reserve_date_diff': ['mean', 'median']})
    tmpdf.columns = ['%s_store_id' % reserve2id[mod], 
                   'visit_date', 
                   '%s_reserved_visitors' % reserve2id[mod], 
                   '%s_reserved_dayoff_mean' % reserve2id[mod], 
                   '%s_reserved_dayoff_median' % reserve2id[mod]
                  ]
    end1 = time.time()
    DataSet[mod] = tmpdf
    DataSet[mod]['%s_reserved_visitors' % reserve2id[mod]] = np.log1p(DataSet[mod]['%s_reserved_visitors' % reserve2id[mod]])
print(' ============= process reservation data done.\n')
# join reservation data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    for rtype in ['ar', 'hr']: 
        if((rtype == 'hr') & (('%s_store_id' % reserve2id[rtype]) not in data.columns)):
            data = data.merge(DataSet['id'], how= 'left', on= ['air_store_id'])
        data = data.merge(DataSet[rtype], how= 'left', on= ['%s_store_id' % reserve2id[rtype], 'visit_date'])
    DataSet[mod] = data
#print(DataSet['tra'][['air_store_id', 'visit_date', 'air_reserved_visitors', 'air_reserved_dayoff_mean']].head(100))
print('============= join reservation data done.\n')
####### add rolling features
s = time.time()

# mix train with test
DataSet['tra']['is_train'] = 1
DataSet['tes']['is_train'] = 0
AllData = pd.concat([DataSet['tra'], DataSet['tes']], axis= 0, ignore_index= True)
groupped = AllData.groupby(['air_store_id'])
visitor_ticks = [39, 46, 60, 74]
reservation_ticks = [7, 14, 21, 28]
print('total groups %s ' % len(groupped.groups))
dfs = []
# rolling sum
for g in groupped.groups: 
    gdf = groupped.get_group(g).sort_values(by= ['visit_date'])
    for t in visitor_ticks:
        gdf['visitor_tick_%s' % t] = np.log1p(gdf['visitors']).rolling(window= t).sum()
        gdf['visitor_tick_%s' % t].fillna(0, inplace= True)
#     for t in reservation_ticks:
#         for mod in ['air', 'hpg']:
#             gdf['reservation_%s_rolling_%s_sum' % (mod, t)] = gdf['%s_reserved_visitors' % mod].rolling(window= t).sum()
#             gdf['reservation_%s_rolling_%s_sum' % (mod, t)].fillna(0, inplace= True)
    gdf['holiday_rolling_3'] = gdf['holiday_flg'].rolling(window= 3).sum()
    gdf['holiday_rolling_3'].fillna(0, inplace= True)
#     gdf['holiday_rolling_2'] = gdf['holiday_flg'].rolling(window= 2).sum()
#     gdf['holiday_rolling_2'].fillna(0, inplace= True)
    dfs.append(gdf)
# concate
tmpdf = pd.concat(dfs, axis= 0, ignore_index= True)
join_cols = ['air_store_id', 'visit_date', 'holiday_rolling_3']
for i in range(len(visitor_ticks)):
    if(i == 0):
        continue
    k = 'visitor_rolling_%s_%s_mean' % (visitor_ticks[i], visitor_ticks[i - 1])
    tmpdf[k] = (tmpdf['visitor_tick_%s' % visitor_ticks[i]] - tmpdf['visitor_tick_%s' % visitor_ticks[i - 1]]) / (visitor_ticks[i] - visitor_ticks[i - 1])
    tmpdf[k].fillna(0, inplace= True)
    join_cols.append(k)
# for t in reservation_ticks:
#     for mod in ['air', 'hpg']:
#         join_cols.append('reservation_%s_rolling_%s_sum' % (mod, t))
# merge
tmpdf.drop(['visitor_tick_%s' % col for col in visitor_ticks], axis= 1, inplace= True)
AllData = AllData.merge(tmpdf[join_cols], how= 'left', on= ['air_store_id', 'visit_date'])
# restore
DataSet['tra'] = AllData[AllData['is_train'] == 1]
DataSet['tes'] = AllData[AllData['is_train'] == 0]
DataSet['tra'].drop(['is_train'], axis= 1, inplace= True)
DataSet['tes'].drop(['is_train'], axis= 1, inplace= True)
del AllData
vdf = DataSet['tra'][DataSet['tra']['air_store_id'] == 'air_ba937bf13d40fb24']
#print(vdf[join_cols].head(200))

e = time.time()
print('time elapsed %s' % ((e - s) * 60))
print(' ============= add rolling features done.\n')


tra data: unique stores 829, total 252108, time elased 1.45s.
tes data: unique stores 821, total 32019, time elased 0.13s.





total groups 829 
time elapsed 596.9517660140991



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [70]:
#### Label encoding for categorial features
from sklearn import *

cate_feats = ['genre_name', 'area_name', 'city']
cate_cols = ['%s_%s' % (m, cf) for m in ['air', 'hpg'] for cf in cate_feats]
for mod in ['tra', 'tes']:
    for col in DataSet[mod].columns:
        if(col in cate_cols):
            DataSet[mod][col].fillna('unknown', inplace= True)
        else:
            DataSet[mod][col].fillna(-1, inplace= True)
print('Categorical features ', cate_cols)
TrainData = DataSet['tra']
TestData = DataSet['tes']
for col in cate_cols:
    lbl = preprocessing.LabelEncoder()
    TrainData[col] = lbl.fit_transform(TrainData[col])
    TestData[col] = lbl.transform(TestData[col])

Categorical features  ['air_genre_name', 'air_area_name', 'air_city', 'hpg_genre_name', 'hpg_area_name', 'hpg_city']


In [78]:
import lightgbm
import sys,os

def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

#print('All features ', col)
test = TestData

# split TrainData into train and holdout with random strategy
np.random.seed(2017)
msk = np.random.rand(len(TrainData)) < 0.1
holdout = TrainData[msk]
train = TrainData[~msk]

# split TrainData into train and holdout with date range strategy
# holdout = TrainData[(TrainData['visit_date'] >= datetime.date(2017, 3, 16))]
# train = TrainData[(TrainData['visit_date'] < datetime.date(2017, 3, 16)) & 
#                   (TrainData['visit_date'] >= datetime.date(2016, 4, 1))]

# for test
y_test_pred = 0
# Set up folds
K = 5
kf = model_selection.KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(1)
# parameters
params = {
    "boosting": "gbdt",
    "objective": "regression_l2",
    "lambda_l2": 0.0001,
#     "objective": "fair",
#     "fair_c": 200.0,

    "num_iterations": 400,
    "learning_rate": 0.2,
    "min_data_in_leaf": 100,
    "max_depth": 60, 

    "feature_fraction": 0.9,
    "bagging_fraction": 0.85,
    "bagging_freq": 12, 
    "min_hessian": 0.001,

    "max_bin": 127,
}
# Run CV
cv_score = .0
holdout_score = .0
start_time = datetime.datetime.now()
for i, (train_index, test_index) in enumerate(kf.split(train)):
    FoldTrain, FoldValid = train.iloc[train_index].copy(), train.iloc[test_index].copy()
    FoldHoldout = holdout.copy()
    FoldTest = test.copy()
    FoldTrain['visitors'] = np.log1p(FoldTrain['visitors'])
    FoldValid['visitors'] = np.log1p(FoldValid['visitors'])
    FoldHoldout['visitors'] = np.log1p(FoldHoldout['visitors'])
    FoldTest['visitors'] = np.log1p(FoldTest['visitors'])
    #### dependent features which is extreemly subtle to data-leak
    # percentiles features
    tickles = ['mean', 'median', 'max', 'min', 'count']
    for feat in ['air_store_id']:
        gkeys = [feat, 'dow']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'visitors': tickles})    
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
    for feat in ['air_city']:
        gkeys = [feat, 'air_genre_name']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'visitors': tickles})    
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
    
    col = [c for c in FoldTrain.columns if c not in ['id', 'air_store_id', 'visit_date', 'visitors', 'hpg_store_id']]
    # train
    d_cv = lightgbm.Dataset(FoldTrain[col], label= FoldTrain['visitors'].values, max_bin= params['max_bin'], silent= True, free_raw_data= True)
    model = lightgbm.train(params, d_cv)
    # for valid
    pred = model.predict(FoldValid[col])
    rmsle_valid = RMSLE(FoldValid['visitors'].values, pred)
    cv_score += rmsle_valid
    # for holdout
    pred = model.predict(FoldHoldout[col])
    rmsle_holdout = RMSLE(FoldHoldout['visitors'].values, pred)
    holdout_score += rmsle_holdout
    # for test
    pred = model.predict(FoldTest[col])
    y_test_pred += pred

    print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (i, rmsle_valid, rmsle_holdout, len(FoldValid)))
y_test_pred /= K  # Average test set predictions
cv_score /= K # Average valid set predictions
holdout_score /= K # Average holdout set predictions

# Create submission file
sub = pd.DataFrame()
sub['id'] = test['id']
sub['visitors'] = np.expm1(y_test_pred)
OutputFileName = 'lgb_submit_%s' % (datetime.datetime.now().strftime("%Y-%m-%d"))
sub.to_csv('%s.csv' % OutputFileName, float_format='%.6f', index=False)
os.system('zip %s.zip %s.csv' % (OutputFileName, OutputFileName))

finish_time = datetime.datetime.now()
elapsed = (finish_time - start_time).seconds
print('\n======================')
print("CV score %.6f, Holdout score %.6f, Elapsed time: %.2fs" % (cv_score, holdout_score, elapsed))
print('======================\n')

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


fold 0: valid score 0.505957, holdout score 0.506731, valid length 45371
fold 1: valid score 0.510253, holdout score 0.513754, valid length 45371
fold 2: valid score 0.507397, holdout score 0.512314, valid length 45371
fold 3: valid score 0.511127, holdout score 0.512553, valid length 45370
fold 4: valid score 0.509971, holdout score 0.513811, valid length 45370

CV score 0.508941, Holdout score 0.511833, Elapsed time: 24.00s



In [81]:
# train = DataSet['tra']
# train['visitors'] = np.log1p(train['visitors'])
# test = DataSet['tes']
# #### dependent features which is extreemly subtle to data-leak
# # percentiles features
# tickles = ['mean', 'median', 'max', 'min', 'count']
# for feat in ['air_store_id']:
#     gkeys = [feat, 'dow']
#     gprefix = '_'.join(gkeys)
#     TmpDOW = train.groupby(gkeys, as_index= False).agg({'visitors': tickles})    
#     tmpcols = gkeys.copy()
#     tmpcols.extend(['%s_%s' % (gprefix, m) for m in tickles])
#     TmpDOW.columns = tmpcols
#     train = train.merge(TmpDOW, how= 'left', on=gkeys)
#     test = test.merge(TmpDOW, how= 'left', on=gkeys)
# for feat in ['air_city']:
#     gkeys = [feat, 'air_genre_name']
#     gprefix = '_'.join(gkeys)
#     TmpDOW = train.groupby(gkeys, as_index= False).agg({'visitors': tickles})    
#     tmpcols = gkeys.copy()
#     tmpcols.extend(['%s_%s' % (gprefix, m) for m in tickles])
#     TmpDOW.columns = tmpcols
#     train = train.merge(TmpDOW, how= 'left', on=gkeys)
#     test = test.merge(TmpDOW, how= 'left', on=gkeys)
    
# col = [c for c in train.columns if c not in ['id', 'air_store_id', 'visit_date', 'visitors', 'hpg_store_id']]
# # train
# d_cv = lightgbm.Dataset(train[col], label= train['visitors'].values, max_bin= params['max_bin'], silent= True, free_raw_data= True)
# model = lightgbm.train(params, d_cv)
# #
# pred = model.predict(test[col])
# # Create submission file
# sub = pd.DataFrame()
# sub['id'] = test['id']
# sub['visitors'] = np.expm1(y_test_pred)
# OutputFileName = 'lgb_submit_%s' % (datetime.datetime.now().strftime("%Y-%m-%d"))
# sub.to_csv('%s.csv' % OutputFileName, float_format='%.6f', index=False)
# os.system('zip %s.zip %s.csv' % (OutputFileName, OutputFileName))

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


0