# TODO: Performance wasn't very good. Predict week by week and calc moving average for next week!
Steps: 
- a) Split up test set by week
- b) Give week 1 the same EWMA as the last week in the training set.
- c) Predict week 1
- d) Calc EWMA for following week
- e) Predict following week
- f) Repeat d & e until all predictions are made.

# Import libraries

In [179]:
import os
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-7.2.0-posix-seh-rt_v5-rev1\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import gc # Note: this is a garbage collector
%matplotlib inline

import warnings
warnings.filterwarnings('ignore') 

# Bring in data

In [180]:
PATH = r"..\Raw_Data"

In [181]:
data = {
    'airRes':    pd.read_csv(PATH + r"\air_reserve.csv"),
    'airStore':  pd.read_csv(PATH + r"\air_store_info.csv"),
    'airVisit':  pd.read_csv(PATH + r"\air_visit_data.csv"),
    'date':      pd.read_csv(PATH + r"\date_info.csv"),
    'hpgRes':    pd.read_csv(PATH + r"\hpg_reserve.csv"),
    'hpgStore':  pd.read_csv(PATH + r"\hpg_store_info.csv"),
    'sampleSub': pd.read_csv(PATH + r"\sample_submission.csv"),
    'storeIDs':  pd.read_csv(PATH + r"\store_id_relation.csv")    
}

In [182]:
#data['airStore'].tail(50)

In [183]:
#data['airVisit'].head(15)
#data['airRes'].reserve_datetime.max()

# Preprocess data

In [184]:
data['date']['visit_date'] = pd.to_datetime(data['date']['calendar_date'])
data['date'].drop('calendar_date', axis = 1 , inplace=True)

In [185]:
data['airVisit']['visit_date'] = pd.to_datetime(data['airVisit']['visit_date'])
data['airVisit']['dow'] = data['airVisit']['visit_date'].dt.dayofweek
data['airVisit']['year'] = data['airVisit']['visit_date'].dt.year
data['airVisit']['month'] = data['airVisit']['visit_date'].dt.month

## Add EWMA of visits as feature

In [186]:
# Function to calculate ewm (note, found this on the discussion forum):
def calc_shifted_ewm(series, alpha, adjust = True):
    return series.shift().ewm(alpha = alpha, adjust = adjust).mean()

In [187]:
# Step below adds the ewm by day of week. Right now I'm returning a separate series so I can look at what each step does if I want to
tmp = data['airVisit'].groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'], 0.1)) 
# This step backfills the 1st week's dow for each restaurant, otherwise it would be NaN since it's a 1-period ewma
tmp = tmp.fillna(method='bfill')
# The groupby function returns a multiIndex Series. I only need the 3rd level (original df index) to add column to original df
tmp.index = tmp.index.get_level_values(2)
# Sort index before adding back to original df
tmp = tmp.sort_index()

In [188]:
data['airVisit']['ewma'] = tmp

## Add 'days since last' and 'days until next' holiday

In [189]:
# Create a 'days since holiday' feature
daysSinceList = []
daysSinceHol = 0 # initialize daysSince counter
for row in data['date']['holiday_flg']:
    if row == 1:
        daysSinceHol = 0
        daysSinceList.append(daysSinceHol)
    else:
        daysSinceHol += 1
        daysSinceList.append(daysSinceHol)
data['date']['days_since_holiday'] = daysSinceList

In [190]:
# Create a 'days UNTIL next holiday' feature
holidayList = list(data['date']['holiday_flg'])
daysUntilHolList = np.zeros(len(holidayList),dtype=np.int)
daysUntilHol = 0 # initialize daysUntilHol counter
for i in range(len(holidayList)-1,0,-1):
    if holidayList[i] == 1:
        daysUntilHol = 0
        daysUntilHolList[i] = daysUntilHol
    else:
        daysUntilHol += 1
        daysUntilHolList[i] = daysUntilHol        
data['date']['days_until_holiday'] = daysUntilHolList

## Merge air_visits with date dframe to get holiday info

In [191]:
colsToMerge = ['holiday_flg','visit_date','days_until_holiday','days_since_holiday']
df_train = pd.merge(data['airVisit'], data['date'][colsToMerge], how = 'left', on = 'visit_date')

In [192]:
#df_train.head()

## Filter only the stores that must be predicted

In [193]:
df_test = data['sampleSub']

df_test['visit_date'] = df_test['id'].map(lambda x: str(x).split('_')[2])
df_test['air_store_id'] = df_test['id'].map(lambda x: '_'.join(str(x).split('_')[:2]))
df_test['visit_date'] = pd.to_datetime(df_test['visit_date'])
df_test['dow'] = df_test['visit_date'].dt.dayofweek
df_test['year'] = df_test['visit_date'].dt.year
df_test['month'] = df_test['visit_date'].dt.month

unique_stores = df_test['air_store_id'].unique()
stores = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'dow': [i]*len(unique_stores)}) for i in range(7)], axis=0, ignore_index=True).reset_index(drop=True)
#stores.head()

## Merge df_test with date dframe to get holiday info

In [194]:
df_test = pd.merge(df_test, data['date'][colsToMerge], how = 'left', on = 'visit_date')

In [195]:
stores = pd.merge(stores, data['airStore'], how='left', on=['air_store_id'])
#stores.head()

In [196]:
# Encode categorical string variables 
lbl = LabelEncoder()
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

In [197]:
stores.head()

Unnamed: 0,air_store_id,dow,air_genre_name,air_area_name,latitude,longitude
0,air_00a91d42b08b08d9,0,6,44,35.694003,139.753595
1,air_0164b9927d20bcc3,0,6,62,35.658068,139.751599
2,air_0241aa3964b7f861,0,7,82,35.712607,139.779996
3,air_0328696196e46f18,0,4,98,34.701279,135.52809
4,air_034a3d5b40d5b1b1,0,2,102,34.692337,135.472229


## Add genre and area to train and test data

In [198]:
df_train = pd.merge(df_train, stores, how = 'left', on = ['air_store_id','dow'])

### Add visitor statistics as features

In [199]:
tmp = df_train.groupby(['air_store_id','dow']).agg({'visitors' : [np.min,np.mean,np.median,np.max,np.size]}).reset_index()
tmp.columns = ['air_store_id', 'dow', 'min_visitors', 'mean_visitors', 'median_visitors','max_visitors','count_observations']
#stores = pd.merge(df_train, tmp, how='left', on=['air_store_id','dow'])
df_train = pd.merge(df_train, tmp, how='left', on=['air_store_id','dow'])
#print(df_train.columns)
#print(df_train.head())
df_train.isnull().any()
#print(stores.columns)

air_store_id          False
visit_date            False
visitors              False
dow                   False
year                  False
month                 False
ewma                  False
holiday_flg           False
days_until_holiday    False
days_since_holiday    False
air_genre_name         True
air_area_name          True
latitude               True
longitude              True
min_visitors          False
mean_visitors         False
median_visitors       False
max_visitors          False
count_observations    False
dtype: bool

# CONTINUE HERE: Add statistics to test set!

In [203]:
df_test.isnull().any()

id                    False
visitors              False
visit_date            False
air_store_id          False
dow                   False
year                  False
month                 False
holiday_flg           False
days_until_holiday    False
days_since_holiday    False
air_genre_name        False
air_area_name         False
latitude              False
longitude             False
dtype: bool

In [202]:
df_test = pd.merge(df_test, stores, how = 'left', on = ['air_store_id','dow'])

In [205]:
df_test = pd.merge(df_test, tmp, how='left', on=['air_store_id','dow'])
df_test.head()

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,year,month,holiday_flg,days_until_holiday,days_since_holiday,air_genre_name,air_area_name,latitude,longitude,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,2017,4,0,6,34,6,44,35.694003,139.753595,2.0,2.0,2.0,2.0,1.0
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,2017,4,0,5,35,6,44,35.694003,139.753595,1.0,22.457143,19.0,47.0,35.0
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,2017,4,0,4,36,6,44,35.694003,139.753595,1.0,24.35,24.5,43.0,40.0
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,2017,4,0,3,37,6,44,35.694003,139.753595,15.0,28.125,28.0,52.0,40.0
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,2017,4,0,2,38,6,44,35.694003,139.753595,15.0,29.868421,30.0,47.0,38.0


For now, I'm just using fillna on the train set. TODO: investigate why there are NaN on train set

In [20]:
df_train = df_train.fillna(-1)
df_test = df_test.fillna(-1)

In [206]:
train = df_train
test = df_test
col = [c for c in train if c not in ['id', 'air_store_id','visit_date','visitors']]
#test.head()

# Initialize XGBoost
Note - code based on:
https://www.kaggle.com/jmbull/no-xgb-starter-here-s-one-lb-507

In [207]:
# XGB starter template borrowed from @anokas
# https://www.kaggle.com/anokas/simple-xgboost-starter-0-0655

print('Binding to float32')

for c, dtype in zip(df_train.columns, train.dtypes):
    if dtype == np.float64:
        train[c] = train[c].astype(np.float32)
        
for c, dtype in zip(df_test.columns, test.dtypes):
    if dtype == np.float64:
        test[c] = test[c].astype(np.float32)

Binding to float32


In [208]:
x_train = train.drop(['air_store_id','visit_date','visitors'], axis=1)
y_train = np.log1p(train['visitors'].values)

# Get Column order for x_test df
colOrder = x_train.columns

In [209]:
print(x_train.shape, y_train.shape)
#print(colOrder)

(252108, 16) (252108,)


In [210]:
#train.head()

In [211]:
# Create training / validation split
split = 200000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid; gc.collect()

Building DMatrix...


231

In [212]:
print('Training ...')

params = {}
params['objective'] = 'reg:linear'
params['eval_metric'] = 'rmse'
params['eta'] = 0.04
params['max_depth'] = 7
params['silent'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid

Training ...
[0]	train-rmse:2.35531	valid-rmse:2.31585
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[10]	train-rmse:1.61232	valid-rmse:1.58725
[20]	train-rmse:1.13761	valid-rmse:1.12118
[30]	train-rmse:0.84573	valid-rmse:0.834624
[40]	train-rmse:0.676642	valid-rmse:0.668327
[50]	train-rmse:0.585418	valid-rmse:0.578816
[60]	train-rmse:0.539181	valid-rmse:0.533684
[70]	train-rmse:0.516395	valid-rmse:0.511711
[80]	train-rmse:0.505054	valid-rmse:0.501227
[90]	train-rmse:0.499031	valid-rmse:0.496156
[100]	train-rmse:0.49554	valid-rmse:0.493567
[110]	train-rmse:0.493237	valid-rmse:0.492162
[120]	train-rmse:0.491513	valid-rmse:0.49128
[130]	train-rmse:0.490104	valid-rmse:0.490741
[140]	train-rmse:0.488671	valid-rmse:0.490254
[150]	train-rmse:0.4875	valid-rmse:0.489895
[160]	train-rmse:0.486507	valid-rmse:0.489691
[170]	train-rmse:0.485642	valid-rmse:0.489558
[180]	train-rmse:0.484919	valid-rms

In [213]:
print(clf.feature_names)
#xgb.plot_tree(clf)
#plt.show()

['dow', 'year', 'month', 'ewma', 'holiday_flg', 'days_until_holiday', 'days_since_holiday', 'air_genre_name', 'air_area_name', 'latitude', 'longitude', 'min_visitors', 'mean_visitors', 'median_visitors', 'max_visitors', 'count_observations']


### TODO: Use ewma feature in test set
1. Assign ewma to 1st week of test set using last week of training
2. Predict
3. Calc ewma for 2nd week of test set
4. predict again

In [214]:
# Get dataframe for last week of training set and only use store
x_train_lastWk = train[['air_store_id','visit_date','dow','ewma']]
x_train_lastWk = x_train_lastWk[x_train_lastWk['visit_date'] > '2017-04-15']
#print(x_train_LastWk.head(2))
#x_train_lastWk.head(10)
#print("total entries: " + str(len(x_train_LastWk)))
#print("total stores: " + str(len(x_train_LastWk['air_store_id'].unique())))

In [215]:
#x_train_LastWk['dow'] = x_train_LastWk['dow'].astype(str)
#x_train_LastWk['store_with_dow']=x_train_LastWk[['air_store_id','dow']].apply(lambda x: '_'.join(x),axis=1)
#x_train_LastWk.head()

In [216]:
#x_train_LastWk.groupby('dow')['air_store_id'].count()

In [217]:
# Split test by weeks [note that weeks start on day 6 (Sunday) rather than day 0 (Monday)]
#test['ewma'] = np.random.randint(1, 20, x_test.shape[0])
test.index = test['visit_date']
#test.head()
testWk1 = test['2017-04-23':'2017-04-29']
testWk2 = test['2017-04-30':'2017-05-06']
testWk3 = test['2017-05-07':'2017-05-13']
testWk4 = test['2017-05-14':'2017-05-20']
testWk5 = test['2017-05-21':'2017-05-27']
testWk6 = test['2017-05-28':'2017-06-01']
#testWk6.head(2)

In [218]:
testWk1_pred = pd.merge(testWk1, x_train_lastWk.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])

In [219]:
testWk1_pred['ewma'] = testWk1_pred['ewma'].fillna(value=0)
#testWk1_pred.head(10)

Command to put columns in the right order for the XGBoost prediction

In [225]:
# NOTE: USE COLUMN LIST BELOW IF NOT INCLUDING VISITOR STATISTICS IN TRAINING SET

#columnsForTest_df = ['dow', 'year', 'month', 'ewma', 'holiday_flg', 'days_until_holiday',
#       'days_since_holiday', 'air_genre_name', 'air_area_name', 'latitude',
#       'longitude']

# NOTE: USE COLUMN LIST BELOW IF INCLUDING VISITOR STATISTICS IN TRAINING SET
columnsForTest_df = ['dow', 'year', 'month', 'ewma', 'holiday_flg', 'days_until_holiday', 'days_since_holiday',
                     'air_genre_name', 'air_area_name', 'latitude', 'longitude', 'min_visitors', 'mean_visitors',
                     'median_visitors', 'max_visitors', 'count_observations']

In [226]:
x_testWk1_pred = testWk1_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk1_pred = x_testWk1_pred[columnsForTest_df]

In [227]:
d_test = xgb.DMatrix(x_testWk1_pred)
#del x_test; gc.collect()

In [228]:
print('Predicting on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predicting on test ...


308

In [229]:
np.expm1(p_test)

array([  1.90308571,  19.54558182,  24.12433434, ...,   3.55688095,
         4.78618813,   7.28006744], dtype=float32)

In [231]:
testWk1_pred['visitors'] = np.expm1(p_test)
testWk1_pred[['id','visitors']].to_csv('xgb_submission_Wk1.csv', index=False, float_format='%.3f')

### With test set Week 1 predicted, update ewma and assign to week 2

In [232]:
train_testWk1_concat = testWk1_pred[train.columns]
train_testWk1_concat = pd.concat([train, train_testWk1_concat])
train_testWk1_concat = train_testWk1_concat.reset_index()
tmp = train_testWk1_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [233]:
train_testWk1_concat['ewma'] = tmp

In [234]:
tmp = train_testWk1_concat[train_testWk1_concat['visit_date'] > testWk2.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk2_pred = pd.merge(testWk2, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk2_pred['ewma'] = testWk2_pred['ewma'].fillna(value=0)

In [235]:
x_testWk2_pred = testWk2_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk2_pred = x_testWk2_pred[columnsForTest_df]
x_testWk2_pred.columns

Index(['dow', 'year', 'month', 'ewma', 'holiday_flg', 'days_until_holiday',
       'days_since_holiday', 'air_genre_name', 'air_area_name', 'latitude',
       'longitude', 'min_visitors', 'mean_visitors', 'median_visitors',
       'max_visitors', 'count_observations'],
      dtype='object')

In [236]:
d_test = xgb.DMatrix(x_testWk2_pred)
#del x_test; gc.collect()

In [237]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


209

In [238]:
testWk2_pred['visitors'] = np.expm1(p_test)
testWk2_pred[['id','visitors']].to_csv('xgb_submission_Wk2.csv',index=False,float_format='%.3f')

### With test set Week 2 predicted, update ewma and assign to week 3

In [239]:
#train_testWk1_concat.tail(2)

In [240]:
train_testWk2_concat = testWk2_pred[train.columns]
train_testWk2_concat = pd.concat([train_testWk1_concat, train_testWk2_concat])
train_testWk2_concat = train_testWk2_concat.reset_index()
tmp = train_testWk2_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [241]:
train_testWk2_concat['ewma'] = tmp

In [242]:
tmp = train_testWk2_concat[train_testWk2_concat['visit_date'] > testWk3.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk3_pred = pd.merge(testWk3, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk3_pred['ewma'] = testWk3_pred['ewma'].fillna(value=0)

In [243]:
x_testWk3_pred = testWk3_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk3_pred = x_testWk3_pred[columnsForTest_df]

In [244]:
d_test = xgb.DMatrix(x_testWk3_pred)
#del x_test; gc.collect()

In [245]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


101

In [246]:
testWk3_pred['visitors'] = np.expm1(p_test)
testWk3_pred[['id','visitors']].to_csv('xgb_submission_Wk3.csv',index=False,float_format='%.3f')

### With test set Week 3 predicted, update ewma and assign to week 4

In [247]:
train_testWk3_concat = pd.concat([train_testWk2_concat[train.columns], testWk3_pred[train.columns]])
train_testWk3_concat = train_testWk3_concat.reset_index()
tmp = train_testWk3_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [248]:
train_testWk3_concat['ewma'] = tmp

In [249]:
tmp = train_testWk3_concat[train_testWk3_concat['visit_date'] > testWk4.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk4_pred = pd.merge(testWk4, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk4_pred['ewma'] = testWk4_pred['ewma'].fillna(value=0)

In [250]:
x_testWk4_pred = testWk4_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk4_pred = x_testWk4_pred[columnsForTest_df]

In [251]:
d_test = xgb.DMatrix(x_testWk4_pred)
#del x_test; gc.collect()

In [252]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


101

In [253]:
testWk4_pred['visitors'] = np.expm1(p_test)
testWk4_pred[['id','visitors']].to_csv('xgb_submission_Wk4.csv',index=False,float_format='%.3f')

### With test set Week 4 predicted, update ewma and assign to week 5

In [254]:
train_testWk4_concat = pd.concat([train_testWk3_concat[train.columns], testWk4_pred[train.columns]])
train_testWk4_concat = train_testWk4_concat.reset_index()
tmp = train_testWk4_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [255]:
train_testWk4_concat['ewma'] = tmp

In [256]:
tmp = train_testWk4_concat[train_testWk4_concat['visit_date'] > testWk5.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk5_pred = pd.merge(testWk5, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk5_pred['ewma'] = testWk5_pred['ewma'].fillna(value=0)

In [257]:
x_testWk5_pred = testWk5_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk5_pred = x_testWk5_pred[columnsForTest_df]

In [258]:
d_test = xgb.DMatrix(x_testWk5_pred)
#del x_test; gc.collect()

In [259]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


101

In [260]:
testWk5_pred['visitors'] = np.expm1(p_test)
testWk5_pred[['id','visitors']].to_csv('xgb_submission_Wk5.csv',index=False,float_format='%.3f')

### With test set Week 5 predicted, update ewma and assign to week 6

In [261]:
train_testWk5_concat = pd.concat([train_testWk4_concat[train.columns], testWk5_pred[train.columns]])
train_testWk5_concat = train_testWk5_concat.reset_index()
tmp = train_testWk5_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [262]:
train_testWk5_concat['ewma'] = tmp

In [263]:
tmp = train_testWk5_concat[train_testWk5_concat['visit_date'] > testWk6.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk6_pred = pd.merge(testWk6, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk6_pred['ewma'] = testWk6_pred['ewma'].fillna(value=0)

In [264]:
x_testWk6_pred = testWk6_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk6_pred = x_testWk6_pred[columnsForTest_df]

In [265]:
d_test = xgb.DMatrix(x_testWk6_pred)
#del x_test; gc.collect()

In [266]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


101

In [267]:
testWk6_pred['visitors'] = np.expm1(p_test)
testWk6_pred[['id','visitors']].to_csv('xgb_submission_Wk6.csv',index=False,float_format='%.3f')

### Compile all prediction csv and sort in order needed for submission

In [268]:
pred_wk1 = pd.read_csv('xgb_submission_Wk1.csv')
pred_wk2 = pd.read_csv('xgb_submission_Wk2.csv')
pred_wk3 = pd.read_csv('xgb_submission_Wk3.csv')
pred_wk4 = pd.read_csv('xgb_submission_Wk4.csv')
pred_wk5 = pd.read_csv('xgb_submission_Wk5.csv')
pred_wk6 = pd.read_csv('xgb_submission_Wk6.csv')

In [269]:
compiled_predictions = pd.concat([pred_wk1,pred_wk2,pred_wk3,pred_wk4,pred_wk5,pred_wk6]).sort_values(by='id')

In [270]:
compiled_predictions[['id','visitors']].to_csv('xgb_submission.csv',index=False,float_format='%.3f')

In [407]:
#print(compiled_predictions['id'].tail(20))
#print(data['sampleSub']['id'].tail(20))

# Note about cells below:

Cell below were an attempt to modify visitors manually on restaurants that did not have data for particular day-of-week on the training set, under the assumption that if the day-of-week was not included it was due to the restaurant being closed. Thus, visitors would intuitively be 0 for that date. However, it worsened the RMSE score by about 5-6%

In [404]:
chk = np.where(compiled_predictions['id'] != data['sampleSub']['id'])
print(compiled_predictions[['id']].equals(data['sampleSub'][['id']]))
print(compiled_predictions.shape)
print(data['sampleSub'].shape)

False
(32019, 2)
(32019, 7)


In [130]:
chk_days_open = data['airVisit'].groupby('air_store_id')['dow'].unique()
chk_days_open.head()

air_store_id
air_00a91d42b08b08d9    [4, 5, 0, 1, 2, 3, 6]
air_0164b9927d20bcc3       [0, 1, 2, 3, 4, 5]
air_0241aa3964b7f861    [6, 0, 1, 2, 4, 5, 3]
air_0328696196e46f18    [6, 0, 1, 2, 3, 4, 5]
air_034a3d5b40d5b1b1    [4, 5, 6, 0, 2, 3, 1]
Name: dow, dtype: object

In [133]:
chk_days_closed = pd.DataFrame(chk_days_open.values.tolist(), index=chk_days_open.index)
chk_days_closed = ~chk_days_closed.isnull()*1
chk_days_closed.head() 
#chk_days_closed['air_store_id']=chk_days_closed.index

Unnamed: 0_level_0,0,1,2,3,4,5,6
air_store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
air_00a91d42b08b08d9,1,1,1,1,1,1,1
air_0164b9927d20bcc3,1,1,1,1,1,1,0
air_0241aa3964b7f861,1,1,1,1,1,1,1
air_0328696196e46f18,1,1,1,1,1,1,1
air_034a3d5b40d5b1b1,1,1,1,1,1,1,1


In [134]:
chk_days_closed = chk_days_closed.stack()
chk_days_closed.head(20)

air_store_id           
air_00a91d42b08b08d9  0    1
                      1    1
                      2    1
                      3    1
                      4    1
                      5    1
                      6    1
air_0164b9927d20bcc3  0    1
                      1    1
                      2    1
                      3    1
                      4    1
                      5    1
                      6    0
air_0241aa3964b7f861  0    1
                      1    1
                      2    1
                      3    1
                      4    1
                      5    1
dtype: int32

In [135]:
#chk_days_closed['air_store_id'] = chk_days_closed.index
chk_days_closed = chk_days_closed.reset_index().rename(columns={'level_1': 'dow', 0: 'open_closed'})

In [147]:
chk_days_closed.head(20)

Unnamed: 0,air_store_id,dow,open_closed
0,air_00a91d42b08b08d9,0,1
1,air_00a91d42b08b08d9,1,1
2,air_00a91d42b08b08d9,2,1
3,air_00a91d42b08b08d9,3,1
4,air_00a91d42b08b08d9,4,1
5,air_00a91d42b08b08d9,5,1
6,air_00a91d42b08b08d9,6,1
7,air_0164b9927d20bcc3,0,1
8,air_0164b9927d20bcc3,1,1
9,air_0164b9927d20bcc3,2,1


In [142]:
compiled_sub = pd.DataFrame.from_csv('xgb_submission.csv')
compiled_sub = compiled_sub.reset_index()

In [143]:
compiled_sub.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,2.368
1,air_00a91d42b08b08d9_2017-04-24,23.76
2,air_00a91d42b08b08d9_2017-04-25,26.364
3,air_00a91d42b08b08d9_2017-04-26,28.635
4,air_00a91d42b08b08d9_2017-04-27,28.899


In [144]:
compiled_sub['visit_date'] = compiled_sub['id'].map(lambda x: str(x).split('_')[2])
compiled_sub['air_store_id'] = compiled_sub['id'].map(lambda x: '_'.join(str(x).split('_')[:2]))
compiled_sub['visit_date'] = pd.to_datetime(compiled_sub['visit_date'])
compiled_sub['dow'] = compiled_sub['visit_date'].dt.dayofweek
compiled_sub.head()

Unnamed: 0,id,visitors,visit_date,air_store_id,dow
0,air_00a91d42b08b08d9_2017-04-23,2.368,2017-04-23,air_00a91d42b08b08d9,6
1,air_00a91d42b08b08d9_2017-04-24,23.76,2017-04-24,air_00a91d42b08b08d9,0
2,air_00a91d42b08b08d9_2017-04-25,26.364,2017-04-25,air_00a91d42b08b08d9,1
3,air_00a91d42b08b08d9_2017-04-26,28.635,2017-04-26,air_00a91d42b08b08d9,2
4,air_00a91d42b08b08d9_2017-04-27,28.899,2017-04-27,air_00a91d42b08b08d9,3


In [148]:
updated_sub = pd.merge(compiled_sub, chk_days_closed, how='left', on=['air_store_id','dow'])

In [155]:
#updated_sub.head(50)

In [153]:
corrected_visitors = updated_sub['visitors'] * updated_sub['open_closed']

In [158]:
corrected_visitors.head(10)

0     2.368
1    23.760
2    26.364
3    28.635
4    28.899
5    34.935
6    13.303
7     4.340
8     3.381
9     5.168
dtype: float64

In [164]:
updated_pred = compiled_sub
updated_pred['visitors'] = corrected_visitors
#updated_pred.head(39)
updated_pred[['id','visitors']].to_csv('xgb_updated_submission.csv',index=False,float_format='%.3f')