# Import libraries

In [220]:
import os
# Put the MinGW-w64 bin directory at the front of PATH (';' separates PATH entries on Windows).
# NOTE(review): presumably needed so the xgboost import below can find its runtime libraries - confirm.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-7.2.0-posix-seh-rt_v5-rev1\\mingw64\\bin'
os.environ['PATH'] = ';'.join([mingw_path, os.environ['PATH']])

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import gc # Note: this is a garbage collector
%matplotlib inline

# Bring in data

In [221]:
# Directory holding the raw CSV inputs, given relative to the notebook's working directory.
# File names are appended to it with a leading backslash when the files are read.
PATH = r"..\Raw_Data"

In [222]:
# (short key, file suffix) pairs for every raw CSV; each suffix is appended to PATH as-is.
csv_suffix_by_key = [
    ('airRes',    r"\air_reserve.csv"),
    ('airStore',  r"\air_store_info.csv"),
    ('airVisit',  r"\air_visit_data.csv"),
    ('date',      r"\date_info.csv"),
    ('hpgRes',    r"\hpg_reserve.csv"),
    ('hpgStore',  r"\hpg_store_info.csv"),
    ('sampleSub', r"\sample_submission.csv"),
    ('storeIDs',  r"\store_id_relation.csv"),
]

# Load every file into a DataFrame, addressable by its short key for the rest of the notebook.
data = {key: pd.read_csv(PATH + suffix) for key, suffix in csv_suffix_by_key}

# Preprocess data

In [223]:
# Replace 'calendar_date' with a parsed datetime column named 'visit_date', the key used to
# join the calendar onto the visit tables. pop() removes the old column from the frame in place.
data['date']['visit_date'] = pd.to_datetime(data['date'].pop('calendar_date'))

In [224]:
# Parse the visit dates once and derive the calendar features from the parsed values.
# air_visits is the same object as data['airVisit'], so the columns are added in place.
air_visits = data['airVisit']
visit_dates = pd.to_datetime(air_visits['visit_date'])
air_visits['visit_date'] = visit_dates
air_visits['dow'] = visit_dates.dt.dayofweek  # Monday = 0 ... Sunday = 6
air_visits['year'] = visit_dates.dt.year
air_visits['month'] = visit_dates.dt.month

## Add EWMA of visits as feature

In [225]:
# Exponentially weighted mean helper (idea taken from the competition discussion forum).
def calc_shifted_ewm(series, alpha, adjust = True):
    """Return the exponentially weighted mean of the values *before* each position.

    The series is shifted by one step first, so the value at position i only
    uses observations 0..i-1 and the first position is always NaN.

    Parameters
    ----------
    series : pandas.Series
        Values to smooth, in the order they should be accumulated.
    alpha : float
        Smoothing factor passed to ``Series.ewm``.
    adjust : bool, default True
        Passed through to ``Series.ewm``.

    Returns
    -------
    pandas.Series
        Smoothed values with the same index as ``series``.
    """
    lagged = series.shift()
    smoother = lagged.ewm(alpha = alpha, adjust = adjust)
    return smoother.mean()

In [226]:
# EWMA of past visitors, computed separately for every (store, day-of-week) pair.
# The result is kept in its own series so each step can be inspected before it is attached.
#
# transform() returns a series aligned to the original row index of data['airVisit'], so no
# MultiIndex handling is needed afterwards.
#
# The first observation of each group has no history and is NaN after the shift. It is
# backfilled INSIDE its own group, i.e. from that store's next visit on the same day of week.
# Backfilling the concatenated result instead would let a group with a single observation
# take its value from a different store / day of week. Such groups now stay NaN.
tmp = (
    data['airVisit']
    .groupby(['air_store_id', 'dow'])['visitors']
    .transform(lambda visits: calc_shifted_ewm(visits, 0.1).bfill())
)
# Keep the series ordered by the original row index before it is added back to the frame.
tmp = tmp.sort_index()

In [227]:
# Attach the EWMA series as a new 'ewma' column; pandas aligns tmp to the frame on the row index.
data['airVisit']['ewma'] = tmp

## Add 'days since last' and 'days until next' holiday

In [228]:
# Create a 'days since holiday' feature: 0 on a holiday row, otherwise one more than the row before.
# The counter starts at 0, so rows ahead of the first holiday count up from 1.
# NOTE(review): assumes the calendar rows are in chronological order - confirm.
daysSinceList = []
daysSinceHol = 0  # running count of rows since the last holiday flag
for flag in data['date']['holiday_flg']:
    daysSinceHol = 0 if flag == 1 else daysSinceHol + 1
    daysSinceList.append(daysSinceHol)
data['date']['days_since_holiday'] = daysSinceList

In [229]:
# Create a 'days UNTIL next holiday' feature by walking the calendar backwards:
# 0 on a holiday row, otherwise one more than the following row.
# Rows after the last holiday have no later holiday to count towards; they count up from 1
# starting at the final row.
# NOTE(review): assumes the calendar rows are in chronological order - confirm.
holidayList = list(data['date']['holiday_flg'])
# The builtin int replaces the np.int alias, which was removed in NumPy 1.24.
daysUntilHolList = np.zeros(len(holidayList), dtype=int)
daysUntilHol = 0  # running count of rows until the next holiday flag
# Iterate down to and including index 0 so the first calendar row gets a real value.
for i in reversed(range(len(holidayList))):
    daysUntilHol = 0 if holidayList[i] == 1 else daysUntilHol + 1
    daysUntilHolList[i] = daysUntilHol
data['date']['days_until_holiday'] = daysUntilHolList

## Merge air_visits with date dframe to get holiday info

In [230]:
# Calendar columns to bring across: the join key ('visit_date') plus the holiday features.
colsToMerge = ['holiday_flg','visit_date','days_until_holiday','days_since_holiday']
calendar_features = data['date'][colsToMerge]
# Left join keeps every visit row even when its date has no calendar entry.
df_train = data['airVisit'].merge(calendar_features, how = 'left', on = 'visit_date')

In [231]:
#df_train.head()

## Filter only the stores that must be predicted

In [232]:
# The sample submission supplies the rows to predict.
# NOTE: df_test is the same object as data['sampleSub'], so the columns added below show up there too.
df_test = data['sampleSub']

# Split each id on '_' once: the first two pieces re-joined form the store id, the third is the date.
id_pieces = df_test['id'].map(lambda raw_id: str(raw_id).split('_'))
df_test['visit_date'] = id_pieces.map(lambda pieces: pieces[2])
df_test['air_store_id'] = id_pieces.map(lambda pieces: '_'.join(pieces[:2]))

# Same calendar features as the training frame.
test_dates = pd.to_datetime(df_test['visit_date'])
df_test['visit_date'] = test_dates
df_test['dow'] = test_dates.dt.dayofweek
df_test['year'] = test_dates.dt.year
df_test['month'] = test_dates.dt.month

# One row per (store to predict, day of week 0..6); store attributes are merged onto this grid later.
unique_stores = df_test['air_store_id'].unique()
per_dow_frames = []
for dow_value in range(7):
    per_dow_frames.append(
        pd.DataFrame({'air_store_id': unique_stores, 'dow': [dow_value]*len(unique_stores)})
    )
# ignore_index=True already yields a fresh 0..n-1 index for the stacked frame.
stores = pd.concat(per_dow_frames, axis=0, ignore_index=True)
#stores.head()

## Merge df_test with date dframe to get holiday info

In [233]:
# Left-join the holiday features onto the test rows by visit date (same columns as for training).
df_test = df_test.merge(data['date'][colsToMerge], how = 'left', on = 'visit_date')

In [234]:
# Add the store-info columns to every (store, day-of-week) row of the grid.
stores = stores.merge(data['airStore'], how='left', on='air_store_id')
#stores.head()

In [235]:
# Encode the categorical string columns as integer codes.
# The single encoder is re-fitted for each column, so afterwards it only holds the classes of
# the last column processed ('air_area_name').
lbl = LabelEncoder()
for text_column in ('air_genre_name', 'air_area_name'):
    stores[text_column] = lbl.fit_transform(stores[text_column])

## Add genre and area to train and test data

In [236]:
# Bring the encoded store attributes onto the training rows, matched on store and day of week.
# Rows whose store is absent from `stores` get NaN in the added columns (left join).
df_train = df_train.merge(stores, how = 'left', on = ['air_store_id','dow'])

In [237]:
# Bring the encoded store attributes onto the test rows, matched on store and day of week.
df_test = df_test.merge(stores, how = 'left', on = ['air_store_id','dow'])

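For now, I'm simply filling missing values with -1 in both the train and test sets. TODO: investigate why there are NaNs in the train set (a likely source, to be confirmed: `stores` is built only from the stores in the submission file, so the left merge leaves any other training store without genre/area/coordinates).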

In [238]:
# Replace every remaining missing value with the sentinel -1 in both frames.
df_train, df_test = df_train.fillna(-1), df_test.fillna(-1)

In [239]:
# train / test are aliases (not copies) of df_train / df_test.
train, test = df_train, df_test
# Feature candidates: every training column except the identifiers, the date and the target.
non_feature_cols = ('id', 'air_store_id', 'visit_date', 'visitors')
col = [c for c in train.columns if c not in non_feature_cols]

# Initialize XGBoost
Note - code based on:
https://www.kaggle.com/jmbull/no-xgb-starter-here-s-one-lb-507

In [240]:
# XGB starter template borrowed from @anokas
# https://www.kaggle.com/anokas/simple-xgboost-starter-0-0655

print('Binding to float32')

# Downcast every float64 column of both frames to float32. The columns to convert are
# collected first, then each is replaced in place on the frame.
for frame in (train, test):
    float64_columns = [
        name for name, dtype in zip(frame.columns, frame.dtypes) if dtype == np.float64
    ]
    for c in float64_columns:
        frame[c] = frame[c].astype(np.float32)

Binding to float32


In [241]:
# Features: everything except the store id, the date and the target itself.
train_non_features = ['air_store_id','visit_date','visitors']
x_train = train.drop(train_non_features, axis='columns')
# Target is modelled on the log1p scale; predictions are mapped back with expm1 later on.
y_train = np.log1p(train['visitors'].values)

# Remember the training column order so the test features can be arranged identically.
colOrder = x_train.columns

In [242]:
# Sanity check: the feature matrix and the target vector should have the same number of rows.
print(x_train.shape, y_train.shape)
#print(colOrder)

(252108, 11) (252108,)


In [243]:
#x_train.head()

In [244]:
#x_test.head()

In [245]:
# Create training / validation split: the first `split` rows train the model, the rest validate it.
# NOTE(review): this is a positional cut of the frame as currently ordered, not a shuffle or a
# date-based holdout.
split = 200000
# Take the validation slices first, because x_train / y_train are rebound on the next line.
x_valid, y_valid = x_train.iloc[split:], y_train[split:]
x_train, y_train = x_train.iloc[:split], y_train[:split]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

# The DMatrix objects are what gets used from here on; release the DataFrame slices.
del x_train, x_valid
gc.collect()

Building DMatrix...


465

In [246]:
print('Training ...')

# Booster settings: squared-error style regression on the log1p target, scored with RMSE.
params = {
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'eta': 0.04,
    'max_depth': 7,
    'silent': 1,
}

# The last watchlist entry ('valid') is the one early stopping monitors.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(
    params,
    d_train,
    num_boost_round=10000,       # upper bound; early stopping normally ends training sooner
    evals=watchlist,
    early_stopping_rounds=100,   # stop after 100 rounds without improvement on 'valid'
    verbose_eval=10,             # log the metrics every 10 rounds
)

del d_train, d_valid

Training ...
[0]	train-rmse:2.35565	valid-rmse:2.31619
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[10]	train-rmse:1.61626	valid-rmse:1.59091
[20]	train-rmse:1.14568	valid-rmse:1.12901
[30]	train-rmse:0.858336	valid-rmse:0.847057
[40]	train-rmse:0.693608	valid-rmse:0.684861
[50]	train-rmse:0.605777	valid-rmse:0.598358
[60]	train-rmse:0.5618	valid-rmse:0.555074
[70]	train-rmse:0.540291	valid-rmse:0.534093
[80]	train-rmse:0.52968	valid-rmse:0.524118
[90]	train-rmse:0.524161	valid-rmse:0.519344
[100]	train-rmse:0.521035	valid-rmse:0.516998
[110]	train-rmse:0.518927	valid-rmse:0.51569
[120]	train-rmse:0.517322	valid-rmse:0.514943
[130]	train-rmse:0.515913	valid-rmse:0.514407
[140]	train-rmse:0.514747	valid-rmse:0.514137
[150]	train-rmse:0.513602	valid-rmse:0.514036
[160]	train-rmse:0.512643	valid-rmse:0.513881
[170]	train-rmse:0.511806	valid-rmse:0.513836
[180]	train-rmse:0.511065	valid-rm

### Assign random ewma to df_test just to get a score. TODO: Figure out how to use feature in test set!
Note: Assignment could be iterative, i.e: 
1. Assign 
2. Predict
3. run ewma
4. predict again
5. run ewma, etc..

In [249]:
# Test features: drop the identifiers, the date and the placeholder target column.
x_test = test.drop(['id','air_store_id','visit_date','visitors'], axis=1)

# Placeholder 'ewma' values (integers in [1, 20)), used only so the model can be scored.
# They come from a dedicated, seeded generator so that re-running the notebook reproduces the
# same submission and the global NumPy random state is left untouched.
EWMA_PLACEHOLDER_SEED = 42
x_test['ewma'] = np.random.RandomState(EWMA_PLACEHOLDER_SEED).randint(1, 20, x_test.shape[0])

# Arrange the columns exactly as the model saw them in training. The order is taken from
# colOrder (saved from x_train) rather than a hand-copied list, so the two cannot drift apart.
columnsForTest_df = list(colOrder)
x_test = x_test[columnsForTest_df] # Put columns in the right order

In [250]:
# Wrap the test features for XGBoost, then release the DataFrame they were built from.
d_test = xgb.DMatrix(x_test)

del x_test
gc.collect()

157

In [251]:
print('Predicting on test ...')

# Predictions are on the log1p scale, like the training target.
p_test = clf.predict(d_test)

del d_test
gc.collect()

Predicting on test ...


0

In [252]:
# Display the predictions mapped back from the log1p scale to visitor counts (inverse of log1p).
np.expm1(p_test)

array([  3.87995052,   8.98664856,  12.44323063, ...,   2.58029413,
         5.66217661,  19.99444962], dtype=float32)

In [253]:
# Store the predictions as visitor counts (expm1 undoes the log1p applied to the training target).
predicted_visitors = np.expm1(p_test)
test['visitors'] = predicted_visitors

# Write only the id and prediction columns, with three decimals and no index column.
submission_columns = ['id', 'visitors']
test[submission_columns].to_csv('xgb_submission.csv', index=False, float_format='%.3f')

# TODO: Performance wasn't very good. Predict week by week and calc moving average for next week!