# Import libraries

In [86]:
import os
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-7.2.0-posix-seh-rt_v5-rev1\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import gc # Note: this is a garbage collector
%matplotlib inline

import warnings
warnings.filterwarnings('ignore') 

# Bring in data

In [87]:
PATH = r"..\Raw_Data"

In [88]:
data = {
    'airStore':  pd.read_csv(PATH + r"\air_store_info.csv"),
    'airVisit':  pd.read_csv(PATH + r"\air_visit_data.csv"),
    'date':      pd.read_csv(PATH + r"\date_info.csv"),
    'sampleSub': pd.read_csv(PATH + r"\sample_submission.csv"),
    'storeIDs':  pd.read_csv(PATH + r"\store_id_relation.csv")    
}

# Preprocess data

In [89]:
data['date']['visit_date'] = pd.to_datetime(data['date']['calendar_date'])
data['date'].drop('calendar_date', axis = 1 , inplace=True)

In [90]:
data['airVisit']['visit_date'] = pd.to_datetime(data['airVisit']['visit_date'])
data['airVisit']['dow'] = data['airVisit']['visit_date'].dt.dayofweek
data['airVisit']['year'] = data['airVisit']['visit_date'].dt.year
data['airVisit']['month'] = data['airVisit']['visit_date'].dt.month

## Add EWMA of visits as feature

Function to calculate ewm (note, found this on the competition discussion forum):

In [91]:
def calc_shifted_ewm(series, alpha, adjust = True):
    return series.shift().ewm(alpha = alpha, adjust = adjust).mean()

In [92]:
# Step below adds the ewm by day of week. Right now I'm returning a separate series so I can look at what each step does if I want to
tmp = data['airVisit'].groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'], 0.1)) 
# This step backfills the 1st week's dow for each restaurant, otherwise it would be NaN since it's a 1-period ewma
tmp = tmp.fillna(method='bfill')
# The groupby function returns a multiIndex Series. I only need the 3rd level (original df index) to add column to original df
tmp.index = tmp.index.get_level_values(2)
# Sort index before adding back to original df
tmp = tmp.sort_index()

In [93]:
data['airVisit']['ewma'] = tmp

## Add 'days since last' and 'days until next' holiday

In [94]:
# Create a 'days since holiday' feature
daysSinceList = []
daysSinceHol = 0 # initialize daysSince counter
for row in data['date']['holiday_flg']:
    if row == 1:
        daysSinceHol = 0
        daysSinceList.append(daysSinceHol)
    else:
        daysSinceHol += 1
        daysSinceList.append(daysSinceHol)
data['date']['days_since_holiday'] = daysSinceList

In [95]:
# Create a 'days UNTIL next holiday' feature
holidayList = list(data['date']['holiday_flg'])
daysUntilHolList = np.zeros(len(holidayList),dtype=np.int)
daysUntilHol = 0 # initialize daysUntilHol counter
for i in range(len(holidayList)-1,0,-1):
    if holidayList[i] == 1:
        daysUntilHol = 0
        daysUntilHolList[i] = daysUntilHol
    else:
        daysUntilHol += 1
        daysUntilHolList[i] = daysUntilHol        
data['date']['days_until_holiday'] = daysUntilHolList

## Merge air_visits with date dframe to get holiday info

In [96]:
colsToMerge = ['holiday_flg','visit_date','days_until_holiday','days_since_holiday']
df_train = pd.merge(data['airVisit'], data['date'][colsToMerge], how = 'left', on = 'visit_date')

## Filter only the stores that must be predicted

In [97]:
df_test = data['sampleSub']

df_test['visit_date'] = df_test['id'].map(lambda x: str(x).split('_')[2])
df_test['air_store_id'] = df_test['id'].map(lambda x: '_'.join(str(x).split('_')[:2]))
df_test['visit_date'] = pd.to_datetime(df_test['visit_date'])
df_test['dow'] = df_test['visit_date'].dt.dayofweek
df_test['year'] = df_test['visit_date'].dt.year
df_test['month'] = df_test['visit_date'].dt.month

unique_stores = df_test['air_store_id'].unique()
stores = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'dow': [i]*len(unique_stores)}) for i in range(7)], axis=0, ignore_index=True).reset_index(drop=True)
#stores.head()

## Merge df_test with date dframe to get holiday info

In [98]:
df_test = pd.merge(df_test, data['date'][colsToMerge], how = 'left', on = 'visit_date')

In [99]:
stores = pd.merge(stores, data['airStore'], how='left', on=['air_store_id'])
#stores.head()

In [100]:
# Encode categorical string variables 
lbl = LabelEncoder()
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

### Add genre and area to train and test data

In [101]:
df_train = pd.merge(df_train, stores, how = 'left', on = ['air_store_id','dow'])

### Add visitor statistics as features

In [102]:
tmp = df_train.groupby(['air_store_id','dow']).agg({'visitors' : [np.min,np.mean,np.median,np.max,np.size]}).reset_index()
tmp.columns = ['air_store_id', 'dow', 'min_visitors', 'mean_visitors', 'median_visitors','max_visitors','count_observations']
df_train = pd.merge(df_train, tmp, how='left', on=['air_store_id','dow'])

In [103]:
df_test = pd.merge(df_test, stores, how = 'left', on = ['air_store_id','dow'])

In [104]:
df_test = pd.merge(df_test, tmp, how='left', on=['air_store_id','dow'])

Note: For XGB, didn't fillna (1/28/2018)
Other algorithms need fillna. TODO: 1) investigate why there are NaN on train set. 2) Check if XGBoost gets better score with fillna

In [105]:
df_train = df_train.fillna(-1)
df_test = df_test.fillna(-1)

In [106]:
train = df_train
test = df_test
col = [c for c in train if c not in ['id', 'air_store_id','visit_date','visitors']]

# Initialize XGBoost
Note - code based on:
https://www.kaggle.com/jmbull/no-xgb-starter-here-s-one-lb-507

In [107]:
# XGB starter template borrowed from @anokas
# https://www.kaggle.com/anokas/simple-xgboost-starter-0-0655

print('Binding to float32')

for c, dtype in zip(df_train.columns, train.dtypes):
    if dtype == np.float64:
        train[c] = train[c].astype(np.float32)
        
for c, dtype in zip(df_test.columns, test.dtypes):
    if dtype == np.float64:
        test[c] = test[c].astype(np.float32)

Binding to float32


In [108]:
x_train = train.drop(['air_store_id','visit_date','visitors'], axis=1)
y_train = np.log1p(train['visitors'].values)

# Get Column order for x_test df
colOrder = x_train.columns

In [109]:
# Create training / validation split
split = 200000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

#del x_train, x_valid; gc.collect()

Building DMatrix...


In [110]:
print('Training ...')

params = {}
params['objective'] = 'reg:linear'
params['eval_metric'] = 'rmse'
params['eta'] = 0.04
params['max_depth'] = 7
params['silent'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=25)

#del d_train, d_valid

Training ...
[0]	train-rmse:2.35531	valid-rmse:2.31585
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[25]	train-rmse:0.973041	valid-rmse:0.959437
[50]	train-rmse:0.585434	valid-rmse:0.578736
[75]	train-rmse:0.50985	valid-rmse:0.505621
[100]	train-rmse:0.495646	valid-rmse:0.493645
[125]	train-rmse:0.490914	valid-rmse:0.49115
[150]	train-rmse:0.487584	valid-rmse:0.490177
[175]	train-rmse:0.485131	valid-rmse:0.489855
[200]	train-rmse:0.483508	valid-rmse:0.489624
[225]	train-rmse:0.481949	valid-rmse:0.489472
[250]	train-rmse:0.480349	valid-rmse:0.489633
[275]	train-rmse:0.478961	valid-rmse:0.489721
[300]	train-rmse:0.477639	valid-rmse:0.489728
[325]	train-rmse:0.476233	valid-rmse:0.48994
Stopping. Best iteration:
[225]	train-rmse:0.481949	valid-rmse:0.489472



In [28]:
#print(clf.feature_names)
#xgb.plot_tree(clf, num_trees=0,rankdir='LR')
#fig = plt.gcf()
#fig.set_size_inches(150, 100)
#fig.savefig('tree.png')

In [29]:
#xgb.plot_importance(clf)

#### Use ewma feature in test set
1. Assign ewma to 1st week of test set using last week of training
2. Predict
3. Calc ewma for 2nd week of test set
4. predict again

In [30]:
# Get dataframe for last week of training set and only use store
x_train_lastWk = train[['air_store_id','visit_date','dow','ewma']]
x_train_lastWk = x_train_lastWk[x_train_lastWk['visit_date'] > '2017-04-15']

In [31]:
# Split test by weeks [note that weeks start on day 6 (Sunday) rather than day 0 (Monday)]
test.index = test['visit_date']

testWk1 = test['2017-04-23':'2017-04-29']
testWk2 = test['2017-04-30':'2017-05-06']
testWk3 = test['2017-05-07':'2017-05-13']
testWk4 = test['2017-05-14':'2017-05-20']
testWk5 = test['2017-05-21':'2017-05-27']
testWk6 = test['2017-05-28':'2017-06-01']

In [36]:
testWk1_pred = pd.merge(testWk1, x_train_lastWk.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])

In [35]:
#testWk1_pred['ewma'] = testWk1_pred['ewma'].fillna(value=0)

Command to put columns in the right order for the XGBoost prediction

In [37]:
# NOTE: USE COLUMN LIST BELOW IF NOT INCLUDING VISITOR STATISTICS IN TRAINING SET

#columnsForTest_df = ['dow', 'year', 'month', 'ewma', 'holiday_flg', 'days_until_holiday',
#       'days_since_holiday', 'air_genre_name', 'air_area_name', 'latitude',
#       'longitude']

# NOTE: USE COLUMN LIST BELOW IF INCLUDING VISITOR STATISTICS IN TRAINING SET
columnsForTest_df = ['dow', 'year', 'month', 'ewma', 'holiday_flg', 'days_until_holiday', 'days_since_holiday',
                     'air_genre_name', 'air_area_name', 'latitude', 'longitude', 'min_visitors', 'mean_visitors',
                     'median_visitors', 'max_visitors', 'count_observations']

In [38]:
x_testWk1_pred = testWk1_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk1_pred = x_testWk1_pred[columnsForTest_df]

In [39]:
d_test = xgb.DMatrix(x_testWk1_pred)
#del x_test; gc.collect()

In [40]:
print('Predicting on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predicting on test ...


159

In [41]:
testWk1_pred['visitors'] = np.expm1(p_test)
testWk1_pred[['id','visitors']].to_csv('xgb_submission_Wk1.csv', index=False, float_format='%.3f')

### With test set Week 1 predicted, update ewma and assign to week 2

In [42]:
train_testWk1_concat = testWk1_pred[train.columns]
train_testWk1_concat = pd.concat([train, train_testWk1_concat])
train_testWk1_concat = train_testWk1_concat.reset_index()
tmp = train_testWk1_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [43]:
train_testWk1_concat['ewma'] = tmp

In [44]:
tmp = train_testWk1_concat[train_testWk1_concat['visit_date'] > testWk2.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk2_pred = pd.merge(testWk2, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk2_pred['ewma'] = testWk2_pred['ewma'].fillna(value=0)

In [45]:
x_testWk2_pred = testWk2_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk2_pred = x_testWk2_pred[columnsForTest_df]
#x_testWk2_pred.columns

In [46]:
d_test = xgb.DMatrix(x_testWk2_pred)
#del x_test; gc.collect()

In [47]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


104

In [48]:
testWk2_pred['visitors'] = np.expm1(p_test)
testWk2_pred[['id','visitors']].to_csv('xgb_submission_Wk2.csv',index=False,float_format='%.3f')

### With test set Week 2 predicted, update ewma and assign to week 3

In [49]:
train_testWk2_concat = testWk2_pred[train.columns]
train_testWk2_concat = pd.concat([train_testWk1_concat, train_testWk2_concat])
train_testWk2_concat = train_testWk2_concat.reset_index()
tmp = train_testWk2_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [50]:
train_testWk2_concat['ewma'] = tmp

In [51]:
tmp = train_testWk2_concat[train_testWk2_concat['visit_date'] > testWk3.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk3_pred = pd.merge(testWk3, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk3_pred['ewma'] = testWk3_pred['ewma'].fillna(value=0)

In [52]:
x_testWk3_pred = testWk3_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk3_pred = x_testWk3_pred[columnsForTest_df]

In [53]:
d_test = xgb.DMatrix(x_testWk3_pred)
#del x_test; gc.collect()

In [54]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


101

In [55]:
testWk3_pred['visitors'] = np.expm1(p_test)
testWk3_pred[['id','visitors']].to_csv('xgb_submission_Wk3.csv',index=False,float_format='%.3f')

### With test set Week 3 predicted, update ewma and assign to week 4

In [56]:
train_testWk3_concat = pd.concat([train_testWk2_concat[train.columns], testWk3_pred[train.columns]])
train_testWk3_concat = train_testWk3_concat.reset_index()
tmp = train_testWk3_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [57]:
train_testWk3_concat['ewma'] = tmp

In [58]:
tmp = train_testWk3_concat[train_testWk3_concat['visit_date'] > testWk4.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk4_pred = pd.merge(testWk4, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk4_pred['ewma'] = testWk4_pred['ewma'].fillna(value=0)

In [59]:
x_testWk4_pred = testWk4_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk4_pred = x_testWk4_pred[columnsForTest_df]

In [60]:
d_test = xgb.DMatrix(x_testWk4_pred)
#del x_test; gc.collect()

In [61]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


101

In [62]:
testWk4_pred['visitors'] = np.expm1(p_test)
testWk4_pred[['id','visitors']].to_csv('xgb_submission_Wk4.csv',index=False,float_format='%.3f')

### With test set Week 4 predicted, update ewma and assign to week 5

In [63]:
train_testWk4_concat = pd.concat([train_testWk3_concat[train.columns], testWk4_pred[train.columns]])
train_testWk4_concat = train_testWk4_concat.reset_index()
tmp = train_testWk4_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [64]:
train_testWk4_concat['ewma'] = tmp

In [65]:
tmp = train_testWk4_concat[train_testWk4_concat['visit_date'] > testWk5.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk5_pred = pd.merge(testWk5, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk5_pred['ewma'] = testWk5_pred['ewma'].fillna(value=0)

In [66]:
x_testWk5_pred = testWk5_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk5_pred = x_testWk5_pred[columnsForTest_df]

In [67]:
d_test = xgb.DMatrix(x_testWk5_pred)
#del x_test; gc.collect()

In [68]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


101

In [69]:
testWk5_pred['visitors'] = np.expm1(p_test)
testWk5_pred[['id','visitors']].to_csv('xgb_submission_Wk5.csv',index=False,float_format='%.3f')

### With test set Week 5 predicted, update ewma and assign to week 6

In [70]:
train_testWk5_concat = pd.concat([train_testWk4_concat[train.columns], testWk5_pred[train.columns]])
train_testWk5_concat = train_testWk5_concat.reset_index()
tmp = train_testWk5_concat.groupby(['air_store_id','dow']).apply(lambda x: calc_shifted_ewm(x['visitors'],0.1))
tmp = tmp.fillna(method='bfill')
tmp.index = tmp.index.get_level_values(2)
tmp = tmp.sort_index()

In [71]:
train_testWk5_concat['ewma'] = tmp

In [72]:
tmp = train_testWk5_concat[train_testWk5_concat['visit_date'] > testWk6.visit_date.min() - pd.to_timedelta(1, unit='d')]
tmp = tmp[['air_store_id','visit_date','dow','ewma']]
testWk6_pred = pd.merge(testWk6, tmp.drop('visit_date',axis=1),how='left',on=['air_store_id','dow'])
testWk6_pred['ewma'] = testWk6_pred['ewma'].fillna(value=0)

In [73]:
x_testWk6_pred = testWk6_pred.drop(['id','air_store_id','visit_date','visitors'],axis=1)
x_testWk6_pred = x_testWk6_pred[columnsForTest_df]

In [74]:
d_test = xgb.DMatrix(x_testWk6_pred)
#del x_test; gc.collect()

In [75]:
print('Predictin on test ...')
p_test = clf.predict(d_test)
del d_test; gc.collect()

Predictin on test ...


101

In [76]:
testWk6_pred['visitors'] = np.expm1(p_test)
testWk6_pred[['id','visitors']].to_csv('xgb_submission_Wk6.csv',index=False,float_format='%.3f')

### Compile all prediction csv and sort in order needed for submission

In [77]:
pred_wk1 = pd.read_csv('xgb_submission_Wk1.csv')
pred_wk2 = pd.read_csv('xgb_submission_Wk2.csv')
pred_wk3 = pd.read_csv('xgb_submission_Wk3.csv')
pred_wk4 = pd.read_csv('xgb_submission_Wk4.csv')
pred_wk5 = pd.read_csv('xgb_submission_Wk5.csv')
pred_wk6 = pd.read_csv('xgb_submission_Wk6.csv')

In [78]:
compiled_predictions = pd.concat([pred_wk1,pred_wk2,pred_wk3,pred_wk4,pred_wk5,pred_wk6]).sort_values(by='id')

In [79]:
compiled_predictions[['id','visitors']].to_csv('xgb_submission.csv',index=False,float_format='%.3f')

# Check tree characteristics for hyperparameter turning

In [None]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain.get_labels())
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='rmse', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob)
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [130]:
clf.get_xgb_params()

AttributeError: 'Booster' object has no attribute 'get_xgb_params'

In [128]:
y_train

array([ 3.25809654,  3.49650756,  3.40119738, ...,  3.29583687,
        3.09104245,  3.61091791])