Submission用のテストデータに対して予測し、実際にサブミットして評価、順位を確認する

In [1]:
import numpy as np
import pandas as pd
import pickle
import sklearn

import datetime


In [2]:
# Show the installed scikit-learn version so the environment of this run is
# recorded alongside the results.
sklearn.__version__

'0.24.1'

In [3]:
# Load the sample submission template. Per the preview that follows, it has an
# `id` column and a placeholder `visitors` column.
df_sample_submission = pd.read_csv('./raw_data/sample_submission.csv/sample_submission.csv')

In [4]:
# Preview the template as loaded.
df_sample_submission

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,0
1,air_00a91d42b08b08d9_2017-04-24,0
2,air_00a91d42b08b08d9_2017-04-25,0
3,air_00a91d42b08b08d9_2017-04-26,0
4,air_00a91d42b08b08d9_2017-04-27,0
...,...,...
32014,air_fff68b929994bfbd_2017-05-27,0
32015,air_fff68b929994bfbd_2017-05-28,0
32016,air_fff68b929994bfbd_2017-05-29,0
32017,air_fff68b929994bfbd_2017-05-30,0


## sample submissionのidをair_store_idとvisit_dateに分ける

In [5]:
# Split `id` ("<store id>_<YYYY-MM-DD>") into its two parts.
# Splitting on the LAST underscore (the store id itself contains one) avoids
# depending on the store id being exactly 20 characters long.
id_parts = df_sample_submission['id'].str.rsplit('_', n=1, expand=True)
df_sample_submission['air_store_id'] = id_parts[0]
df_sample_submission['visit_date'] = id_parts[1]

In [6]:
# Check that air_store_id and visit_date were extracted from id as intended.
df_sample_submission.head()

Unnamed: 0,id,visitors,air_store_id,visit_date
0,air_00a91d42b08b08d9_2017-04-23,0,air_00a91d42b08b08d9,2017-04-23
1,air_00a91d42b08b08d9_2017-04-24,0,air_00a91d42b08b08d9,2017-04-24
2,air_00a91d42b08b08d9_2017-04-25,0,air_00a91d42b08b08d9,2017-04-25
3,air_00a91d42b08b08d9_2017-04-26,0,air_00a91d42b08b08d9,2017-04-26
4,air_00a91d42b08b08d9_2017-04-27,0,air_00a91d42b08b08d9,2017-04-27


In [7]:
# Convert the 'YYYY-MM-DD' strings to datetimes in one vectorized call rather
# than a per-row Python strptime; the explicit format keeps parsing strict.
df_sample_submission['visit_date'] = pd.to_datetime(df_sample_submission['visit_date'], format='%Y-%m-%d')

In [8]:
# Load the calendar table. The merged preview later in the notebook shows it
# contributes `day_of_week` and `holiday_flg`, keyed by `calendar_date`.
df_date_info = pd.read_csv('./raw_data/date_info.csv/date_info.csv')

In [9]:
# Convert the 'YYYY-MM-DD' strings to datetimes in one vectorized call rather
# than a per-row Python strptime; the explicit format keeps parsing strict.
df_date_info['calendar_date'] = pd.to_datetime(df_date_info['calendar_date'], format='%Y-%m-%d')

In [10]:
# Load the historical visit records. Later cells use its `air_store_id`,
# `visit_date` and `visitors` columns.
df_air_visit_data = pd.read_csv('./raw_data/air_visit_data.csv/air_visit_data.csv')

In [11]:
# Convert the 'YYYY-MM-DD' strings to datetimes in one vectorized call rather
# than a per-row Python strptime; the explicit format keeps parsing strict.
df_air_visit_data['visit_date'] = pd.to_datetime(df_air_visit_data['visit_date'], format='%Y-%m-%d')

## df_sample_submissionとdf_date_infoから、submission用のアウトサンプルの説明変数データを作る

In [12]:
# Collapse the template to one row per visit_date; this gives the list of
# dates that need a forecast.
# Only `visitors` is aggregated explicitly: the frame also holds string
# columns (`id`, `air_store_id`), and relying on pandas to drop them silently
# is version-dependent.
df_outsample = df_sample_submission.groupby('visit_date', as_index=False)['visitors'].sum()
df_outsample.head()

Unnamed: 0,visit_date,visitors
0,2017-04-23,0
1,2017-04-24,0
2,2017-04-25,0
3,2017-04-26,0
4,2017-04-27,0


In [13]:
# Attach the calendar attributes to each forecast date. The calendar key is
# renamed so both sides join on `visit_date`; dates missing from the calendar
# are dropped by the inner join.
df_outsample = df_outsample.merge(
    df_date_info.rename(columns={'calendar_date': 'visit_date'}),
    on='visit_date',
    how='inner',
)
df_outsample.head()

Unnamed: 0,visit_date,visitors,day_of_week,holiday_flg
0,2017-04-23,0,Sunday,0
1,2017-04-24,0,Monday,0
2,2017-04-25,0,Tuesday,0
3,2017-04-26,0,Wednesday,0
4,2017-04-27,0,Thursday,0


In [14]:
# shift(-1) moves every value up one row, so each row receives the holiday_flg
# of the row after it; the last row has no successor and is filled with 0.
# NOTE(review): despite the `prev_` name this flags "the next row is a
# holiday". It presumably has to match how the feature was built when the
# models were trained, and it assumes rows are consecutive dates in ascending
# order — confirm both.
df_outsample['prev_holiday_flg'] = df_outsample['holiday_flg'].shift(-1).fillna(0)

In [15]:
# One-hot encode day_of_week into day_of_week_<Name> indicator columns; the
# remaining columns are carried over as they are.
df_outsample_dummy = pd.get_dummies(df_outsample, columns=['day_of_week'])
df_outsample_dummy.head()

Unnamed: 0,visit_date,visitors,holiday_flg,prev_holiday_flg,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,2017-04-23,0,0,0.0,0,0,0,1,0,0,0
1,2017-04-24,0,0,0.0,0,1,0,0,0,0,0
2,2017-04-25,0,0,0.0,0,0,0,0,0,1,0
3,2017-04-26,0,0,0.0,0,0,0,0,0,0,1
4,2017-04-27,0,0,0.0,0,0,0,0,1,0,0


In [16]:
# Names of the model input columns: the two holiday flags followed by the
# day-of-week indicators. The order here fixes the column order of the feature
# matrix (presumably it must match the order used at training time — confirm).
var_list = ['holiday_flg', 'prev_holiday_flg'] + [
    'day_of_week_' + day_name
    for day_name in (
        'Friday',
        'Monday',
        'Saturday',
        'Sunday',
        'Thursday',
        'Tuesday',
        'Wednesday',
    )
]

In [17]:
# Build the feature matrix by picking the model inputs by name. Selecting with
# var_list both leaves out visit_date / visitors and puts the columns in the
# order var_list defines.
X_outsmp = df_outsample_dummy[var_list]
X_outsmp.head()

Unnamed: 0,holiday_flg,prev_holiday_flg,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,0,0.0,0,0,0,1,0,0,0
1,0,0.0,0,1,0,0,0,0,0
2,0,0.0,0,0,0,0,0,1,0
3,0,0.0,0,0,0,0,0,0,1
4,0,0.0,0,0,0,0,1,0,0


# 予測

## GridSearchCVで作ったモデル

In [18]:
# Load the model tuned with GridSearchCV from its pickle file. The context
# manager closes the file handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# come from a trusted source (e.g. ones you saved yourself).
with open('gscv_lgbm_model.pkl', 'rb') as model_file:
    gscv_lgbm_model = pickle.load(model_file)

In [19]:
# Predict with the GridSearchCV model over the out-of-sample feature matrix.
# The result is later stored as a column of df_outsample, i.e. one value per
# forecast date.
y_outsmp_gscv = gscv_lgbm_model.predict(X_outsmp)

## hyperoptで作ったモデル

In [20]:
# Load the model tuned with hyperopt from its pickle file. The context manager
# closes the file handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# come from a trusted source (e.g. ones you saved yourself).
with open('hyperopt_lgbm_model.pkl', 'rb') as model_file:
    hyperopt_lgbm_model = pickle.load(model_file)

In [21]:
# Predict with the hyperopt model over the same out-of-sample feature matrix.
# The result is later stored as a column of df_outsample, i.e. one value per
# forecast date.
y_outsmp_hyperopt = hyperopt_lgbm_model.predict(X_outsmp)

In [22]:
# Store both models' per-date predictions next to the dates they belong to.
# The arrays are attached positionally, so they rely on X_outsmp having the
# same row order as df_outsample.
df_outsample = df_outsample.assign(
    forecast_gscv=y_outsmp_gscv,
    forecast_hyperopt=y_outsmp_hyperopt,
)

# 上記の平均に対する予測から、店ごとの平均に応じて按分する

In [25]:
# Historical mean number of visitors for each store, as a two-column frame
# (air_store_id, store_mean_visitors).
# Only `visitors` is averaged explicitly: the frame also contains the datetime
# `visit_date` column, which some pandas versions would average as well and
# which would then collide with `visit_date` in the later merge.
df_air_visit_by_store = (
    df_air_visit_data.groupby('air_store_id', as_index=False)['visitors']
    .mean()
    .rename(columns={'visitors': 'store_mean_visitors'})
)
df_air_visit_by_store

Unnamed: 0,air_store_id,store_mean_visitors
0,air_00a91d42b08b08d9,26.081897
1,air_0164b9927d20bcc3,9.248322
2,air_0241aa3964b7f861,9.896465
3,air_0328696196e46f18,7.939655
4,air_034a3d5b40d5b1b1,14.828685
...,...,...
824,air_fea5dc9594450608,14.485401
825,air_fee8dcf4d619598e,26.027778
826,air_fef9ccb3ba0da2f7,9.620408
827,air_ffcc2d5087e1b476,20.242798


In [26]:
# Give every (store, date) row of the template the two forecasts made for its
# date; at this point the forecast is the same for all stores on a given date.
df_submission = df_sample_submission.merge(
    df_outsample.loc[:, ['visit_date', 'forecast_gscv', 'forecast_hyperopt']],
    on='visit_date',
    how='inner',
)
df_submission.head()

Unnamed: 0,id,visitors,air_store_id,visit_date,forecast_gscv,forecast_hyperopt
0,air_00a91d42b08b08d9_2017-04-23,0,air_00a91d42b08b08d9,2017-04-23,23.698132,23.690383
1,air_0164b9927d20bcc3_2017-04-23,0,air_0164b9927d20bcc3,2017-04-23,23.698132,23.690383
2,air_0241aa3964b7f861_2017-04-23,0,air_0241aa3964b7f861,2017-04-23,23.698132,23.690383
3,air_0328696196e46f18_2017-04-23,0,air_0328696196e46f18,2017-04-23,23.698132,23.690383
4,air_034a3d5b40d5b1b1_2017-04-23,0,air_034a3d5b40d5b1b1,2017-04-23,23.698132,23.690383


In [27]:
# Drop the placeholder `visitors` column and add each store's historical mean.
# A left join keeps every submission row; a store with no history would get
# NaN in store_mean_visitors.
df_submission = df_submission.drop(columns=['visitors']).merge(
    df_air_visit_by_store,
    on='air_store_id',
    how='left',
)
print(df_submission.shape)
df_submission

(32019, 6)


Unnamed: 0,id,air_store_id,visit_date,forecast_gscv,forecast_hyperopt,store_mean_visitors
0,air_00a91d42b08b08d9_2017-04-23,air_00a91d42b08b08d9,2017-04-23,23.698132,23.690383,26.081897
1,air_0164b9927d20bcc3_2017-04-23,air_0164b9927d20bcc3,2017-04-23,23.698132,23.690383,9.248322
2,air_0241aa3964b7f861_2017-04-23,air_0241aa3964b7f861,2017-04-23,23.698132,23.690383,9.896465
3,air_0328696196e46f18_2017-04-23,air_0328696196e46f18,2017-04-23,23.698132,23.690383,7.939655
4,air_034a3d5b40d5b1b1_2017-04-23,air_034a3d5b40d5b1b1,2017-04-23,23.698132,23.690383,14.828685
...,...,...,...,...,...,...
32014,air_fea5dc9594450608_2017-05-31,air_fea5dc9594450608,2017-05-31,18.946111,18.952458,14.485401
32015,air_fee8dcf4d619598e_2017-05-31,air_fee8dcf4d619598e,2017-05-31,18.946111,18.952458,26.027778
32016,air_fef9ccb3ba0da2f7_2017-05-31,air_fef9ccb3ba0da2f7,2017-05-31,18.946111,18.952458,9.620408
32017,air_ffcc2d5087e1b476_2017-05-31,air_ffcc2d5087e1b476,2017-05-31,18.946111,18.952458,20.242798


In [29]:
# Average number of visitors per store per day over all stores and the whole
# history: first the mean of `visitors` for each date, then the mean of those
# daily means.
# Only `visitors` is averaged explicitly: the frame also holds the string
# column `air_store_id`, and averaging the whole frame fails on pandas
# versions that no longer drop non-numeric columns silently.
df_air_visit_mean = df_air_visit_data.groupby('visit_date', as_index=False)['visitors'].mean()
all_mean_visitors = df_air_visit_mean['visitors'].mean()
print(all_mean_visitors)

21.146428038899096


In [30]:
# Scale the per-date forecast to each store: multiply by the store's own
# historical mean and divide by the overall mean, so stores above/below the
# overall mean get proportionally larger/smaller predictions.
df_submission = df_submission.assign(
    forecast_gscv_adj=lambda frame: (
        frame['forecast_gscv'] * frame['store_mean_visitors'] / all_mean_visitors
    ),
    forecast_hyperopt_adj=lambda frame: (
        frame['forecast_hyperopt'] * frame['store_mean_visitors'] / all_mean_visitors
    ),
)

In [31]:
# Inspect the store-adjusted forecasts next to the raw per-date forecasts.
df_submission

Unnamed: 0,id,air_store_id,visit_date,forecast_gscv,forecast_hyperopt,store_mean_visitors,forecast_gscv_adj,forecast_hyperopt_adj
0,air_00a91d42b08b08d9_2017-04-23,air_00a91d42b08b08d9,2017-04-23,23.698132,23.690383,26.081897,29.229155,29.219598
1,air_0164b9927d20bcc3_2017-04-23,air_0164b9927d20bcc3,2017-04-23,23.698132,23.690383,9.248322,10.364302,10.360913
2,air_0241aa3964b7f861_2017-04-23,air_0241aa3964b7f861,2017-04-23,23.698132,23.690383,9.896465,11.090654,11.087028
3,air_0328696196e46f18_2017-04-23,air_0328696196e46f18,2017-04-23,23.698132,23.690383,7.939655,8.897720,8.894811
4,air_034a3d5b40d5b1b1_2017-04-23,air_034a3d5b40d5b1b1,2017-04-23,23.698132,23.690383,14.828685,16.618038,16.612604
...,...,...,...,...,...,...,...,...
32014,air_fea5dc9594450608_2017-05-31,air_fea5dc9594450608,2017-05-31,18.946111,18.952458,14.485401,12.978174,12.982522
32015,air_fee8dcf4d619598e_2017-05-31,air_fee8dcf4d619598e,2017-05-31,18.946111,18.952458,26.027778,23.319549,23.327361
32016,air_fef9ccb3ba0da2f7_2017-05-31,air_fef9ccb3ba0da2f7,2017-05-31,18.946111,18.952458,9.620408,8.619390,8.622278
32017,air_ffcc2d5087e1b476_2017-05-31,air_ffcc2d5087e1b476,2017-05-31,18.946111,18.952458,20.242798,18.136505,18.142581


In [34]:
# Submission frame for the GridSearchCV model: the `id` column plus the
# store-adjusted forecast exposed under the name `visitors`.
df_submission_gscv = pd.DataFrame({
    'id': df_submission['id'],
    'visitors': df_submission['forecast_gscv_adj'],
})
df_submission_gscv

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,29.229155
1,air_0164b9927d20bcc3_2017-04-23,10.364302
2,air_0241aa3964b7f861_2017-04-23,11.090654
3,air_0328696196e46f18_2017-04-23,8.897720
4,air_034a3d5b40d5b1b1_2017-04-23,16.618038
...,...,...
32014,air_fea5dc9594450608_2017-05-31,12.978174
32015,air_fee8dcf4d619598e_2017-05-31,23.319549
32016,air_fef9ccb3ba0da2f7_2017-05-31,8.619390
32017,air_ffcc2d5087e1b476_2017-05-31,18.136505


In [38]:
# Write the GridSearchCV submission to CSV; index=False keeps the row index
# out of the file so it contains only the id and visitors columns.
df_submission_gscv.to_csv('submission_gscv.csv', index=False)

In [39]:
# Submission frame for the hyperopt model: the `id` column plus the
# store-adjusted forecast exposed under the name `visitors`.
df_submission_hyperopt = pd.DataFrame({
    'id': df_submission['id'],
    'visitors': df_submission['forecast_hyperopt_adj'],
})
df_submission_hyperopt

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,29.219598
1,air_0164b9927d20bcc3_2017-04-23,10.360913
2,air_0241aa3964b7f861_2017-04-23,11.087028
3,air_0328696196e46f18_2017-04-23,8.894811
4,air_034a3d5b40d5b1b1_2017-04-23,16.612604
...,...,...
32014,air_fea5dc9594450608_2017-05-31,12.982522
32015,air_fee8dcf4d619598e_2017-05-31,23.327361
32016,air_fef9ccb3ba0da2f7_2017-05-31,8.622278
32017,air_ffcc2d5087e1b476_2017-05-31,18.142581


In [40]:
# Write the hyperopt submission to CSV; index=False keeps the row index out of
# the file so it contains only the id and visitors columns.
df_submission_hyperopt.to_csv('submission_hyperopt.csv', index=False)