In [1]:
#!/usr/bin/python3 -B

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from scipy.stats import mode

In [84]:
from sklearn.metrics import mean_squared_error

In [2]:
# Data wrangling brought to you by the1owl
# https://www.kaggle.com/the1owl/surprise-me

data = {
    'tra':
    pd.read_csv('../input/air_visit_data.csv'),
    'as':
    pd.read_csv('../input/air_store_info.csv'),
    'hs':
    pd.read_csv('../input/hpg_store_info.csv'),
    'ar':
    pd.read_csv('../input/air_reserve.csv'),
    'hr':
    pd.read_csv('../input/hpg_reserve.csv'),
    'id':
    pd.read_csv('../input/store_id_relation.csv'),
    'tes':
    pd.read_csv('../input/sample_submission.csv'),
    'hol':
    pd.read_csv('../input/date_info.csv').rename(columns={
        'calendar_date': 'visit_date'
    })
}

In [3]:
data['hr'] = pd.merge(data['hr'], data['id'], how='inner', on=['hpg_store_id'])

In [4]:
for df in ['ar', 'hr']:
    data[df]['visit_datetime'] = pd.to_datetime(data[df]['visit_datetime'])
    data[df]['visit_datetime'] = data[df]['visit_datetime'].dt.date
    data[df]['reserve_datetime'] = pd.to_datetime(data[df]['reserve_datetime'])
    data[df]['reserve_datetime'] = data[df]['reserve_datetime'].dt.date
    data[df]['reserve_datetime_diff'] = data[df].apply(
        lambda r: (r['visit_datetime'] - r['reserve_datetime']).days, axis=1)
    data[df] = data[df].groupby(
        ['air_store_id', 'visit_datetime'], as_index=False)[[
            'reserve_datetime_diff', 'reserve_visitors'
        ]].sum().rename(columns={
            'visit_datetime': 'visit_date'
        })
    print(data[df].head())

           air_store_id  visit_date  reserve_datetime_diff  reserve_visitors
0  air_00a91d42b08b08d9  2016-10-31                      0                 2
1  air_00a91d42b08b08d9  2016-12-05                      4                 9
2  air_00a91d42b08b08d9  2016-12-14                      6                18
3  air_00a91d42b08b08d9  2016-12-17                      6                 2
4  air_00a91d42b08b08d9  2016-12-20                      2                 4
           air_store_id  visit_date  reserve_datetime_diff  reserve_visitors
0  air_00a91d42b08b08d9  2016-01-14                      3                 2
1  air_00a91d42b08b08d9  2016-01-15                      6                 4
2  air_00a91d42b08b08d9  2016-01-16                      3                 2
3  air_00a91d42b08b08d9  2016-01-22                      3                 2
4  air_00a91d42b08b08d9  2016-01-29                      6                 5


In [5]:
data['tra']['visit_date'] = pd.to_datetime(data['tra']['visit_date'])

In [6]:
data['tra']['day'] = data['tra']['visit_date'].dt.day
data['tra']['dow'] = data['tra']['visit_date'].dt.dayofweek
data['tra']['year'] = data['tra']['visit_date'].dt.year
data['tra']['month'] = data['tra']['visit_date'].dt.month
data['tra']['visit_date'] = data['tra']['visit_date'].dt.date

In [7]:
data['tra'].head(10)

Unnamed: 0,air_store_id,visit_date,visitors,day,dow,year,month
0,air_ba937bf13d40fb24,2016-01-13,25,13,2,2016,1
1,air_ba937bf13d40fb24,2016-01-14,32,14,3,2016,1
2,air_ba937bf13d40fb24,2016-01-15,29,15,4,2016,1
3,air_ba937bf13d40fb24,2016-01-16,22,16,5,2016,1
4,air_ba937bf13d40fb24,2016-01-18,6,18,0,2016,1
5,air_ba937bf13d40fb24,2016-01-19,9,19,1,2016,1
6,air_ba937bf13d40fb24,2016-01-20,31,20,2,2016,1
7,air_ba937bf13d40fb24,2016-01-21,21,21,3,2016,1
8,air_ba937bf13d40fb24,2016-01-22,18,22,4,2016,1
9,air_ba937bf13d40fb24,2016-01-23,26,23,5,2016,1


In [8]:
data['tes']['visit_date'] = data['tes']['id'].map(
    lambda x: str(x).split('_')[2])
data['tes']['air_store_id'] = data['tes']['id'].map(
    lambda x: '_'.join(x.split('_')[:2]))

In [10]:
data['tes']['visit_date'] = pd.to_datetime(data['tes']['visit_date'])
data['tes']['day'] = data['tes']['visit_date'].dt.day
data['tes']['dow'] = data['tes']['visit_date'].dt.dayofweek
data['tes']['year'] = data['tes']['visit_date'].dt.year
data['tes']['month'] = data['tes']['visit_date'].dt.month
data['tes']['visit_date'] = data['tes']['visit_date'].dt.date

In [11]:
data['tes'].head(10)

Unnamed: 0,id,visitors,visit_date,air_store_id,day,dow,year,month
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,23,6,2017,4
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,24,0,2017,4
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,25,1,2017,4
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,26,2,2017,4
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,27,3,2017,4
5,air_00a91d42b08b08d9_2017-04-28,0,2017-04-28,air_00a91d42b08b08d9,28,4,2017,4
6,air_00a91d42b08b08d9_2017-04-29,0,2017-04-29,air_00a91d42b08b08d9,29,5,2017,4
7,air_00a91d42b08b08d9_2017-04-30,0,2017-04-30,air_00a91d42b08b08d9,30,6,2017,4
8,air_00a91d42b08b08d9_2017-05-01,0,2017-05-01,air_00a91d42b08b08d9,1,0,2017,5
9,air_00a91d42b08b08d9_2017-05-02,0,2017-05-02,air_00a91d42b08b08d9,2,1,2017,5


In [12]:
unique_stores = data['tes']['air_store_id'].unique()

In [13]:
stores = pd.concat(
    [
        pd.DataFrame({
            'air_store_id': unique_stores,
            'dow': [i] * len(unique_stores)
        }) for i in range(7)
    ],
    axis=0,
    ignore_index=True).reset_index(drop=True)

In [14]:
#sure it can be compressed...
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].min().rename(columns={
        'visitors': 'min_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])

In [15]:
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].mean().rename(columns={
        'visitors': 'mean_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])

In [16]:
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].median().rename(columns={
        'visitors': 'median_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])

tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].max().rename(columns={
        'visitors': 'max_visitors'
    })
stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])

In [17]:
tmp = data['tra'].groupby(
    ['air_store_id', 'dow'],
    as_index=False)['visitors'].count().rename(columns={
        'visitors': 'count_observations'
    })

tmp['mode_visitors'] = pd.DataFrame(
    data['tra'].groupby(['air_store_id', 'dow'],
    as_index=False)['visitors'].apply(lambda r: mode(r)[0][0]).values, columns=['mode_visitors'])['mode_visitors']

stores = pd.merge(stores, tmp, how='left', on=['air_store_id', 'dow'])

In [19]:
stores = pd.merge(stores, data['as'], how='left', on=['air_store_id'])

In [22]:
lbl = preprocessing.LabelEncoder()
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

In [23]:
stores.head(10)

Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,mode_visitors,air_genre_name,air_area_name,latitude,longitude
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0,47.0,35.0,18.0,6,44,35.694003,139.753595
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0,19.0,20.0,2.0,6,62,35.658068,139.751599
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0,23.0,63.0,6.0,7,82,35.712607,139.779996
3,air_0328696196e46f18,0,2.0,6.416667,4.0,27.0,12.0,2.0,4,98,34.701279,135.52809
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0,66.0,37.0,6.0,2,102,34.692337,135.472229
5,air_036d4f1ee7285390,0,4.0,19.6,19.0,38.0,40.0,19.0,2,31,34.799767,135.360073
6,air_0382c794b73b51ad,0,1.0,20.795455,21.0,47.0,44.0,21.0,2,68,35.602125,139.671958
7,air_03963426c9312048,0,2.0,26.030303,26.0,70.0,66.0,11.0,7,15,34.386245,132.455018
8,air_04341b588bde96cd,0,5.0,35.41791,33.0,76.0,67.0,30.0,7,66,35.735623,139.651658
9,air_049f6d5b402a31b2,0,2.0,9.027778,9.0,20.0,36.0,2.0,8,0,33.589216,130.392813


In [24]:
data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date'])
data['hol']['day_of_week'] = lbl.fit_transform(data['hol']['day_of_week'])
data['hol']['visit_date'] = data['hol']['visit_date'].dt.date

In [25]:
data['hol'].head()

Unnamed: 0,visit_date,day_of_week,holiday_flg
0,2016-01-01,0,1
1,2016-01-02,2,1
2,2016-01-03,3,1
3,2016-01-04,1,0
4,2016-01-05,5,0


In [98]:
train = pd.merge(data['tra'], data['hol'], how='left', on=['visit_date'])
test = pd.merge(data['tes'], data['hol'], how='left', on=['visit_date'])

train = pd.merge(train, stores, how='left', on=['air_store_id', 'dow'])
test = pd.merge(test, stores, how='left', on=['air_store_id', 'dow'])

for df in ['ar', 'hr']:
    train = pd.merge(
        train, data[df], how='left', on=['air_store_id', 'visit_date'])
    test = pd.merge(
        test, data[df], how='left', on=['air_store_id', 'visit_date'])

col = [
    c for c in train
    if c not in ['id', 'air_store_id', 'visit_date', 'visitors']
]
train = train.fillna(-1)
test = test.fillna(-1)

In [99]:
print('Binding to float32')

for c, dtype in zip(train.columns, train.dtypes):
    if dtype == np.float64:
        train[c] = train[c].astype(np.float32)

for c, dtype in zip(test.columns, test.dtypes):
    if dtype == np.float64:
        test[c] = test[c].astype(np.float32)

Binding to float32


In [100]:
train_x = train.drop(['air_store_id', 'visit_date', 'visitors'], axis=1)
train_y = np.log1p(train['visitors'].values)
print(train_x.shape, train_y.shape)

(252108, 20) (252108,)


In [101]:
test_x = test.drop(['id', 'air_store_id', 'visit_date', 'visitors'], axis=1)

In [103]:
train_x = train_x.drop(['day_of_week', 'mode_visitors', 'day'], axis=1)

In [104]:
test_x = test_x.drop(['day_of_week', 'mode_visitors', 'day'], axis=1)

In [105]:
train_size = int(len(train_x) * 0.7)
test_size = len(train_x) - train_size

my_train_x = train_x[:train_size]
my_train_y = train_y[:train_size]
my_test_x  = train_x[train_size:]
my_test_y = train_y[train_size:]

In [106]:
my_train.shape

(176475, 16)

In [107]:
# Submissions are evaluated using RMSLE:
def RMSLE(y, pred):
    return mean_squared_error(y, pred)**0.5

In [108]:
# parameter tuning of lightgbm
# start from default setting
gbm0 = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=10000)
gbm0.fit(my_train_x, my_train_y, eval_metric='rmse')

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


LGBMRegressor(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.05,
       max_bin=255, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=10000,
       n_jobs=-1, num_leaves=31, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [109]:
my_test_predict_y = gbm0.predict(my_test_x)
rmsle = RMSLE(my_test_y, my_test_predict_y)
print('Test RMSLE: %.3f' % rmsle)
my_train_predict_y = gbm0.predict(my_train_x)
rmsle = RMSLE(my_train_y, my_train_predict_y)
print('Train RMSLE: %.3f' % rmsle)

Test RMSLE: 0.504
Train RMSLE: 0.417


In [112]:
# parameter tuning of lightgbm
gbm1 = lgb.LGBMRegressor(
        num_leaves = 30,
        max_depth = 13,
        learning_rate =0.01, 
        n_estimators = 10000, 
        objective = 'regression', 
        min_child_weight = 1, 
        subsample = 0.8,
        colsample_bytree=0.8,
        nthread = 7,
        reg_lambda=0.5)
gbm1.fit(my_train_x, my_train_y, eval_metric='rmse')

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


LGBMRegressor(boosting_type='gbdt', colsample_bytree=0.8, learning_rate=0.01,
       max_bin=255, max_depth=13, min_child_samples=20, min_child_weight=1,
       min_split_gain=0.0, n_estimators=10000, n_jobs=-1, nthread=7,
       num_leaves=30, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.5, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

In [113]:
my_test_predict_y = gbm1.predict(my_test_x)
rmsle = RMSLE(my_test_y, my_test_predict_y)
print('Test RMSLE: %.3f' % rmsle)
my_train_predict_y = gbm1.predict(my_train_x)
rmsle = RMSLE(my_train_y, my_train_predict_y)
print('Train RMSLE: %.3f' % rmsle)

Test RMSLE: 0.494
Train RMSLE: 0.464


In [115]:
# parameter tuning of lightgbm
gbm2 = lgb.LGBMRegressor(
        num_leaves = 31,
        max_depth = 13,
        learning_rate =0.05, 
        n_estimators = 10000, 
        objective = 'regression', 
        min_child_weight = 0.001, 
        subsample = 0.8,
        colsample_bytree=0.8,
        nthread = 7,
        reg_lambda=0.05)
gbm2.fit(my_train_x, my_train_y, eval_metric='rmse')

my_test_predict_y = gbm2.predict(my_test_x)
rmsle = RMSLE(my_test_y, my_test_predict_y)
print('Test RMSLE: %.3f' % rmsle)
my_train_predict_y = gbm2.predict(my_train_x)
rmsle = RMSLE(my_train_y, my_train_predict_y)
print('Train RMSLE: %.3f' % rmsle)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Test RMSLE: 0.503
Train RMSLE: 0.424


In [116]:
gbm1.fit(train_x, train_y, eval_metric='rmse')
predict_y = gbm1.predict(test_x)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


In [117]:
predict_y[predict_y<0] = 0

In [120]:
test['visitors'] =np.expm1(predict_y)
#test['visitors'] = np.ceil(np.expm1(predict_y))
print(test['visitors'].min())
print(test['visitors'].max())
print(test['visitors'].mean())
test['visitors'].head()

0.657599841078
130.888205035
17.6505965875


0     1.640625
1    19.210160
2    22.160179
3    27.278195
4    29.357224
Name: visitors, dtype: float64

In [121]:
#test['visitors'] = np.around(np.expm1(predict_y))
test[['id', 'visitors']].to_csv(
    'gbm0_submission_ceil_willen.csv', index=False, float_format='%.3f')