In [10]:
import sys
import re
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_log_error

sys.path.append("../")
from general.preprocess import data_preparation
from general.clf_wrappers import LgbWrapper
from features.f0 import features_set_f0
%matplotlib inline

In [2]:
# Load every competition CSV into one dict keyed by a short table alias.
# Keys are inserted in the same order as before: ar, as, hs, trn, hr, id, tst, hol.
data = {}
data['ar'] = pd.read_csv('../data/air_reserve.csv')
data['as'] = pd.read_csv('../data/air_store_info.csv')
data['hs'] = pd.read_csv('../data/hpg_store_info.csv')
# training table: its `visitors` column is the prediction target
data['trn'] = pd.read_csv('../data/air_visit_data.csv')
data['hr'] = pd.read_csv('../data/hpg_reserve.csv')
data['id'] = pd.read_csv('../data/store_id_relation.csv')
data['tst'] = pd.read_csv('../data/sample_submission.csv')
# calendar table: `calendar_date` is renamed to `visit_date`
date_info = pd.read_csv('../data/date_info.csv')
data['hol'] = date_info.rename(columns={'calendar_date': 'visit_date'})
print(1)

1


In [77]:
# Derive reservation-timing features on the AIR reservation table (data['ar']).
# The frame is modified in place; `ar` is only a shorter handle on the same object.
ar = data['ar']
ar['visit_datetime'] = pd.to_datetime(ar['visit_datetime'])
ar['reserve_datetime'] = pd.to_datetime(ar['reserve_datetime'])

# Whole-day gap between the reservation's calendar day and the visit's calendar day.
# Computed on normalized datetime64 values rather than by subtracting object columns
# of datetime.date, so `.dt.days` does not depend on pandas inferring a timedelta
# dtype from object arithmetic.
lead_days = (ar['visit_datetime'].dt.normalize() -
             ar['reserve_datetime'].dt.normalize()).dt.days

# Date columns are stored as 'YYYY-MM-DD' strings for later merge operations.
ar['visit_date'] = ar['visit_datetime'].dt.date.astype(str)
ar['visit_time'] = ar['visit_datetime'].dt.hour
ar['reserve_date'] = ar['reserve_datetime'].dt.date.astype(str)
ar['reserve_time'] = ar['reserve_datetime'].dt.hour
# Despite its name, this column is the day of week of the visit (0 = Monday).
ar['reserve_to_visit_dow'] = ar['visit_datetime'].dt.dayofweek
# Hour gap: whole days * 24 plus the difference of the two hour-of-day values.
ar['visit_minus_reverse_hours'] = lead_days * 24 + (ar['visit_time'] - ar['reserve_time'])
ar['visit_minus_reverse_days'] = lead_days
data['ar'].head()

Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors,visit_date,visit_time,reserve_date,reserve_time,visit_minus_reverse_hours,visit_minus_reverse_days,reserve_to_visit_dow
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1,2016-01-01,19,2016-01-01,16,3,0,4
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,3,2016-01-01,19,2016-01-01,19,0,0,4
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,6,2016-01-01,19,2016-01-01,19,0,0,4
3,air_877f79706adbfb06,2016-01-01 20:00:00,2016-01-01 16:00:00,2,2016-01-01,20,2016-01-01,16,4,0,4
4,air_db80363d35f10926,2016-01-01 20:00:00,2016-01-01 01:00:00,5,2016-01-01,20,2016-01-01,1,19,0,4


In [82]:
# Per-store reservation summaries computed from data['ar'] and left-merged onto
# the AIR store info table (data['as']).
# Each spec is (source column in data['ar'], aggregation, output column name);
# the list order is the column order appended to data['as'].
store_feature_specs = [
    # most frequent value per store, as ordered by value_counts()
    ('reserve_to_visit_dow', lambda x: x.value_counts().index[0], 'reserve_to_visit_dow_mode'),
    ('reserve_visitors', 'count', 'reserve_tot_count_by_store'),
    ('visit_minus_reverse_hours', 'mean', 'avg_reserve_hr_day_by_store'),
    ('visit_minus_reverse_hours', 'max', 'max_reserve_hr_by_store'),
    ('visit_minus_reverse_hours', 'min', 'min_reserve_hr_by_store'),
    # Same statistic as avg_reserve_hr_day_by_store; both names are kept so the
    # resulting schema is unchanged.
    ('visit_minus_reverse_hours', 'mean', 'mean_reserve_hr_by_store'),
    ('visit_minus_reverse_days', 'min', 'min_reserve_dy_by_store'),
    ('visit_minus_reverse_days', 'mean', 'mean_reserve_dy_by_store'),
    ('visit_minus_reverse_days', 'max', 'max_reserve_dy_by_store'),
]
store_feature_names = [out_name for _, _, out_name in store_feature_specs]

# Group once and build all feature columns side by side instead of nine
# separate groupby + merge passes.
reserve_by_store = data['ar'].groupby('air_store_id')
store_features = pd.concat(
    [reserve_by_store[src_col].agg(agg_func).rename(out_name)
     for src_col, agg_func, out_name in store_feature_specs],
    axis=1)
store_features.index.name = 'air_store_id'
store_features = store_features.reset_index(drop=False)

# Drop feature columns left by an earlier run of this cell so that re-running it
# replaces them instead of producing `_x` / `_y` suffixed duplicates.
base_columns = [col for col in data['as'].columns if col not in store_feature_names]
data['as'] = pd.merge(data['as'][base_columns], store_features,
                      how='left', on='air_store_id')
data['as'].head()

Unnamed: 0,air_store_id,air_genre_name,air_area_name,latitude,longitude,reserve_to_visit_dow_mode,reserve_tot_count_by_store,avg_reserve_hr_day_by_store,max_reserve_hr_by_store,min_reserve_hr_by_store,mean_reserve_hr_by_store,min_reserve_dy_by_store,mean_reserve_dy_by_store,max_reserve_dy_by_store
0,air_0f0cdeee6c9bf3d7,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,5.0,487.0,162.86037,1299.0,0.0,162.86037,0.0,6.747433,54.0
1,air_7cc17a324ae5c7dc,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,5.0,161.0,235.913043,3259.0,1.0,235.913043,0.0,9.875776,136.0
2,air_fee8dcf4d619598e,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,0.0,2.0,196.0,317.0,75.0,196.0,3.0,8.0,13.0
3,air_a17f0778617c76e2,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,3.0,784.0,2501.855867,7540.0,2.0,2501.855867,0.0,104.122449,314.0
4,air_83db5aff8f50478e,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,4.0,193.0,303.466321,2189.0,1.0,303.466321,0.0,12.544041,91.0


In [87]:
def see_it(df):
    """Store the global ``data['as']`` object under key ``'ap'`` of ``df``.

    ``df`` is modified in place through item assignment; nothing is returned.
    """
    store_info = data['as']
    df['ap'] = store_info

In [88]:
# Calls see_it on the `data` dict itself, which adds an 'ap' entry
# referring to the same object as data['as'].
see_it(data)

In [28]:
# Per-column flag: True if the column of data['ar_g'] contains any missing value.
# NOTE(review): no cell shown in this part of the notebook creates data['ar_g'] —
# confirm it is built before this cell on a fresh Restart & Run All.
data['ar_g'].isnull().any(axis=0)

air_store_id          False
visit_date            False
reserve_ppl_count     False
reserve_tot_count     False
avg_reserve_hr_day    False
max_reserve_hr        False
min_reserve_hr        False
mean_reserve_hr       False
min_reserve_dy        False
mean_reserve_dy       False
max_reserve_dy        False
dtype: bool

In [19]:
# Derive reservation-timing features on the HPG reservation table (data['hr']).
# The frame is modified in place; `hr` is only a shorter handle on the same object.
hr = data['hr']
hr['visit_datetime'] = pd.to_datetime(hr['visit_datetime'])
hr['reserve_datetime'] = pd.to_datetime(hr['reserve_datetime'])

# Whole-day gap between the reservation's calendar day and the visit's calendar day.
# Computed on normalized datetime64 values rather than by subtracting object columns
# of datetime.date, so `.dt.days` does not depend on pandas inferring a timedelta
# dtype from object arithmetic.
hr_lead_days = (hr['visit_datetime'].dt.normalize() -
                hr['reserve_datetime'].dt.normalize()).dt.days

# Date columns are stored as 'YYYY-MM-DD' strings for later merge operations.
hr['visit_date'] = hr['visit_datetime'].dt.date.astype(str)
hr['visit_time'] = hr['visit_datetime'].dt.hour
hr['reserve_date'] = hr['reserve_datetime'].dt.date.astype(str)
hr['reserve_time'] = hr['reserve_datetime'].dt.hour
# Hour gap: whole days * 24 plus the difference of the two hour-of-day values.
hr['visit_minus_reverse_hours'] = hr_lead_days * 24 + (hr['visit_time'] - hr['reserve_time'])
hr['visit_minus_reverse_days'] = hr_lead_days

# groupby operations under hr_g: one row per (hpg_store_id, visit_date).
# Each spec is (source column in data['hr'], aggregation, output column name);
# the list order is the column order of data['hr_g'] after the two key columns.
hr_group_keys = ['hpg_store_id', 'visit_date']
hr_feature_specs = [
    ('reserve_visitors', 'sum', 'reserve_ppl_count'),
    ('reserve_visitors', 'count', 'reserve_tot_count'),
    ('visit_minus_reverse_hours', 'mean', 'avg_reserve_hr_day'),
    ('visit_minus_reverse_hours', 'max', 'max_reserve_hr'),
    ('visit_minus_reverse_hours', 'min', 'min_reserve_hr'),
    # Same statistic as avg_reserve_hr_day; both names are kept so the
    # resulting schema is unchanged.
    ('visit_minus_reverse_hours', 'mean', 'mean_reserve_hr'),
    ('visit_minus_reverse_days', 'min', 'min_reserve_dy'),
    ('visit_minus_reverse_days', 'mean', 'mean_reserve_dy'),
    ('visit_minus_reverse_days', 'max', 'max_reserve_dy'),
]

# Group once and place the aggregates side by side; every aggregate shares the
# same group index, so this replaces the chain of merges on the group keys.
hr_by_store_day = hr.groupby(hr_group_keys)
hr_features = pd.concat(
    [hr_by_store_day[src_col].agg(agg_func).rename(out_name)
     for src_col, agg_func, out_name in hr_feature_specs],
    axis=1)
hr_features.index.names = hr_group_keys
data['hr_g'] = hr_features.reset_index(drop=False)

In [45]:
# (rows, columns) of data['ar_g'].
# NOTE(review): data['ar_g'] is not created in any cell shown in this part of the
# notebook — confirm where it is built.
data['ar_g'].shape

(29830, 11)

In [47]:
import matplotlib.pyplot as plt

In [63]:
def log1p_transform(df, cols):
    """Add a ``<col>_log1p`` column holding ``np.log1p(df[col])`` for each column.

    Parameters
    ----------
    df : object supporting ``df[col]`` lookup and assignment (e.g. a DataFrame)
        Modified in place: one new column is added per entry of ``cols``.
    cols : iterable of str
        Names of the existing columns to transform.

    Returns
    -------
    The same ``df`` object, so that ``result = log1p_transform(df, cols)`` binds
    the transformed frame instead of ``None``.
    """
    for col in cols:
        df[col + '_log1p'] = np.log1p(df[col])
    return df

In [59]:
# Columns of data['ar_g'] that get a log1p-transformed companion column.
columns = [ 
    'reserve_ppl_count', 'reserve_tot_count', 'avg_reserve_hr_day']
# log1p_transform writes the new '<col>_log1p' columns into data['ar_g'] itself;
# `temp` is bound to whatever the function returns.
temp = log1p_transform(data['ar_g'], cols=columns)