In [2]:
import numpy as np
import pandas as pd
import datetime
import time
import numba
import os,sys

def LoadData(InputDir):
    """"""
    ## load raw data
    data = {
        'tra': pd.read_csv('%s/air_visit_data.csv' % InputDir, parse_dates= ['visit_date']),
        'as': pd.read_csv('%s/air_store_info.csv' % InputDir),
        'hs': pd.read_csv('%s/hpg_store_info.csv' % InputDir),
        'ar': pd.read_csv('%s/air_reserve.csv' % InputDir, parse_dates= ['visit_datetime', 'reserve_datetime']),
        'hr': pd.read_csv('%s/hpg_reserve.csv' % InputDir, parse_dates= ['visit_datetime', 'reserve_datetime']),
        'id': pd.read_csv('%s/store_id_relation.csv' % InputDir),
        'tes': pd.read_csv('%s/sample_submission.csv' % InputDir),
        'hol': pd.read_csv('%s/date_info.csv' % InputDir, parse_dates=['calendar_date']).rename(columns={'calendar_date': 'visit_date'})
    }
    return data

@numba.jit
def ApplyDayoff(VisitCols, ReserveCols):
    """"""
    n = len(VisitCols)
    result = np.zeros((n, 1), dtype= 'int8')
    for i in range(n):
        result[i] = (VisitCols[i]- ReserveCols[i]).days
    return result

reserve2id = {'ar': 'air', 'hr': 'hpg'}
reserve2store = {'ar': 'as', 'hr': 'hs'}# load data set
InputDir = '../../data/raw'
DataSet = LoadData(InputDir)
#### 
# date related features
print('\n============')
for mod in ['tra', 'tes']:
    start0 = time.time()
    if (mod == 'tes'):
        DataSet[mod]['visit_date'] = DataSet[mod]['id'].map(lambda x: str(x).split('_')[2])
        DataSet[mod]['air_store_id'] = DataSet[mod]['id'].map(lambda x: '_'.join(x.split('_')[:2]))
        DataSet[mod]['visit_date'] = pd.to_datetime(DataSet[mod]['visit_date'])
    DataSet[mod]['dow'] = DataSet[mod]['visit_date'].dt.dayofweek
    DataSet[mod]['year'] = DataSet[mod]['visit_date'].dt.year
    DataSet[mod]['month'] = DataSet[mod]['visit_date'].dt.month
    DataSet[mod]['visit_date'] = DataSet[mod]['visit_date'].dt.date
    end0 = time.time()
    print('%s data: unique stores %s, total %s, time elased %.2fs.' %
            (mod, len(DataSet[mod]['air_store_id'].unique()), len(DataSet[mod]['air_store_id']), (end0 - start0)))
print('')
# for reservation data
# for mod in ['hr', 'ar']:
#     start1 = time.time()
#     DataSet[mod]['visit_date'] = DataSet[mod]['visit_datetime'].dt.date
#     DataSet[mod].drop(['visit_datetime'], axis= 1, inplace= True)
#     DataSet[mod]['reserve_date'] = DataSet[mod]['reserve_datetime'].dt.date
#     DataSet[mod].drop(['reserve_datetime'], axis= 1, inplace= True)
#     end1 = time.time()
#     print('time-consuming part %.2f.' % (end1 - start1))

DataSet['hol']['visit_date'] = DataSet['hol']['visit_date'].dt.date
end0 = time.time()
print('=============')
print('process date done, time consumed %.2f.\n' % (end0 - start0))
######## store data
# add city feature
for mod in ['ar', 'hr']:
    DataSet[reserve2store[mod]]['%s_city' % reserve2id[mod]] = DataSet[reserve2store[mod]]['%s_area_name' % reserve2id[mod]].str[:5]
# area (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_area_name' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_area_name' % reserve2id[mod]] = g
        ac['%s_area_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        #ac['%s_area_store_ratio' % reserve2id[mod]] = ac['%s_area_store_count' % reserve2id[mod]]/len(DataSet[reserve2store[mod]])
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_area_name' % reserve2id[mod]])
# genre (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_genre_name' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_genre_name' % reserve2id[mod]] = g
        ac['%s_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_genre_name' % reserve2id[mod]])
#  area_genre (store) count 
for mod in ['ar', 'hr']:
    rec = []
    groupby_keys = ['%s_area_name' % reserve2id[mod], '%s_genre_name' % reserve2id[mod]]
    groupped = DataSet[reserve2store[mod]].groupby(groupby_keys)
    for g in groupped.groups:
        ac = {}
        ac['%s_area_name' % reserve2id[mod]] = g[0]
        ac['%s_genre_name' % reserve2id[mod]] = g[1]
        ac['%s_area_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= groupby_keys)
# city (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_city' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_city' % reserve2id[mod]] = g
        ac['%s_city_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        #ac['%s_area_store_ratio' % reserve2id[mod]] = ac['%s_area_store_count' % reserve2id[mod]]/len(DataSet[reserve2store[mod]])
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_city' % reserve2id[mod]])
#  city_genre (store) count 
for mod in ['ar', 'hr']:
    rec = []
    groupby_keys = ['%s_city' % reserve2id[mod], '%s_genre_name' % reserve2id[mod]]
    groupped = DataSet[reserve2store[mod]].groupby(groupby_keys)
    for g in groupped.groups:
        ac = {}
        ac['%s_city' % reserve2id[mod]] = g[0]
        ac['%s_genre_name' % reserve2id[mod]] = g[1]
        ac['%s_city_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= groupby_keys)
######### holiday data
data = DataSet['hol']
data = data.sort_values(by= 'visit_date')
def TagHoliday(df):
    ''''''
    n = len(df)
    result = ['' for x in range(n)]
    for i in range(n):
        if(i == 0):
            result[i] = 'hid_%s' % 0
        elif((df[i] - df[i-1]).days == 1):
            result[i] = result[i - 1]
        else:
            result[i] = 'hid_%s' % (int(result[i - 1].split('_')[1]) + 1)
    return result
holidays = data[data['holiday_flg'] == 1][['visit_date']]
holidays['hol_l0'] = TagHoliday(holidays['visit_date'].values)
groupped = holidays.groupby(['hol_l0'])
recs = []
for g in groupped.groups:
    hol_days = {}
    hol_days['hol_l0'] = g
    hol_days['hol_days'] = len(groupped.get_group(g))
    recs.append(hol_days)
tmpdf = pd.DataFrame(data= recs, index= range(len(recs)))
holidays = holidays.merge(tmpdf, how= 'left', on= 'hol_l0')
data = data.merge(holidays, how= 'left', on= 'visit_date')
#data['hol_l0'] = data['hol_l0'].fillna('hid_-1')
data.drop(['hol_l0'], axis= 1, inplace= True)
data['hol_days'] = data['hol_days'].fillna(0)
DataSet['hol'] = data
######## join begins
# join holiday data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    data = data.merge(DataSet['hol'], how='left', on=['visit_date'])
    data.drop(['day_of_week', 'year'], axis=1, inplace=True)
    DataSet[mod] = data
# join (air)store data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    print('size of %s is %s\n' % (mod, len(data)))
    for rtype in ['ar', 'hr']: 
        if(rtype == 'hr'):
            data = data.merge(DataSet['id'], how= 'left', on= ['air_store_id'])
        data = data.merge(DataSet[reserve2store[rtype]], how= 'left', on= ['%s_store_id' % reserve2id[rtype]])
        data.drop(['latitude', 'longitude'], axis= 1, inplace= True)
    #data.fillna(-1)
    DataSet[mod] = data
    #print(DataSet[mod].head())
    #print(data.isnull().sum())


tra data: unique stores 829, total 252108, time elased 0.76s.
tes data: unique stores 821, total 32019, time elased 0.14s.

process date done, time consumed 0.14.

size of tra is 252108

size of tes is 32019



In [3]:
#### encoding for categorial features
from sklearn import *

cate_feats = ['genre_name', 'area_name', 'city']
cate_cols = ['%s_%s' % (m, cf) for m in ['air', 'hpg'] for cf in cate_feats]
for mod in ['tra', 'tes']:
    for col in DataSet[mod].columns:
        if(col in cate_cols):
            DataSet[mod][col].fillna('unknown', inplace= True)
        else:
            DataSet[mod][col].fillna(-1, inplace= True)
print('Categorical features ', cate_cols)
TrainData = DataSet['tra']
TestData = DataSet['tes']
for col in cate_cols:
    lbl = preprocessing.LabelEncoder()
    TrainData[col] = lbl.fit_transform(TrainData[col])
    TestData[col] = lbl.transform(TestData[col])

Categorical features  ['air_genre_name', 'air_area_name', 'air_city', 'hpg_genre_name', 'hpg_area_name', 'hpg_city']


In [4]:
import lightgbm
import sys,os

def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

## choose 2016/1/1 - 2016/4/22 as train, then 2016/4/22 - 2016/5/31 as test
## split train into 3 folds
K = 3
test = TrainData[(TrainData['visit_date'] >= datetime.date(2016, 4, 22)) & 
                    (TrainData['visit_date'] < datetime.date(2016, 5, 31)) ]
train = TrainData[TrainData['visit_date'] < datetime.date(2016, 4, 22)]
conditions = [
    train['visit_date'] < datetime.date(2016, 1, 22),
    train['visit_date'] < datetime.date(2016, 2, 22),
    train['visit_date'] < datetime.date(2016, 3, 22),
    train['visit_date'] < datetime.date(2016, 4, 22)
]
choices = list(range(K + 1))
train['fold'] = np.select(conditions, choices, default= -1)
params = {
    "boosting": "gbdt",
    "objective": "regression_l2",
    "lambda_l2": 10,

    "num_iterations": 100,
    "learning_rate": 0.2,
    "min_data_in_leaf": 100,
    "max_depth": 20, 

    "feature_fraction": 0.9,
    "bagging_fraction": 0.85,
    "bagging_freq": 12, 
    "min_hessian": 0.001,

    "max_bin": 127,
}
## 3-Fold 
col = [c for c in TrainData.columns if c not in ['id', 'air_store_id', 'visit_date', 'visitors', 'hpg_store_id', 'fold']]
print('All features ', col)
start_time = datetime.datetime.now()
cv_score = .0
test_score = .0
for i in range(K):
    # Create data for this fold
    X_train = train[train['fold'] <= i][col]
    y_train = np.log1p(train[train['fold'] <= i]['visitors'])
    X_valid = train[train['fold'] == i+1][col]
    y_valid = np.log1p(train[train['fold'] == i+1]['visitors'])
    # train with fold data
    d_cv = lightgbm.Dataset(X_train, label= y_train.values, max_bin= params['max_bin'], silent= True, free_raw_data= True)
    model = lightgbm.train(params, d_cv)
    # for valid
    pred = model.predict(X_valid)
    rmsle_valid = RMSLE(y_valid, pred)
    cv_score += rmsle_valid

    print('fold %s: valid score %.6f, train lenght %s, valid length %s' % (i, rmsle_valid, len(X_train), len(X_valid)))

# training with whole data of 2016/1/1 - 2016/4/22
X_test = test[col]
y_test = np.log1p(test['visitors'])
d_cv = lightgbm.Dataset(train[col], label= np.log1p(train['visitors']).values, max_bin= params['max_bin'], silent= True, free_raw_data= True)
model = lightgbm.train(params, d_cv)
# for test
pred = model.predict(X_test)
rmsle_test = RMSLE(y_test, pred)
#print('test score %.6f' % rmsle_test)

finish_time = datetime.datetime.now()
elapsed = (finish_time - start_time).seconds
cv_score /= K # Average valid set predictions
test_score = rmsle_test # Average holdout set predictions
print('\n======================')
print("CV score %.6f, Holdout score %.6f, Elapsed time: %.2fs" % (cv_score, test_score, elapsed))
print('======================\n')

#### train with whole data of 2017/1/1 - 2017/4/22
train = TrainData[(TrainData['visit_date'] >= datetime.date(2017, 1, 1)) & 
                  (TrainData['visit_date'] < datetime.date(2017, 4, 22))]
X_test = TestData[col]
d_cv = lightgbm.Dataset(train[col], label= np.log1p(train['visitors']).values, max_bin= params['max_bin'], silent= True, free_raw_data= True)
model = lightgbm.train(params, d_cv)
pred = model.predict(X_test)
# Create submission file
sub = pd.DataFrame()
sub['id'] = TestData['id']
sub['visitors'] = np.expm1(pred)
sub.to_csv('gbm_submit.csv', float_format='%.6f', index=False)
os.system('zip lgb.zip gbm_submit.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


All features  ['dow', 'month', 'holiday_flg', 'hol_days', 'air_genre_name', 'air_area_name', 'air_city', 'air_area_store_count', 'air_genre_store_count', 'air_area_genre_store_count', 'air_city_store_count', 'air_city_genre_store_count', 'hpg_genre_name', 'hpg_area_name', 'hpg_city', 'hpg_area_store_count', 'hpg_genre_store_count', 'hpg_area_genre_store_count', 'hpg_city_store_count', 'hpg_city_genre_store_count']
fold 0: valid score 0.658411, train lenght 4691, valid length 8142
fold 1: valid score 0.641554, train lenght 12833, valid length 7663
fold 2: valid score 0.644245, train lenght 20496, valid length 8319

CV score 0.648070, Holdout score 0.660875, Elapsed time: 3.00s



0