In [1]:
import sys
import re
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_log_error

sys.path.append("../")
from general.preprocess import data_preparation
from general.clf_wrappers import LgbWrapper
from features.f0 import features_set_f0

In [2]:
full_data, ntrain, ntest = data_preparation()

air_store dataframe shape: (829, 57)


In [4]:
trn = full_data[:ntrain]
tst = full_data[ntrain:]
print('train data size:', trn.shape, 'test data size:', tst.shape)

train data size: (252108, 123) test data size: (32019, 123)


In [5]:
TARGET = 'visitors'
FEATURES = features_set_f0()
SEED = 177

lgb_params = {}
lgb_params['objective'] = 'regression'
lgb_params['metric'] = 'l2_root'
lgb_params['n_jobs'] = -1
lgb_params['learning_rate'] = 0.1 # shrinkage_rate
lgb_params['random_state'] = 177
lgb_params['n_estimators'] = 50

lgb_params['num_leaves'] = 512 
lgb_params['lambda_l1'] = 1
lgb_params['lambda_l2'] = 1
lgb_params['bagging_fraction'] = 0.85 # sub_row
lgb_params['feature_fraction'] = 0.85
lgb_params['bagging_freq'] = 3


clf = LgbWrapper(seed=SEED, params=lgb_params)

In [6]:
def score_valid(y_true, y_valid):
    score = np.sqrt(mean_squared_log_error(y_true, y_valid))
    return score


def oof_score_analysis():
    '''
        simple analysis of the oof_score and score in each folds
    :return:
    '''
    return 0

def get_store_ids():
    df = pd.read_csv('../data/air_store_info.csv')
    return df['air_store_id']

In [33]:
def cross_validate(full_data, clf, seed, ntrain, ntest, features, target, nfolds=5):
    # define
    lst_vld_date = pd.to_datetime('2017-4-22')  # one day before test
    v06_str = lst_vld_date - pd.DateOffset(days=38)
    v33_str = lst_vld_date - pd.DateOffset(days=32)
    
    # setup a validation set from 3/15 -3/20 and 3/21 - 4/22.
    trn = full_data[:ntrain]
    tst = full_data[ntrain:]   
    tvd = trn[trn.visit_date < v06_str]  # tvd: train set on valid set
    v06 = trn[trn.visit_date.between(v06_str, v33_str - pd.DateOffset(days=1))]
    v33 = trn[trn.visit_date.between(v33_str, lst_vld_date)]
    
    # assertions
    assert(v33.visit_date.max() == lst_vld_date)
    assert((v33.visit_date.min() - v06.visit_date.max()).days == 1)
    assert((v06.visit_date.max() - v06.visit_date.min()).days == 5)
    assert((v33.visit_date.max() - v33.visit_date.min()).days == 32)

    oof_v06 = np.zeros((v06.shape[0],))
    oof_v33 = np.zeros((v33.shape[0],))
    oof_tst = np.zeros((ntest,))
    oof_tst_fld = np.empty((ntest, nfolds))
    oof_score = [None]*2
    print('oof_v06 shape: ', oof_v06.shape, 'oof_v33 shape:', oof_v33.shape)
    
    # split stores
    store_ids = get_store_ids()
    folds = KFold(n_splits=nfolds, shuffle=True, random_state=seed).split(store_ids)

    for i, ids in enumerate(folds):
        # for predicting the test
        trn_idx = trn.air_store_id.isin(store_ids[ids[0]])
        
        x_trn = trn[trn_idx][features].values
        y_trn = trn[trn_idx][target].values
        
        x_tst = tst[features].values

        # for predicting the valid
        tvd_idx = tvd.air_store_id.isin(store_ids[ids[0]])
        v06_idx = v06.air_store_id.isin(store_ids[ids[1]])
        v33_idx = v33.air_store_id.isin(store_ids[ids[1]])
        
        x_tvd = tvd[tvd_idx][features].values
        y_tvd = tvd[tvd_idx][target].values
        
        x_v06 = v06[v06_idx][features].values        
        x_v33 = v33[v33_idx][features].values
        
        # train on train_valid set and predict on v06/v33 set
        clf.train(x_tvd, y_tvd)
        oof_v06[v06_idx] = clf.predict(x_v06)
        oof_v33[v33_idx] = clf.predict(x_v33)   
        
        # train on train_full set and predict on test
        clf.train(x_trn, y_trn)
        oof_tst_fld[:, i] = clf.predict(x_tst)
        
        del x_trn, y_trn, x_tst, x_tvd, y_tvd, x_v06, x_v33
        gc.collect()

    oof_tst[:] = oof_tst_fld.mean(axis=1)
    oof_score[0] = score_valid(v06[target].values, oof_v06)
    oof_score[1] = score_valid(v33[target].values, oof_v33)

    return oof_tst.reshape(-1, 1), oof_v06.reshape(-1, 1), oof_v33.reshape(-1, 1), oof_score

In [34]:
results = cross_validate(full_data, clf, SEED, ntrain, ntest, FEATURES, TARGET)

oof_v06 shape:  (1615,) oof_v33 shape: (8721,)


In [35]:
np.sqrt(results[3])

array([0.52541361, 0.4935467 ])

In [None]:
0.47 0.48