In [16]:
import json
import pandas as pd
import dill as pickle

class DataUtil2:
    """"""
    @classmethod
    def load(cls, file, format, date_cols= None):
        """"""
        data = ''
        if(format== 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format== 'json'):
            with open(file, 'r') as i_file:
                data = json.load(file)
            i_file.close()
        elif(format== 'pkl'):
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
            i_file.close()
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key='undefined')

        return  data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """"""
        if(format == 'csv'):
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
            o_file.close()
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                pickle.dump(data, o_file, -1)
            o_file.close()
        elif(format== 'hdf'):
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')

        return



In [17]:
#######################
# LigthGBM Regression #
#######################
import numpy as np
import pandas as pd
import datetime
import time
import os,sys
import gc
from sklearn import *
import lightgbm
import random

drop_cols = ['id', 'visit_date', 'visitors', 'hpg_store_id', 'fold', 'air_store_id']

cate_cols = ['store_id_encoded', 'area_name', 'city', 'genre_name']
cate_feats = ['dow', 'hol_days', 'day', 'pom', 'prev_is_holiday', 'next_is_holiday', 
              'wom', 'woy', 'is_weekends', 'holiday_flg', 'month', 'is_up_corner']
for mod in ['air', 'hpg']:
    cate_feats.extend(['%s_%s' % (mod, c) for c in cate_cols])
    
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

DataBaseDir = '../../../data'
InputDir = '%s/l0/kfold' % DataBaseDir
kfold = 5
strategy = 'lgb_l2'
#### load data
valid_dfs = []
holdout_dfs = []
test_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    holdout = pd.read_csv('%s/holdout.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    test = pd.read_csv('%s/test.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    #
    valid['fold'] = fold
    valid_dfs.append(valid)
    holdout_dfs.append(holdout)
    test_dfs.append(test)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True)
##### model selection with CV
# score
cv_score = .0
holdout_score = .0
# parameters
params = {
    "boosting": "gbdt",
    "objective": "regression_l2",
    "lambda_l2": [500, 200, 100],
    
    "num_iterations": 150,
    "learning_rate": [0.8, 0.4, 0.2],
    "min_data_in_leaf": 20,
    'num_leaves': 255,
    "max_depth": [12, 16, 20],

    "feature_fraction": 0.90,
    "bagging_fraction": 0.85,
    "bagging_freq": 20,
    "min_hessian": 0.001,

    "max_bin": 63,
}
col = [c for c in TrainData.columns if c not in drop_cols]
K = int(0.3 * len(col))
selected_cols = [ col[i] for i in sorted(random.sample(range(len(col)), K))]
OutputDir = '%s/MM/l1/0' % DataBaseDir
if(os.path.exists(OutputDir) == False):
    os.makedirs(OutputDir)
with open('%s/sub_feats.txt' % OutputDir, 'w') as o_file:
    for feat in selected_cols:
        o_file.write('%s\n' % feat)
o_file.close()
start = time.time()
##
BestParmas = {}
BestScore = 1.0
for l2 in params['lambda_l2']:
    for lr in params['learning_rate']:
        for depth in params['max_depth']:
            cv_rmsle = .0
            for fold in range(kfold):
                FoldData = {
                    'train': TrainData[TrainData['fold'] != fold],
                    'valid': TrainData[TrainData['fold'] == fold]
                }
                col = [c for c in FoldData['train'].columns if(c not in drop_cols)]
                d_cv = lightgbm.Dataset(FoldData['train'][col], 
                                        label= FoldData['train']['visitors'], 
                                        max_bin= params['max_bin'], 
                                        silent= True, 
                                        free_raw_data= True)
                param = {
                    'boosting': 'gbdt',
                    'objective': 'regression_l2',
                        
                    'lambda_l2': l2,
                    'learning_rate': lr,
                    'max_depth': depth,

                    'num_iterations': params['num_iterations'],
                    'feature_fraction': params['feature_fraction'],
                    'bagging_fraction': params['bagging_fraction'],
                    'bagging_freq': params['bagging_freq'],
                    'min_hessian': params['min_hessian'],
                    'max_bin': params['max_bin'],
                }
                model = lightgbm.train(param, d_cv)
                FoldData['valid'][strategy] = model.predict(FoldData['valid'][col])
                rmsle_valid = RMSLE(FoldData['valid']['visitors'], FoldData['valid'][strategy])
                cv_rmsle += rmsle_valid
            cv_rmsle /= kfold
            if(cv_rmsle < BestScore):
                BestScore = cv_rmsle
                BestParmas['lambda_l2'] = l2
                BestParmas['learning_rate'] = lr
                BestParmas['max_depth'] = depth
            end = time.time()
            print('running for params, l2 %s, lr %s, depth %s done, cv score %.5f, time elapsed %.2fs' % (l2, lr, depth, cv_rmsle, (end - start)))
end = time.time()
print('grid search done, time elapsed %.2fs' % (end - start))            
## retrain and store
for fold in range(kfold):
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'holdout': holdout_dfs[fold],
        'test': test_dfs[fold]
    }
    # train
    d_cv = lightgbm.Dataset(FoldData['train'][selected_cols], label= FoldData['train']['visitors'].values, max_bin= params['max_bin'], silent= True, free_raw_data= True)
    param = {
        'boosting': 'gbdt',
        'objective': 'regression_l2',
                
        'lambda_l2': BestParmas['lambda_l2'],
        'learning_rate': BestParmas['learning_rate'],
        'max_depth': BestParmas['max_depth'],
                        
        'num_iterations': 500,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 20,
        'min_hessian': 0.001,
        'max_bin': 63
    }
    model = lightgbm.train(param, d_cv)
    # for valid
    FoldData['valid'][strategy] = model.predict(FoldData['valid'][selected_cols])
    rmsle_valid = RMSLE(FoldData['valid']['visitors'].values, FoldData['valid'][strategy])
    cv_score += rmsle_valid
    # for holdout
    FoldData['holdout'][strategy] = model.predict(FoldData['holdout'][selected_cols])
    rmsle_holdout = RMSLE(FoldData['holdout']['visitors'].values, FoldData['holdout'][strategy])
    holdout_score += rmsle_holdout
    # for test
    FoldData['test'][strategy] = model.predict(FoldData['test'][selected_cols])

    print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (fold, rmsle_valid, rmsle_holdout, len(FoldData['valid'])))  
    #### output
    FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    for mod in FoldData.keys():
        if(mod == 'train'):
            continue
        OutCols = []
        if(mod == 'test'):
            OutCols.append('id')
        OutCols.extend(['air_store_id', 'visit_date', 'visitors', strategy])
        OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
        OutFoldData = FoldData[mod][OutCols]
        OutFoldData.to_csv(OutputFile, index= False)
    
cv_score /= kfold # Average valid set predictions
holdout_score /= kfold # Average holdout set predictions

end = time.time()
print('\n======================')
print("CV score %.4f, Holdout score %.4f, Elapsed time: %.2fs" % (cv_score, holdout_score, (end - start)))
print('======================\n')

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


l2 200.000000, learning rate 0.800000, depth 8, cv score 0.50779
running for params, l2 200, lr 0.8, depth 8 done, time elapsed 17.71s
l2 200.000000, learning rate 0.800000, depth 12, cv score 0.50843
running for params, l2 200, lr 0.8, depth 12 done, time elapsed 35.33s
l2 200.000000, learning rate 0.800000, depth 16, cv score 0.50890
running for params, l2 200, lr 0.8, depth 16 done, time elapsed 53.89s
l2 200.000000, learning rate 0.200000, depth 8, cv score 0.50072
running for params, l2 200, lr 0.2, depth 8 done, time elapsed 72.22s
l2 200.000000, learning rate 0.200000, depth 12, cv score 0.50079
running for params, l2 200, lr 0.2, depth 12 done, time elapsed 91.36s
l2 200.000000, learning rate 0.200000, depth 16, cv score 0.50042
running for params, l2 200, lr 0.2, depth 16 done, time elapsed 110.38s
l2 200.000000, learning rate 0.020000, depth 8, cv score 0.51387
running for params, l2 200, lr 0.02, depth 8 done, time elapsed 132.92s
l2 200.000000, learning rate 0.020000, depth

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fold 0: valid score 0.538332, holdout score 0.530405, valid length 45371
fold 1: valid score 0.531885, holdout score 0.531837, valid length 45371
fold 2: valid score 0.536929, holdout score 0.530826, valid length 45371
fold 3: valid score 0.539145, holdout score 0.530691, valid length 45370
fold 4: valid score 0.534091, holdout score 0.531284, valid length 45370

CV score 0.5361, Holdout score 0.5310, Elapsed time: 541.90s

