In [22]:
#######################
# meta-tree models    #
#######################
import numpy as np
import pandas as pd
import datetime
import time
import os,sys
import gc
from sklearn import *
import lightgbm
import random
import json

# Column names grouped under "drop": ids, dates, the target, fold marker and
# reservation counts. NOTE(review): presumably the columns to exclude from the
# feature matrix -- the list is not referenced in the visible part of this file.
drop_cols = [
    'id',
    'visit_date',
    'visitors',
    'hpg_store_id',
    'fold',
    'air_store_id',
    'air_store_id_encoded',
    'hpg_store_id_encoded',
    'air_reserved_visitors',
    'hpg_reserved_visitors',
    'reserved_visitors',
]
def RMSLE(y, pred):
    """Return the root of the mean squared error between ``y`` and ``pred``.

    No logarithm is applied here despite the name; presumably both arguments
    are already on a log scale -- TODO confirm against the data preparation.
    """
    mse = metrics.mean_squared_error(y, pred)
    return mse ** 0.5

def evalerror(preds, dtrain):
    """Custom evaluation function for ``lightgbm.train(feval=...)``.

    Args:
        preds: predictions produced by the booster for ``dtrain``.
        dtrain: dataset object exposing ``get_label()``.

    Returns:
        Tuple ``(eval_name, eval_result, is_higher_better)``.
    """
    labels = dtrain.get_label()
    # The metric is an error, so lower is better: the third element must be
    # False, otherwise early stopping / best-iteration tracking would favour
    # the iteration with the *largest* error.
    return 'rmsle', RMSLE(labels, preds), False

def _load_with_meta(split, fold):
    """Read one l0 split of a fold and append its meta-feature columns.

    Args:
        split: file prefix, one of 'valid', 'holdout' or 'test'.
        fold: fold number, used as the sub-directory name.

    Returns:
        DataFrame holding the columns of ``<InputDir>/<fold>/<split>.csv``
        followed by one column per entry of ``meta_feats``, each read from
        ``<MetaInputDir>/<fold>/<split>_<feat>.csv``.

    Raises:
        ValueError: if a meta-feature file has a different number of rows than
            the base file (a column-wise concat would otherwise align on the
            index and silently introduce NaN rows).
    """
    base_path = '%s/%s/%s.csv' % (InputDir, fold, split)
    frames = [pd.read_csv(base_path, parse_dates= ['visit_date']).reset_index(drop= True)]
    for t in meta_feats:
        meta_path = '%s/%s/%s_%s.csv' % (MetaInputDir, fold, split, t)
        meta = pd.read_csv(meta_path, parse_dates= ['visit_date']).reset_index(drop= True)
        if(len(meta) != len(frames[0])):
            raise ValueError('row count mismatch: %s has %s rows, %s has %s rows'
                             % (base_path, len(frames[0]), meta_path, len(meta)))
        frames.append(meta[[t]])
    # a single concat instead of one concat per meta-feature avoids re-copying
    # the whole frame len(meta_feats) times; the column order is unchanged
    return pd.concat(frames, axis= 1)

start = time.time()
DataBaseDir = '../../data'
InputDir = '%s/l0/kfold' % DataBaseDir
MetaInputDir = '%s/meta/kfold' % DataBaseDir
kfold = 5
meta_ratio = 0.25   # fraction of the candidate columns drawn for each random subset
use_selected = True
TreeNum = 200
#### load data
valid_dfs = []
holdout_dfs = []
test_dfs = []
meta_feats = ['nn_ef', 'knn_2', 'knn_4', 'knn_8', 'knn_16', 'knn_32', 'knn_64', 'knn_128', 'knn_256', 'knn_512', 'knn_1024']
for fold in range(kfold):
    valid = _load_with_meta('valid', fold)
    holdout = _load_with_meta('holdout', fold)
    test = _load_with_meta('test', fold)
    # only the valid parts are tagged: stacked together they form TrainData,
    # which is later split again by this column
    valid['fold'] = fold
    valid_dfs.append(valid)
    holdout_dfs.append(holdout)
    test_dfs.append(test)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True)
end = time.time()
print('Load data done. time elapsed %.2fs' % (end - start))
##### model selection with CV
# score
holdout_score = .0
# parameters
# 'lambda_l2' and 'learning_rate' are one-element lists; the training loop
# reads element [0] of each. NOTE(review): 'max_depth' and 'verbose' are
# defined here but are not copied into the per-fold parameter dict further down.
params = {
    "boosting": "gbdt",
    "objective": "regression_l2",
    "lambda_l2": [200],
    "learning_rate": [0.15],
    
    "num_iterations": 100,
#     "min_data_in_leaf": 150,
    #'num_leaves': 255,
    "max_depth": 6,

    "feature_fraction": 0.6,
    "bagging_fraction": 0.85,
    "bagging_freq": 20,
    "min_hessian": 0.001,
    
    "verbose": False,
    "max_bin": 63,
}
total_cols = []
## feature options
if(use_selected):
    # one feature name per line; the `with` block closes the file on exit
    with open('../../data/gfs/all/good_features_2018-01-30.txt', 'r') as i_file:
        for line in i_file:
            feat = line.rstrip()
            # skip blank lines so that no empty column name ends up in the pool
            if(feat):
                total_cols.append(feat)
    others = ['nn_ef']#, 'latitude_x', 'latitude_y', 'longitude_x', 'longitude_y', 'lon_plus_lat_x', 'var_max_long_x', 'var_max_lat_x']
    total_cols.extend(others)
if(not total_cols):
    # fail early with a clear message instead of training on zero columns later
    raise ValueError('no candidate features: use_selected is False or the feature list file is empty')
# number of columns drawn for every random subset; at least one, so that a
# short feature list cannot yield an empty subset
K = max(1, int(meta_ratio * len(total_cols)))
# For every random feature subset: fit one LightGBM model per fold on the other
# folds' valid data, predict that fold's valid / holdout / test frames and write
# the predictions to <OutputDir>/<fold>/<mod>_<strategy>.csv.
OutputDir = '%s/meta_trees_%s_%s/l1/kfold' % (DataBaseDir, int(meta_ratio * 100), TreeNum)
for idx in range(TreeNum):
    strategy = 'lgb_l2_meta_trees_%s_%s_%s' % (int(meta_ratio * 100), TreeNum, idx)
    # draw K distinct positions, sorted so the subset keeps the order of total_cols
    selected_cols = [total_cols[i] for i in sorted(np.random.choice(range(len(total_cols)), K, replace= False))]
    if(not os.path.exists(OutputDir)):
        os.makedirs(OutputDir)
    # save feature space
    # NOTE(review): the file name does not depend on idx, so every subset
    # overwrites the previous one and only the last feature list survives.
    with open('%s/sub_feats.txt' % OutputDir, 'w') as o_file:
        for feat in selected_cols:
            o_file.write('%s\n' % feat)
    ## retrain and store
    cv_score = .0
    holdout_score = .0
    for fold in range(kfold):
        FoldData = {
            'train': TrainData[TrainData['fold'] != fold],
            # explicit copy: a prediction column is added below, and assigning
            # onto a filtered frame raises pandas' SettingWithCopyWarning
            'valid': TrainData[TrainData['fold'] == fold].copy(),
            # holdout/test are the loaded frames themselves, so the prediction
            # columns added below accumulate on them across subsets
            'holdout': holdout_dfs[fold],
            'test': test_dfs[fold]
        }
        # train
        dtrain = lightgbm.Dataset(FoldData['train'][selected_cols], label= FoldData['train']['visitors'], silent= True, free_raw_data= True)
        dvalid = lightgbm.Dataset(FoldData['valid'][selected_cols], FoldData['valid']['visitors'], reference=dtrain)
        # rebuilt for every fit so each lightgbm.train call gets a fresh dict
        # NOTE(review): the captured output shows a LightGBM warning that a key
        # in `params` is ignored in favour of a Dataset constructor argument;
        # presumably that key is 'max_bin' -- confirm against the installed version.
        param = {
            'boosting': 'gbdt',
            'objective': 'regression_l2',
                
            'lambda_l2': params['lambda_l2'][0],
            'learning_rate': params['learning_rate'][0],
                        
            'num_iterations': params['num_iterations'],
            'feature_fraction': params['feature_fraction'],
            'bagging_fraction': params['bagging_fraction'],
            'bagging_freq': params['bagging_freq'],
            'min_hessian': params['min_hessian'],
            'max_bin': params['max_bin'],
        }
        model = lightgbm.train(param, 
                        train_set= dtrain, 
                        num_boost_round= params['num_iterations'], 
                        valid_sets= dvalid,
                        feval= evalerror,
                        verbose_eval= False,
                        early_stopping_rounds= 100)        
        # for valid
        FoldData['valid'][strategy] = model.predict(FoldData['valid'][selected_cols])
        rmsle_valid = RMSLE(FoldData['valid']['visitors'].values, FoldData['valid'][strategy])
        cv_score += rmsle_valid
        # for holdout
        FoldData['holdout'][strategy] = model.predict(FoldData['holdout'][selected_cols])
        rmsle_holdout = RMSLE(FoldData['holdout']['visitors'].values, FoldData['holdout'][strategy])
        holdout_score += rmsle_holdout
        # for test
        FoldData['test'][strategy] = model.predict(FoldData['test'][selected_cols])

        print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (fold, rmsle_valid, rmsle_holdout, len(FoldData['valid'])))  
        #### output
        FoldOutputDir = '%s/%s' % (OutputDir, fold)
        if(not os.path.exists(FoldOutputDir)):
            os.makedirs(FoldOutputDir)
        # the training part is not written out
        for mod in ('valid', 'holdout', 'test'):
            OutCols = []
            if(mod == 'test'):
                OutCols.append('id')
            OutCols.extend(['air_store_id', 'visit_date', 'visitors', strategy])
            OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
            OutFoldData = FoldData[mod][OutCols]
            OutFoldData.to_csv(OutputFile, index= False)
    
    cv_score /= kfold # mean of the per-fold valid scores
    holdout_score /= kfold # mean of the per-fold holdout scores

    end = time.time()
    print('\n======================')
    print("CV score %.4f, Holdout score %.4f, Elapsed time: %.2fs" % (cv_score, holdout_score, (end - start)))
    print('======================\n')
    
    # NOTE(review): like sub_feats.txt, this file is overwritten by every subset.
    with open('%s/result.txt' % OutputDir, 'w') as o_file:
        o_file.write('cv score %.4f, holdout score %.4f' % (cv_score, holdout_score))
    
    print('------- part %s done. time elapsed %.2fs ----------\n' % (idx, (end - start)))
    # stops after subsets 0..3 although TreeNum may be larger
    # NOTE(review): looks like a debugging cap -- remove to train all TreeNum subsets
    if(idx == 3):
        break

  interactivity=interactivity, compiler=compiler, result=result)


Load data done. time elapsed 65.72s


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fold 0: valid score 0.577717, holdout score 0.575676, valid length 45371
fold 1: valid score 0.571019, holdout score 0.574696, valid length 45371
fold 2: valid score 0.574212, holdout score 0.574440, valid length 45371
fold 3: valid score 0.579512, holdout score 0.576302, valid length 45370
fold 4: valid score 0.576145, holdout score 0.575380, valid length 45370

CV score 0.5757, Holdout score 0.5753, Elapsed time: 86.91s

------- part 0 done. time elapsed 86.91s ----------

fold 0: valid score 0.544778, holdout score 0.538225, valid length 45371
fold 1: valid score 0.535579, holdout score 0.537350, valid length 45371
fold 2: valid score 0.539769, holdout score 0.537341, valid length 45371
fold 3: valid score 0.545230, holdout score 0.537498, valid length 45370
fold 4: valid score 0.538406, holdout score 0.537679, valid length 45370

CV score 0.5408, Holdout score 0.5376, Elapsed time: 104.65s

------- part 1 done. time elapsed 104.65s ----------

fold 0: valid score 0.520617, holdout 