In [13]:
#######################
# LigthGBM Regression #
#######################
import numpy as np
import pandas as pd
import datetime
import time
import os,sys
import gc
from sklearn import *
import lightgbm
from itertools import combinations
import math
from scipy.special import erfinv
from sklearn.preprocessing import OneHotEncoder

drop_cols = ['id', 'visit_date', 'visitors', 'hpg_store_id', 'fold', 'air_store_id', 
             'air_store_id_encoded', 'hpg_store_id_encoded', 
             'air_reserved_visitors', 'hpg_reserved_visitors','reserved_visitors']

cate_cols = ['store_id_encoded', 'area_name', 'city', 'genre_name']
cate_feats = ['dow', 'hol_days', 'is_weekends', 'holiday_flg', 'month', 'is_up_corner', 'prev_is_holiday', 'next_is_holiday']
for mod in ['air', 'hpg']:
    cate_feats.extend(['%s_%s' % (mod, c) for c in cate_cols])
## count related
count_cols = ['area_genre_store_count', 'area_store_count', 'city_genre_store_count', 'city_store_count', 'genre_store_count']
num_feats = []
for mod in ['air', 'hpg']:
    num_feats.extend(['%s_%s' % (mod, c) for c in count_cols])

def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

DataBaseDir = '../../data'
InputDir = '%s/l0/kfold' % DataBaseDir
MetaInputDir = '%s/meta/kfold' % DataBaseDir
OutputDir = '%s/l1/kfold' % DataBaseDir
kfold = 5
use_selected = True
seed_num = 1
strategy = 'lgb_l2'
start_time = datetime.datetime.now()
#### load data
valid_dfs = []
holdout_dfs = []
test_dfs = []
meta_feats = ['nn_ef', 'knn_2', 'knn_4', 'knn_8', 'knn_16', 'knn_32', 'knn_64', 'knn_128', 'knn_256', 'knn_512', 'knn_1024']
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    holdout = pd.read_csv('%s/holdout.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    test = pd.read_csv('%s/test.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    
    for t in meta_feats:
        # load cb_ef
        FoldOutputDir = '%s/%s' % (MetaInputDir, fold)
        valid_cb_ef = pd.read_csv('%s/valid_%s.csv' % (FoldOutputDir, t), parse_dates= ['visit_date']).reset_index(drop= True)
        holdout_cb_ef = pd.read_csv('%s/holdout_%s.csv' % (FoldOutputDir, t), parse_dates= ['visit_date']).reset_index(drop= True)
        test_cb_ef = pd.read_csv('%s/test_%s.csv' % (FoldOutputDir, t), parse_dates= ['visit_date']).reset_index(drop= True)
        # concate
        valid = pd.concat([valid, valid_cb_ef[[t]]], axis= 1)
        holdout = pd.concat([holdout, holdout_cb_ef[[t]]], axis= 1)
        test = pd.concat([test, test_cb_ef[[t]]], axis= 1)
    #
    valid['fold'] = fold
    valid_dfs.append(valid)
    holdout_dfs.append(holdout)
    test_dfs.append(test)
    print('fold %s done.' % fold)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True)
print('load data done.')
##### model selection with CV
# parameters
params = {
    "boosting": "gbdt",
    "objective": "regression_l2",
    "lambda_l2": 200,  #!!!
#     "objective": "fair",
#     "fair_c": 20.0,  
#     "objective": "huber",
#     "alpha": 2,
    
    "num_iterations": 100,
    "learning_rate": 0.15,  # !!!
    #"min_data_in_leaf": 200,
    "max_depth": 6,  #!!!
#     'num_leaves': 20,

    "feature_fraction": 0.6,
    "bagging_fraction": 0.85,
    "bagging_freq": 20,
    "min_hessian": 0.001,

    "max_bin": 63,
}

sub_pred= 0
cv_score_list = []
holdout_score_list = []
for s in range(seed_num):
    # score
    cv_score = .0
    holdout_score = .0
    # predict
    y_test_pred = 0
    params['feature_fraction_seed'] = s
    params['bagging_seed'] = s
    for fold in range(kfold):
        FoldData = {
            'train': TrainData[TrainData['fold'] != fold],
            'valid': TrainData[TrainData['fold'] == fold],
            'holdout': holdout_dfs[fold],
            'test': test_dfs[fold]
        }
#         ## one-hot encoding for id features
#         value_size = len(FoldData['train']['air_store_id_encoded'].unique())
#         enc_cols = ['aid_%s' % i for i in range(value_size)]
#         enc = OneHotEncoder()
#         #
#         n = len(FoldData['train'])
#         tmp = FoldData['train']['air_store_id_encoded'].values.reshape(n, 1)
#         transformed = pd.DataFrame(data= enc.fit_transform(tmp).toarray(), index= range(len(tmp)), columns= enc_cols)
#         FoldData['train'] = pd.concat([FoldData['train'], transformed], axis= 1)
#         #
#         n = len(FoldData['valid'])
#         tmp = FoldData['valid']['air_store_id_encoded'].values.reshape(n, 1)
#         transformed = pd.DataFrame(data= enc.transform(tmp).toarray(), index= range(len(tmp)), columns= enc_cols)
#         FoldData['valid'] = pd.concat([FoldData['valid'], transformed], axis= 1)
#         #
#         n = len(FoldData['holdout'])
#         tmp = FoldData['holdout']['air_store_id_encoded'].values.reshape(n, 1)
#         transformed = pd.DataFrame(data= enc.transform(tmp).toarray(), index= range(len(tmp)), columns= enc_cols)
#         FoldData['holdout'] = pd.concat([FoldData['holdout'], transformed], axis= 1)
#         #
#         n = len(FoldData['test'])
#         tmp = FoldData['test']['air_store_id_encoded'].values.reshape(n, 1)
#         transformed = pd.DataFrame(data= enc.transform(tmp).toarray(), index= range(len(tmp)), columns= enc_cols)
#         FoldData['test'] = pd.concat([FoldData['test'], transformed], axis= 1)
        
#         FoldData['train'].fillna(0, inplace= True)
#         FoldData['valid'].fillna(0, inplace= True)
#         FoldData['holdout'].fillna(0, inplace= True)
#         FoldData['test'].fillna(0, inplace= True)
        ## feature options
        if(use_selected):
            selected_features = []
            with open('../../data/gfs/all/good_features_2018-01-30.txt', 'r') as i_file:
                for line in i_file:
                    selected_features.append(line.rstrip())
            i_file.close()
            others = ['nn_ef']#, 'latitude_x', 'latitude_y', 'longitude_x', 'longitude_y', 'lon_plus_lat_x', 'var_max_long_x', 'var_max_lat_x']
            selected_features.extend(others)
#             selected_features.extend(enc_cols)
            #selected_features.extend(['air_store_id_encoded', 'hpg_store_id_encoded'])
            col = selected_features
        else:
            col = [c for c in FoldData['train'].columns if c not in drop_cols]
            col = [c for c in col if((c.startswith('count') == False) and 
                                     (c.startswith('inter') == False) and 
                                     (c.startswith('lon') == False) and 
                                     (c.startswith('lat') == False) and 
                                     (c.startswith('var') == False))]
#             col = [c for c in col if((c.startswith('count') == False) and (c.startswith('inter') == False))]
        print('feature size %s' % len(col))
        # train
        d_cv = lightgbm.Dataset(FoldData['train'][col], 
                                label= FoldData['train']['visitors'].values, 
                                max_bin= params['max_bin'], 
                                silent= True, 
                                free_raw_data= True)
        model = lightgbm.train(params, d_cv)
        # for valid
        FoldData['valid'][strategy] = model.predict(FoldData['valid'][col])
        rmsle_valid = RMSLE(FoldData['valid']['visitors'].values, FoldData['valid'][strategy])
        cv_score += rmsle_valid
        # for holdout
        FoldData['holdout'][strategy] = model.predict(FoldData['holdout'][col])
        rmsle_holdout = RMSLE(FoldData['holdout']['visitors'].values, FoldData['holdout'][strategy])
        holdout_score += rmsle_holdout
        # for test
        FoldData['test'][strategy] = model.predict(FoldData['test'][col])
        y_test_pred += FoldData['test'][strategy]

        print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (fold, rmsle_valid, rmsle_holdout, len(FoldData['valid'])))  
        #### output
        FoldOutputDir = '%s/%s' % (OutputDir, fold)
        if(os.path.exists(FoldOutputDir) == False):
            os.makedirs(FoldOutputDir)
        for mod in FoldData.keys():
            if(mod == 'train'):
                continue
            OutCols = []
            if(mod == 'test'):
                OutCols.append('id')
            OutCols.extend(['air_store_id', 'visit_date', 'visitors', strategy])
            OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
            OutFoldData = FoldData[mod][OutCols]
            OutFoldData.to_csv(OutputFile, index= False)
#         print('saving for %sth fold data done.' % (fold))
    
    y_test_pred /= kfold  # Average test set predictions
    cv_score /= kfold # Average valid set predictions
    holdout_score /= kfold # Average holdout set predictions
    
    sub_pred += y_test_pred
    cv_score_list.append(cv_score)
    holdout_score_list.append(holdout_score)
    
    finish_time = datetime.datetime.now()
    elapsed = (finish_time - start_time).seconds
    print('\n======================')
    print("CV score %.4f, Holdout score %.4f, Elapsed time: %.2fs" % (cv_score, holdout_score, elapsed))
    print('======================\n')

# Create submission file
print('mean cv score %.4f, mean holdout score %.4f' % (np.mean(cv_score_list), np.mean(holdout_score_list)))
sub = pd.DataFrame()
sub['id'] = test_dfs[0]['id']
sub['visitors'] = np.expm1(sub_pred/seed_num)
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l1/submit' % DataBaseDir
if(os.path.exists(SubmitDir) == False):
    os.makedirs(SubmitDir)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.6f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

  interactivity=interactivity, compiler=compiler, result=result)


fold 0 done.
fold 1 done.
fold 2 done.
fold 3 done.
fold 4 done.
load data done.
feature size 51


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fold 0: valid score 0.506653, holdout score 0.496791, valid length 45371
feature size 51
fold 1: valid score 0.496069, holdout score 0.497400, valid length 45371
feature size 51
fold 2: valid score 0.501735, holdout score 0.497600, valid length 45371
feature size 51
fold 3: valid score 0.506074, holdout score 0.497309, valid length 45370
feature size 51
fold 4: valid score 0.499315, holdout score 0.497250, valid length 45370

CV score 0.5020, Holdout score 0.4973, Elapsed time: 85.00s

mean cv score 0.5020, mean holdout score 0.4973
zip ../../data/l1/submit/lgb_l2_submit_2018-01-31.zip ../../data/l1/submit/lgb_l2_submit_2018-01-31.csv


0