In [12]:
import json
import pandas as pd
import dill as pickle

class DataUtil2:
    """Helper for reading and writing data in several on-disk formats.

    Both methods dispatch on a ``format`` string; the recognised values are
    ``'csv'``, ``'json'``, ``'pkl'`` and ``'hdf'``.  Any other value is
    silently ignored (see the individual methods for what that means).
    """

    @classmethod
    def load(cls, file, format, date_cols= None):
        """Read ``file`` and return its contents.

        Parameters
        ----------
        file : path of the file to read.
        format : one of ``'csv'``, ``'json'``, ``'pkl'``, ``'hdf'``.
        date_cols : forwarded to ``pandas.read_csv`` as ``parse_dates``;
            only used when ``format == 'csv'``.

        Returns
        -------
        The object read from disk: whatever ``pandas.read_csv`` /
        ``pandas.read_hdf`` return for ``'csv'`` / ``'hdf'``, the decoded
        JSON value for ``'json'``, and the unpickled object for ``'pkl'``.
        For an unrecognised ``format`` the empty string ``''`` is returned.
        """
        # Fallback result for an unrecognised format (kept for compatibility).
        data = ''
        if format == 'csv':
            data = pd.read_csv(file, parse_dates= date_cols)
        elif format == 'json':
            with open(file, 'r') as i_file:
                # Decode from the open handle, not from the path string.
                data = json.load(i_file)
        elif format == 'pkl':
            # NOTE(review): unpickling can execute arbitrary code -- only use
            # this branch on files from a trusted source.
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
        elif format == 'hdf':
            # Must use the same key that save() writes under.
            data = pd.read_hdf(path_or_buf= file, key='undefined')

        return data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """Write ``data`` to ``file`` in the requested ``format``.

        Parameters
        ----------
        data : object to persist.  The ``'csv'`` and ``'hdf'`` branches call
            ``data.to_csv`` / ``data.to_hdf``; the ``'json'`` branch needs a
            JSON-serialisable value; the ``'pkl'`` branch needs a picklable
            one.
        file : destination path; an existing file is overwritten.
        format : one of ``'csv'``, ``'json'``, ``'pkl'``, ``'hdf'``.  Any other
            value makes this call a no-op.
        precision : number of decimal places used for floats in CSV output;
            ignored by the other formats.

        Returns
        -------
        None
        """
        if format == 'csv':
            # Builds e.g. '%.8f'; the row index is not written.
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif format == 'json':
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
        elif format == 'pkl':
            with open(file, 'wb') as o_file:
                # Protocol -1 selects the highest protocol available.
                pickle.dump(data, o_file, -1)
        elif format == 'hdf':
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')

        return

In [13]:
import numpy as np
import pandas as pd
import datetime
import time
import os,sys
import gc
from sklearn import *
import lightgbm
from itertools import combinations
import math
from scipy.special import erfinv

# Columns that are filtered out when the candidate feature list is built.
drop_cols = [
    'id', 'visit_date', 'visitors', 'hpg_store_id', 'fold', 'air_store_id',
    'air_reserved_visitors', 'hpg_reserved_visitors', 'reserved_visitors',
]

# Directory layout, all relative to the data root.
DataBaseDir = '../../data'
InputDir = '{}/l0/kfold'.format(DataBaseDir)
MetaInputDir = '{}/meta/kfold'.format(DataBaseDir)
OutputDir = '{}/l1/kfold'.format(DataBaseDir)

kfold = 5
seed_num = 1
strategy = 'lgb_l2'
start_time = datetime.datetime.now()

# Names of the meta-feature files; each file contributes one column of the same name.
meta_feats = [
    'nn_ef',
    'knn_2', 'knn_4', 'knn_8', 'knn_16', 'knn_32',
    'knn_64', 'knn_128', 'knn_256', 'knn_512', 'knn_1024',
]

# Per-fold frames, collected in fold order.
valid_dfs = []
holdout_dfs = []
test_dfs = []
for fold in range(kfold):
    FoldInputDir = '{}/{}'.format(InputDir, fold)
    # Directory holding this fold's meta-feature files.
    FoldOutputDir = '{}/{}'.format(MetaInputDir, fold)

    valid = pd.read_csv(FoldInputDir + '/valid.csv', parse_dates= ['visit_date'])
    valid = valid.reset_index(drop= True)
    holdout = pd.read_csv(FoldInputDir + '/holdout.csv', parse_dates= ['visit_date'])
    holdout = holdout.reset_index(drop= True)
    test = pd.read_csv(FoldInputDir + '/test.csv', parse_dates= ['visit_date'])
    test = test.reset_index(drop= True)

    for t in meta_feats:
        # Read the three meta-feature files for feature ``t``.
        valid_cb_ef = pd.read_csv(
            '{}/valid_{}.csv'.format(FoldOutputDir, t), parse_dates= ['visit_date']
        ).reset_index(drop= True)
        holdout_cb_ef = pd.read_csv(
            '{}/holdout_{}.csv'.format(FoldOutputDir, t), parse_dates= ['visit_date']
        ).reset_index(drop= True)
        test_cb_ef = pd.read_csv(
            '{}/test_{}.csv'.format(FoldOutputDir, t), parse_dates= ['visit_date']
        ).reset_index(drop= True)
        # Append column ``t`` side by side; rows are matched on the freshly
        # reset 0..n-1 index, i.e. by position.
        valid = pd.concat([valid, valid_cb_ef[[t]]], axis= 1)
        holdout = pd.concat([holdout, holdout_cb_ef[[t]]], axis= 1)
        test = pd.concat([test, test_cb_ef[[t]]], axis= 1)

    # Only the validation split is labelled with its fold number.
    valid['fold'] = fold
    valid_dfs.append(valid)
    holdout_dfs.append(holdout)
    test_dfs.append(test)

# Stack the validation splits of all folds into one frame.
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True)

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Number of folds iterated by EvaluateFeature; repeats the value already
# assigned in the data-loading cell.
kfold = 5
# When True, the feature-selection loop prints a progress banner after each
# round and the final feature list at the end.
verbose = True

def RMSLE(y, pred):
    """Return the square root of the mean squared error between ``y`` and ``pred``.

    No log transform is applied here, so the value is an RMSLE only when both
    inputs are already on a log scale -- NOTE(review): confirm against how
    ``visitors`` was prepared upstream.
    """
    mse = metrics.mean_squared_error(y, pred)
    return mse ** 0.5

def EvaluateFeature(feats):
    """Cross-validate an ElasticNet restricted to the columns in ``feats``.

    For each fold number in ``range(kfold)`` the rows of the global
    ``TrainData`` whose ``fold`` differs are used for fitting and the rows
    whose ``fold`` matches are scored with ``RMSLE`` against ``visitors``.

    Parameters
    ----------
    feats : list of ``TrainData`` column names used as model inputs.

    Returns
    -------
    The per-fold scores averaged over ``kfold``.
    """
    fold_scores = []
    for fold in range(kfold):
        train_part = TrainData[TrainData['fold'] != fold]
        valid_part = TrainData[TrainData['fold'] == fold]

        model = linear_model.ElasticNet(
            alpha= 0.0004,
            l1_ratio= 0.2,
            max_iter= 350,
            tol= 1e-4,
            selection= 'random',
            random_state= 2017,
        )
        # Training inputs and target are cast to float32; the validation
        # inputs below are passed through as they are.
        train_x = train_part[feats].astype(np.float32, copy=False)
        train_y = train_part['visitors'].values.astype(np.float32, copy=False)
        model.fit(train_x, train_y)

        pred = model.predict(valid_part[feats])
        fold_scores.append(RMSLE(valid_part['visitors'].values, pred))

    # Accumulate from 0.0 in fold order, then average.
    return sum(fold_scores, .0) / kfold

# Greedy forward feature selection.
#
# Each round scores every not-yet-selected candidate together with the
# features chosen so far and keeps the candidate with the lowest score.  The
# search stops when a round fails to lower the score, or when no candidates
# are left.

# Candidate inputs: every TrainData column that is not listed in drop_cols.
all_feats = [c for c in TrainData.columns if c not in drop_cols]
# Alternative candidate pool: all_feats = meta_feats

start = time.time()
# One (score, feature) tuple per completed round, in selection order.
score_history = []
good_features = set([])
while len(score_history) < 2 or score_history[-1][0] < score_history[-2][0]:
    scores = []
    for feature in all_feats:
        if feature not in good_features:
            # sorted() pins the column order.  Iteration order of a set of
            # strings may differ between interpreter runs, and the model is
            # fitted with randomised coordinate selection, so an unordered
            # column list can make scores differ from run to run.
            selected_features = sorted(good_features) + [feature]
            score = EvaluateFeature(selected_features)
            scores.append((score, feature))
    if not scores:
        # Every candidate is already selected; nothing left to try.
        break
    # Lowest score wins; ties fall back to the feature name.
    selected = min(scores)
    current_feat = selected[1]
    current_score = selected[0]
    good_features.add(current_feat)
    score_history.append(selected)
    end = time.time()
    if verbose:
        # Change relative to the previous round; negative means improvement.
        improved_score = .0
        if(len(score_history) > 1):
            improved_score = score_history[-1][0] - score_history[-2][0]
        print('====================================')
        print('Current master %s, improve score %.5f, time elapsed %.2fs' % (current_feat, improved_score, (end - start)))
        print('====================================\n')

# Drop the last added feature only when it is the one that ended the search
# by failing to lower the score.  If the loop stopped because the candidates
# ran out, the last feature did help and is kept.
if len(score_history) > 1 and not (score_history[-1][0] < score_history[-2][0]):
    good_features.remove(score_history[-1][1])
good_features = sorted(list(good_features))
if verbose:
    print("Selected Features : ", good_features)

Current master nn_ef, improve score 0.00000, time elapsed 231.91s

Current master air_store_id_dow_is_up_corner_visitors_mean, improve score -0.00655, time elapsed 467.23s



KeyboardInterrupt: 