In [None]:
##################################
#Forward Greedy Feature selection#
##################################

import numpy as np
import pandas as pd
import datetime
import time
import os,sys
import gc
from sklearn import *
import lightgbm
from itertools import combinations
import math
from scipy.special import erfinv

# ---- run configuration -------------------------------------------------
DataBaseDir = '../../data'
InputDir = '%s/l0/kfold' % DataBaseDir        # per-fold raw feature files
MetaInputDir = '%s/meta/kfold' % DataBaseDir  # per-fold meta-feature files
kfold = 5
seed_num = 1
attention = 'inter'   # prefix used further down to pick candidate columns
verbose = True
has_snapshot = False  # True -> resume the search from today's snapshot files
datestr = datetime.datetime.now().strftime("%Y-%m-%d")

start_time = datetime.datetime.now()

#### loading stage
# For every fold: read the fold's validation frame, attach one column per
# meta feature (read from its own CSV), tag the rows with the fold id, and
# finally stack all folds into a single frame.
valid_dfs = []
holdout_dfs = []
test_dfs = []
meta_feats = [
    'nn_ef',
    'knn_2', 'knn_4', 'knn_8', 'knn_16', 'knn_32',
    'knn_64', 'knn_128', 'knn_256', 'knn_512', 'knn_1024',
]
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    FoldOutputDir = '%s/%s' % (MetaInputDir, fold)
    # raw features of this fold
    raw_frame = pd.read_csv('%s/valid.csv' % FoldInputDir, parse_dates=['visit_date'])
    pieces = [raw_frame.reset_index(drop=True)]
    # meta features: only the column named after the meta feature is kept
    for meta_name in meta_feats:
        meta_frame = pd.read_csv(
            '%s/valid_%s.csv' % (FoldOutputDir, meta_name),
            parse_dates=['visit_date'],
        )
        pieces.append(meta_frame.reset_index(drop=True)[[meta_name]])
    # column-wise join; every piece carries a fresh positional index
    valid = pd.concat(pieces, axis=1)
    valid['fold'] = fold
    valid_dfs.append(valid)
    print('fold %s done.' % fold)
TrainData = pd.concat(valid_dfs, axis=0, ignore_index=True)
print('==================================')
print('loading data done.')
print('==================================\n')

#### evaluation stage
def RMSLE(y, pred):
    """Return the square root of the mean squared error between y and pred.

    No logarithm is applied inside this function.
    NOTE(review): the name suggests the targets are presumably already on a
    log scale when they arrive here -- confirm against the data preparation.
    """
    mean_sq_err = metrics.mean_squared_error(y, pred)
    return mean_sq_err ** 0.5

def EvaluateFeature(feats):
    """Score a feature subset by cross-validated ElasticNet error.

    For each fold id in ``range(kfold)`` an ElasticNet is fitted on the rows
    of ``TrainData`` whose ``fold`` differs from that id, using the columns
    in ``feats`` as inputs and ``visitors`` as target, and is then scored
    with ``RMSLE`` on the rows of that fold.

    Args:
        feats: list of column names of ``TrainData`` to use as model inputs.

    Returns:
        The mean of the per-fold validation scores (lower is better).
    """
    fold_scores = []
    for held_out in range(kfold):
        is_held_out = TrainData['fold'] == held_out
        fit_part = TrainData[~is_held_out]
        eval_part = TrainData[is_held_out]

        regressor = linear_model.ElasticNet(
            alpha=0.0004,
            l1_ratio=0.2,
            max_iter=200,
            tol=1e-4,
            selection='random',
            random_state=2017,
        )
        # training inputs and targets are cast to float32 before fitting
        fit_x = fit_part[feats].astype(np.float32, copy=False)
        fit_y = fit_part['visitors'].values.astype(np.float32, copy=False)
        regressor.fit(fit_x, fit_y)

        # score on the held-out fold (inputs are passed without a cast)
        eval_pred = regressor.predict(eval_part[feats])
        fold_scores.append(RMSLE(eval_part['visitors'].values, eval_pred))
    return sum(fold_scores) / kfold

# Candidate features are the columns whose name starts with `attention`;
# TrainData is narrowed to those columns plus the fold id and the target.
all_feats = [name for name in TrainData.columns if name.startswith(attention)]
all_cols = all_feats + ['fold', 'visitors']
TrainData = TrainData[all_cols]
print('====================================')
print('total features size %s, sample size %s' % (len(all_feats), TrainData.shape[0]))
print('==================================\n')

def _write_snapshot(feature_path, score_path, features, history):
    """Overwrite the snapshot files with the current selection state.

    Args:
        feature_path: file receiving one selected feature name per line.
        score_path: file receiving one ``score,feature`` line per round.
        features: iterable of selected feature names.
        history: iterable of ``(score, feature)`` tuples, one per round.
    """
    with open(feature_path, 'w') as o_feat, open(score_path, 'w') as o_score:
        for feat in features:
            o_feat.write('%s\n' % feat)
        for score, feat in history:
            o_score.write('%s,%s\n' % (str(score), feat))


# Forward greedy selection: each round evaluates every not-yet-selected
# feature together with the features selected so far, adds the one with the
# lowest score, and stops as soon as a round fails to lower the score.
start = time.time()
score_history = []      # (score, feature) per round, in selection order
good_features = set()
OutputDir = '%s/gfs/%s' % (DataBaseDir, attention)
os.makedirs(OutputDir, exist_ok=True)
feature_file = '%s/good_features_%s.txt' % (OutputDir, datestr)
score_file = '%s/score_history_%s.txt' % (OutputDir, datestr)

if has_snapshot:
    # Resume from the snapshot files written by an earlier run today.
    with open(feature_file, 'r') as o_feat, open(score_file, 'r') as o_score:
        for line in o_feat:
            name = line.rstrip()
            if name:
                good_features.add(name)
        for line in o_score:
            line = line.rstrip()
            if not line:
                continue
            # split once so a comma inside a feature name stays intact
            parts = line.split(',', 1)
            score_history.append((float(parts[0]), parts[1]))
    print('loading good feature snapshot done.')

while len(score_history) < 2 or score_history[-1][0] < score_history[-2][0]:
    # Sort the already-selected features once per round: iterating the set
    # directly yields a hash-dependent column order, which can change the
    # result of the randomised coordinate-descent solver between runs.
    base_features = sorted(good_features)
    scores = [
        (EvaluateFeature(base_features + [feature]), feature)
        for feature in all_feats
        if feature not in good_features
    ]
    if not scores:
        break  # every candidate is already selected
    selected = min(scores)
    current_score, current_feat = selected
    good_features.add(current_feat)
    score_history.append(selected)
    end = time.time()
    if verbose:
        improved_score = .0
        if len(score_history) > 1:
            improved_score = score_history[-2][0] - score_history[-1][0]
        print('====================================')
        print('Current master %s, improve score %.5f, time elapsed %.2fs' % (current_feat, improved_score, (end - start)))
        print('====================================\n')
    # snapshot after every round so an interrupted run can be resumed
    _write_snapshot(feature_file, score_file, good_features, score_history)

# Remove the last added feature if it made the score worse. The length guard
# covers runs that recorded fewer than two rounds; discard() (instead of
# remove()) covers resuming from a finished run, whose feature file no longer
# lists the rejected feature although the score history still does.
if len(score_history) >= 2 and score_history[-1][0] > score_history[-2][0]:
    good_features.discard(score_history[-1][1])
good_features = sorted(good_features)
if verbose:
    print("Selected Features : ", good_features)

_write_snapshot(feature_file, score_file, good_features, score_history)

  interactivity=interactivity, compiler=compiler, result=result)


fold 0 done.
fold 1 done.
fold 2 done.
fold 3 done.
fold 4 done.
loading data done.

total features size 32, sample size 226853

Current master inter_count_air_genre_store_count_air_area_genre_store_multiply, improve score 0.00000, time elapsed 8.33s

Current master inter_count_air_area_genre_store_count_air_area_store_multiply, improve score 0.00290, time elapsed 17.27s

Current master inter_count_air_area_store_count_hpg_area_store_plus, improve score 0.00235, time elapsed 41.64s

Current master inter_count_air_genre_store_count_air_area_genre_store_divide, improve score 0.00029, time elapsed 73.18s

Current master inter_count_air_genre_store_count_air_area_store_divide, improve score 0.00076, time elapsed 113.72s

Current master inter_count_air_area_genre_store_count_hpg_area_genre_store_divide, improve score 0.00030, time elapsed 153.61s

Current master inter_count_air_city_genre_store_count_hpg_city_genre_store_divide, improve score 0.00017, time elapsed 206.37s

Current master in



Current master inter_count_air_city_genre_store_count_air_city_store_multiply, improve score 0.00159, time elapsed 556.55s

