In [12]:
###########################
# Elastic Net Regression #
#########################
import numpy as np
import pandas as pd
import datetime
import time
import os,sys
import gc
from sklearn import *
import lightgbm
from sklearn.preprocessing import OneHotEncoder

drop_cols = ['id', 'visit_date', 'visitors', 'hpg_store_id', 'fold', 'air_store_id', 
             #'air_store_id_encoded', 'hpg_store_id_encoded', 
             'air_reserved_visitors', 
             'hpg_reserved_visitors','reserved_visitors']
    
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

DataBaseDir = '../../data'
InputDir = '%s/l0/kfold' % DataBaseDir
MetaInputDir = '%s/meta/kfold' % DataBaseDir
OutputDir = '%s/l1/kfold' % DataBaseDir
kfold = 5
seed_num = 1
strategy = 'en'
use_selected = False
start_time = datetime.datetime.now()
#### load data
valid_dfs = []
holdout_dfs = []
test_dfs = []
meta_feats = ['nn_ef', 'knn_2', 'knn_4', 'knn_8', 'knn_16', 'knn_32', 'knn_64', 'knn_128', 'knn_256', 'knn_512', 'knn_1024']
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    holdout = pd.read_csv('%s/holdout.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    test = pd.read_csv('%s/test.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    for t in meta_feats:
        # load cb_ef
        FoldOutputDir = '%s/%s' % (MetaInputDir, fold)
        valid_cb_ef = pd.read_csv('%s/valid_%s.csv' % (FoldOutputDir, t), parse_dates= ['visit_date']).reset_index(drop= True)
        holdout_cb_ef = pd.read_csv('%s/holdout_%s.csv' % (FoldOutputDir, t), parse_dates= ['visit_date']).reset_index(drop= True)
        test_cb_ef = pd.read_csv('%s/test_%s.csv' % (FoldOutputDir, t), parse_dates= ['visit_date']).reset_index(drop= True)
        # concate
        valid = pd.concat([valid, valid_cb_ef[[t]]], axis= 1)
        holdout = pd.concat([holdout, holdout_cb_ef[[t]]], axis= 1)
        test = pd.concat([test, test_cb_ef[[t]]], axis= 1)
        #
    valid['fold'] = fold
    valid_dfs.append(valid)
    holdout_dfs.append(holdout)
    test_dfs.append(test)
    print('load data for fold %s done.' % fold)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True) 
##### model selection with CV
# score
cv_score = .0
holdout_score = .0
# predict
y_test_pred = 0

for fold in range(kfold):
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'holdout': holdout_dfs[fold],
        'test': test_dfs[fold]
    }
    ##
#     value_size = len(FoldData['train']['air_store_id_encoded'].unique())
#     enc_cols = ['aid_%s' % i for i in range(value_size)]
#     enc = OneHotEncoder()
#     #
#     n = len(FoldData['train'])
#     tmp = FoldData['train']['air_store_id_encoded'].values.reshape(n, 1)
#     transformed = pd.DataFrame(data= enc.fit_transform(tmp).toarray(), index= range(len(tmp)), columns= enc_cols)
#     FoldData['train'] = pd.concat([FoldData['train'], transformed], axis= 1)
#     #
#     n = len(FoldData['valid'])
#     tmp = FoldData['valid']['air_store_id_encoded'].values.reshape(n, 1)
#     transformed = pd.DataFrame(data= enc.transform(tmp).toarray(), index= range(len(tmp)), columns= enc_cols)
#     FoldData['valid'] = pd.concat([FoldData['valid'], transformed], axis= 1)
#     #
#     n = len(FoldData['holdout'])
#     tmp = FoldData['holdout']['air_store_id_encoded'].values.reshape(n, 1)
#     transformed = pd.DataFrame(data= enc.transform(tmp).toarray(), index= range(len(tmp)), columns= enc_cols)
#     FoldData['holdout'] = pd.concat([FoldData['holdout'], transformed], axis= 1)
#     #
#     n = len(FoldData['test'])
#     tmp = FoldData['test']['air_store_id_encoded'].values.reshape(n, 1)
#     transformed = pd.DataFrame(data= enc.transform(tmp).toarray(), index= range(len(tmp)), columns= enc_cols)
#     FoldData['test'] = pd.concat([FoldData['test'], transformed], axis= 1)
#     FoldData['train'].fillna(0, inplace= True)
#     FoldData['valid'].fillna(0, inplace= True)
#     FoldData['holdout'].fillna(0, inplace= True)
#     FoldData['test'].fillna(0, inplace= True)
    if(use_selected):
        selected_features = []
        with open('../../data/gfs/en_good_features.txt', 'r') as i_file:
            for line in i_file:
                selected_features.append(line.rstrip())
        i_file.close()
        selected_features.extend(meta_feats)
        #selected_features.extend(['air_store_id_encoded', 'hpg_store_id_encoded'])
        col = selected_features
    else:
        col = [c for c in FoldData['train'].columns if c not in drop_cols] 
    print('feature size %s' % len(col))
    # train
    model = linear_model.ElasticNet(alpha= 0.0004, l1_ratio= 0.2, max_iter= 200, tol= 1e-6, selection= 'random', random_state= 2017)
    model.fit(FoldData['train'][col].astype(np.float32, copy=False), FoldData['train']['visitors'].values.astype(np.float32, copy=False))
    # for valid
    FoldData['valid'][strategy] = model.predict(FoldData['valid'][col])
    rmsle_valid = RMSLE(FoldData['valid']['visitors'].values, FoldData['valid'][strategy])
    cv_score += rmsle_valid
    # for holdout
    FoldData['holdout'][strategy] = model.predict(FoldData['holdout'][col])
    rmsle_holdout = RMSLE(FoldData['holdout']['visitors'].values, FoldData['holdout'][strategy])
    holdout_score += rmsle_holdout
    # for test
    FoldData['test'][strategy] = model.predict(FoldData['test'][col])
    y_test_pred += FoldData['test'][strategy]

    print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (fold, rmsle_valid, rmsle_holdout, len(FoldData['valid'])))  
    #### output
    FoldOutputDir = '%s/%s' % (OutputDir, fold)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    for mod in FoldData.keys():
        if(mod == 'train'):
            continue
        OutCols = []
        if(mod == 'test'):
            OutCols.append('id')
        OutCols.extend(['air_store_id', 'visit_date', 'visitors', strategy])
        OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
        OutFoldData = FoldData[mod][OutCols]
        OutFoldData.to_csv(OutputFile, index= False)
    print('saving for %sth fold data done.' % (fold))
    
y_test_pred /= kfold  # Average test set predictions
cv_score /= kfold # Average valid set predictions
holdout_score /= kfold # Average holdout set predictions

# Create submission file
sub = pd.DataFrame()
sub['id'] = test_dfs[0]['id']
sub['visitors'] = np.expm1(y_test_pred)
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l1/submit' % DataBaseDir
if(os.path.exists(SubmitDir) == False):
    os.makedirs(SubmitDir)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.6f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

finish_time = datetime.datetime.now()
elapsed = (finish_time - start_time).seconds
print('\n======================')
print("CV score %.4f, Holdout score %.4f, Elapsed time: %.2fs" % (cv_score, holdout_score, elapsed))
print('======================\n')

  interactivity=interactivity, compiler=compiler, result=result)


load data for fold 0 done.
load data for fold 1 done.
load data for fold 2 done.
load data for fold 3 done.
load data for fold 4 done.
feature size 183


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fold 0: valid score 0.509887, holdout score 0.501145, valid length 45371
saving for 0th fold data done.
feature size 183
fold 1: valid score 0.501824, holdout score 0.500348, valid length 45371
saving for 1th fold data done.
feature size 183
fold 2: valid score 0.503761, holdout score 0.500367, valid length 45371
saving for 2th fold data done.
feature size 183
fold 3: valid score 0.508616, holdout score 0.501662, valid length 45370
saving for 3th fold data done.
feature size 183
fold 4: valid score 0.503353, holdout score 0.501243, valid length 45370
saving for 4th fold data done.
zip ../../data/l1/submit/en_submit_2018-01-25.zip ../../data/l1/submit/en_submit_2018-01-25.csv

CV score 0.5055, Holdout score 0.5010, Elapsed time: 111.00s

