In [2]:
#######################
# Elastic Net Regression #
#######################
import numpy as np
import pandas as pd
import datetime
import time
import os,sys
import gc
from sklearn import *
import lightgbm

def RMSLE(y, pred):
    """Return the root of the mean squared error between `y` and `pred`.

    No logarithm is applied here; the inputs are compared as given.
    NOTE(review): the name suggests the caller passes values that are
    already on a log scale -- confirm against the data preparation.
    """
    mse = metrics.mean_squared_error(y, pred)
    return mse ** 0.5

# Root folders of the two meta-tree prediction sets.
DataBaseDir = '../../data/meta_trees_25_200'
DataBaseDir2 = '../../data/meta_trees_50_200'
_DataRoots = (DataBaseDir, DataBaseDir2)

# Per-fold level-1 inputs and level-2 outputs live below each root.
InputDir, InputDir2 = ('%s/l1/kfold' % root for root in _DataRoots)
OutputDir, OutputDir2 = ('%s/l2/kfold' % root for root in _DataRoots)

# Number of folds and number of random seeds to average over.
kfold = 5
seed_num = 1

# One input column (and one CSV per split) for every meta tree.
TreeNum_25 = 100
TreeNum_50 = 100
InputCols_25 = list(map(lambda i: 'lgb_l2_meta_trees_25_200_%s' % i, range(TreeNum_25)))
InputCols_50 = list(map(lambda i: 'lgb_l2_meta_trees_50_200_%s' % i, range(TreeNum_50)))

# Prediction columns of the individual level-1 models.
SingleModelInputCols = [
    'lgb_l2',
    'xgb_rmse',
    'lgb_huber',
    'lgb_fair',
    'etr',
    'en',
    'rf',
    'gbr_ls',
    'gbr_huber',
    'gbr_lad',
    'lassolars',
    'rgf',
]

# Name of the prediction column / output files written by this script.
strategy = 'en'

# Wall-clock reference for the elapsed-time report.
start_time = datetime.datetime.now()
#### load data
# For every fold, build one wide frame per split (valid / holdout / test).
# The first CSV read for a split becomes the base frame (all of its columns
# are kept); each later CSV contributes only its own prediction column,
# assigned by row index.
valid_dfs = []
holdout_dfs = []
test_dfs = []
join_keys = ['air_store_id', 'visit_date']
start = time.time()
SplitNames = ('valid', 'holdout', 'test')
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    FoldInputDir2 = '%s/%s' % (InputDir2, fold)
    SingleModelDir = '../../data/l1/kfold/%s' % (fold)
    # (directory, column) pairs in the order the columns are appended
    Sources = [(FoldInputDir, name) for name in InputCols_25]
    Sources += [(FoldInputDir2, name) for name in InputCols_50]
    Sources += [(SingleModelDir, name) for name in SingleModelInputCols]
    Merged = {}
    for position, (directory, name) in enumerate(Sources):
        for split in SplitNames:
            frame = pd.read_csv('%s/%s_%s.csv' % (directory, split, name), parse_dates= ['visit_date']).reset_index(drop= True)
            if position == 0:
                Merged[split] = frame
            else:
                Merged[split][name] = frame[name]
    FoldValid, FoldHoldout, FoldTest = (Merged[split] for split in SplitNames)
    # remember which fold each validation row came from
    FoldValid['fold'] = fold
    valid_dfs.append(FoldValid)
    holdout_dfs.append(FoldHoldout)
    test_dfs.append(FoldTest)
    end = time.time()
    print('fold %s done, time elapsed %ss' % (fold, (end - start)))
# stack the per-fold validation frames into a single training table
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True)
# Show the first rows of the training table and of fold 0's holdout / test
# frames as a quick visual check of the loaded data.
print('----------------- CHECK -------------')
for title, frame in (('TrainData', TrainData),
                     ('HoldoutData', holdout_dfs[0]),
                     ('TestData', test_dfs[0])):
    print('\n============================ %s' % title)
    print(frame.head(5))
print('-------------------------------------')
# sys.exit(1)
##### train the level-2 ElasticNet (fixed alpha / l1_ratio) for every seed and fold

# Every column that is not an identifier, the target or the fold marker is a
# feature. That includes the level-1 'en' prediction, whose name equals
# `strategy`. For that reason the level-2 predictions are kept in separate
# arrays and are never written back into TrainData / holdout_dfs / test_dfs:
# writing them back would overwrite a feature that later seeds still read,
# and on the validation slice it triggers pandas' SettingWithCopyWarning.
NonFeatureCols = ['id', 'air_store_id', 'visit_date', 'visitors', 'hpg_store_id', 'fold']
col = [c for c in TrainData.columns if c not in NonFeatureCols]

sub_pred = 0
cv_score_list = []
holdout_score_list = []
for s in range(seed_num):
    # per-seed accumulators: summed fold scores and summed test predictions
    cv_score = .0
    holdout_score = .0
    y_test_pred = 0
    for fold in range(kfold):
        # train on the other folds' rows; evaluate on this fold's rows and on
        # the holdout / test frames loaded for this fold
        FoldData = {
            'train': TrainData[TrainData['fold'] != fold],
            'valid': TrainData[TrainData['fold'] == fold],
            'holdout': holdout_dfs[fold],
            'test': test_dfs[fold]
        }
        # train
        model = linear_model.ElasticNet(alpha= 0.0001,
                                        l1_ratio= 0.8,
                                        max_iter= 400,
                                        tol= 1e-6,
                                        selection= 'random',
                                        random_state= s)
        model.fit(FoldData['train'][col].astype(np.float32, copy=False), FoldData['train']['visitors'].values.astype(np.float32, copy=False))
        # predict every evaluation split from its untouched feature columns
        FoldPred = {mod: model.predict(FoldData[mod][col]) for mod in ('valid', 'holdout', 'test')}
        # for valid
        rmsle_valid = RMSLE(FoldData['valid']['visitors'].values, FoldPred['valid'])
        cv_score += rmsle_valid
        # for holdout
        rmsle_holdout = RMSLE(FoldData['holdout']['visitors'].values, FoldPred['holdout'])
        holdout_score += rmsle_holdout
        # for test: keep a Series on the test frame's index so the fold sum
        # has the same type as before
        y_test_pred += pd.Series(FoldPred['test'], index= FoldData['test'].index)

        print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (fold, rmsle_valid, rmsle_holdout, len(FoldData['valid'])))
        #### output
        FoldOutputDir = '%s/%s' % (OutputDir, fold)
        os.makedirs(FoldOutputDir, exist_ok= True)
        for mod, pred in FoldPred.items():
            # key columns and target first, then the new prediction column;
            # only the test output carries 'id'
            OutCols = ['air_store_id', 'visit_date', 'visitors']
            if(mod == 'test'):
                OutCols.insert(0, 'id')
            OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
            OutFoldData = FoldData[mod][OutCols].copy()
            OutFoldData[strategy] = pred
            OutFoldData.to_csv(OutputFile, index= False)
        print('saving for %sth fold data done.' % (fold))

    y_test_pred /= kfold  # average test predictions over folds
    cv_score /= kfold  # average validation score over folds
    holdout_score /= kfold  # average holdout score over folds

    sub_pred += y_test_pred
    cv_score_list.append(cv_score)
    holdout_score_list.append(holdout_score)

    finish_time = datetime.datetime.now()
    # total_seconds() keeps days and fractions; .seconds would drop both
    elapsed = (finish_time - start_time).total_seconds()
    print('\n======================')
    print("CV score %.4f, Holdout score %.4f, Elapsed time: %.2fs" % (cv_score, holdout_score, elapsed))
    print('======================\n')

# Create submission file
import subprocess  # only this step needs it, to run the external `zip` tool

sub = pd.DataFrame()
sub['id'] = test_dfs[0]['id']
# average the accumulated test predictions over seeds, then apply expm1
sub['visitors'] = np.expm1(sub_pred/seed_num)
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l2/submit' % DataBaseDir
# exist_ok avoids the separate existence check
os.makedirs(SubmitDir, exist_ok=True)
CsvPath = '%s/%s.csv' % (SubmitDir, OutputFileName)
ZipPath = '%s/%s.zip' % (SubmitDir, OutputFileName)
sub.to_csv(CsvPath, float_format='%.6f', index=False)
print('zip %s %s' % (ZipPath, CsvPath))
# Pass the arguments as a list so no shell parses the paths, and report a
# failed or missing `zip` instead of ignoring it. The CSV is already written,
# so a zip failure is reported but does not abort the script.
try:
    ZipStatus = subprocess.call(['zip', ZipPath, CsvPath])
except OSError as err:
    print('zip could not be started: %s' % err)
else:
    if ZipStatus != 0:
        print('zip exited with status %s' % ZipStatus)

fold 0 done, time elapsed 34.72781276702881s
fold 1 done, time elapsed 69.77418804168701s
fold 2 done, time elapsed 103.91167187690735s
fold 3 done, time elapsed 139.214097738266s
fold 4 done, time elapsed 176.7791748046875s
----------------- CHECK -------------

           air_store_id visit_date  visitors  lgb_l2_meta_trees_25_200_0  \
0  air_ba937bf13d40fb24 2016-01-15  3.401197                    3.423392   
1  air_ba937bf13d40fb24 2016-01-29  3.295837                    3.423392   
2  air_ba937bf13d40fb24 2016-01-30  1.945910                    3.148281   
3  air_ba937bf13d40fb24 2016-02-10  3.496508                    3.148281   
4  air_ba937bf13d40fb24 2016-02-13  2.197225                    3.148281   

   lgb_l2_meta_trees_25_200_1  lgb_l2_meta_trees_25_200_2  \
0                    3.335629                    3.215400   
1                    3.335629                    3.215400   
2                    3.063832                    3.233474   
3                    3.335629      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fold 0: valid score 0.498443, holdout score 0.493261, valid length 45371
saving for 0th fold data done.
fold 1: valid score 0.489366, holdout score 0.493243, valid length 45371
saving for 1th fold data done.
fold 2: valid score 0.491854, holdout score 0.492369, valid length 45371
saving for 2th fold data done.
fold 3: valid score 0.495657, holdout score 0.492469, valid length 45370
saving for 3th fold data done.
fold 4: valid score 0.490042, holdout score 0.492204, valid length 45370
saving for 4th fold data done.

CV score 0.4931, Holdout score 0.4927, Elapsed time: 303.00s

zip ../../data/meta_trees_25_200/l2/submit/en_submit_2018-01-18.zip ../../data/meta_trees_25_200/l2/submit/en_submit_2018-01-18.csv


0