In [10]:
#######################
# Elastic Net Regression #
#######################
import numpy as np
import pandas as pd
import datetime
import time
import os,sys
import gc
from sklearn import *
import lightgbm

def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

DataBaseDir = '../../data/MixedModels'
InputDir = '%s/l1/kfold' % DataBaseDir
OutputDir = '%s/l2/kfold' % DataBaseDir
kfold = 5
TreeNum = 100
TopTreePair = 200
InputCols2 = ['lgb_l2', 'lgb_huber', 'lgb_fair', 'etr', 'en', 'knn', 'rf', 'xgb_rmse', 'lassolars', 'gbr_ls', 'gbr_huber', 'gbr_lad', 'rgf', 'rgf_sib']
strategy = 'en'
start_time = datetime.datetime.now()
#### load pair indexs
PairIndex = pd.read_csv('%s/l1/pairs/top%s_holdout.csv' % (DataBaseDir, TopTreePair))
PairIndex.columns = ['a', 'b']
PairNum = len(PairIndex)
#### load data
valid_dfs = []
holdout_dfs = []
test_dfs = []
join_keys = ['air_store_id', 'visit_date']
start = time.time()
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    FoldValid = pd.DataFrame()
    FoldHoldout = pd.DataFrame()
    FoldTest = pd.DataFrame()
    for i in PairIndex.index:
        aidx, bidx = PairIndex.loc[i, 'a'], PairIndex.loc[i, 'b']
        valid_a = pd.read_csv('%s/valid_%s.csv' % (FoldInputDir, 'lgb_l2_%s' % aidx), parse_dates= ['visit_date']).reset_index(drop= True)
        valid_b = pd.read_csv('%s/valid_%s.csv' % (FoldInputDir, 'lgb_l2_%s' % bidx), parse_dates= ['visit_date']).reset_index(drop= True)
        holdout_a = pd.read_csv('%s/holdout_%s.csv' % (FoldInputDir, 'lgb_l2_%s' % aidx), parse_dates= ['visit_date']).reset_index(drop= True)
        holdout_b = pd.read_csv('%s/holdout_%s.csv' % (FoldInputDir, 'lgb_l2_%s' % bidx), parse_dates= ['visit_date']).reset_index(drop= True)
        test_a = pd.read_csv('%s/test_%s.csv' % (FoldInputDir, 'lgb_l2_%s' % aidx), parse_dates= ['visit_date']).reset_index(drop= True)
        test_b = pd.read_csv('%s/test_%s.csv' % (FoldInputDir, 'lgb_l2_%s' % bidx), parse_dates= ['visit_date']).reset_index(drop= True)
        if(i == 0):
            FoldValid = valid_a.drop(['lgb_l2_%s' % aidx], axis= 1)
            FoldHoldout = holdout_a.drop(['lgb_l2_%s' % aidx], axis= 1)
            FoldTest = test_a.drop(['lgb_l2_%s' % aidx], axis= 1)
        FoldValid['pair_%s_%s' % (aidx, bidx)] = (valid_a['lgb_l2_%s' % aidx] + valid_b['lgb_l2_%s' % bidx])/2
        FoldHoldout['pair_%s_%s' % (aidx, bidx)] = (holdout_a['lgb_l2_%s' % aidx] + holdout_b['lgb_l2_%s' % bidx])/2
        FoldTest['pair_%s_%s' % (aidx, bidx)] = (test_a['lgb_l2_%s' % aidx] + test_b['lgb_l2_%s' % bidx])/2
    FoldInputDir = '../../data/l1/kfold/%s' % (fold)
    for i in range(len(InputCols2)):
        valid = pd.read_csv('%s/valid_%s.csv' % (FoldInputDir, InputCols2[i]), parse_dates= ['visit_date']).reset_index(drop= True)
        holdout = pd.read_csv('%s/holdout_%s.csv' % (FoldInputDir, InputCols2[i]), parse_dates= ['visit_date']).reset_index(drop= True)
        test = pd.read_csv('%s/test_%s.csv' % (FoldInputDir, InputCols2[i]), parse_dates= ['visit_date']).reset_index(drop= True)
        FoldValid[InputCols2[i]] = valid[InputCols2[i]]
        FoldHoldout[InputCols2[i]] = holdout[InputCols2[i]]
        FoldTest[InputCols2[i]] = test[InputCols2[i]]
    ###
    FoldValid['fold'] = fold
    valid_dfs.append(FoldValid)
    holdout_dfs.append(FoldHoldout)
    test_dfs.append(FoldTest)
    end = time.time()
    print('fold %s done, time elapsed %ss' % (fold, (end - start)))
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True)
print('----------------- CHECK -------------')
print('\n============================ TrainData')
print(TrainData.head(5))
print('\n============================ HoldoutData')
print(holdout_dfs[0].head(5))
print('\n============================ TestData')
print(test_dfs[0].head(5))
print('-------------------------------------')
# sys.exit(1)
##### model selection with CV
# score
cv_score = .0
holdout_score = .0
# predict
y_test_pred = 0
for fold in range(kfold):
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'holdout': holdout_dfs[fold],
        'test': test_dfs[fold]
    }
    col = [c for c in FoldData['train'].columns if c not in ['id', 'air_store_id', 'visit_date', 'visitors', 'hpg_store_id', 'fold']]
    # train
    model = linear_model.ElasticNet(alpha= 0.0001, l1_ratio= 0.2, max_iter= 400, tol= 1e-4, selection= 'random', random_state= 2017)
    model.fit(FoldData['train'][col].astype(np.float32, copy=False), FoldData['train']['visitors'].values.astype(np.float32, copy=False))
    # for valid
    FoldData['valid'][strategy] = model.predict(FoldData['valid'][col])
    rmsle_valid = RMSLE(FoldData['valid']['visitors'].values, FoldData['valid'][strategy])
    cv_score += rmsle_valid
    # for holdout
    FoldData['holdout'][strategy] = model.predict(FoldData['holdout'][col])
    rmsle_holdout = RMSLE(FoldData['holdout']['visitors'].values, FoldData['holdout'][strategy])
    holdout_score += rmsle_holdout
    # for test
    FoldData['test'][strategy] = model.predict(FoldData['test'][col])
    y_test_pred += FoldData['test'][strategy]

    print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (fold, rmsle_valid, rmsle_holdout, len(FoldData['valid'])))  
    #### output
    FoldOutputDir = '%s/%s' % (OutputDir, fold)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    for mod in FoldData.keys():
        if(mod == 'train'):
            continue
        OutCols = []
        if(mod == 'test'):
            OutCols.append('id')
        OutCols.extend(['air_store_id', 'visit_date', 'visitors', strategy])
        OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
        OutFoldData = FoldData[mod][OutCols]
        OutFoldData.to_csv(OutputFile, index= False)
    print('saving for %sth fold data done.' % (fold))
    
y_test_pred /= kfold  # Average test set predictions
cv_score /= kfold # Average valid set predictions
holdout_score /= kfold # Average holdout set predictions

# Create submission file
sub = pd.DataFrame()
sub['id'] = test_dfs[0]['id']
sub['visitors'] = np.expm1(y_test_pred)
#OutputStrategy = 'top_%s_tree_pairs_within_%s_trees' % (PairNum, TreeNum)
OutputStrategy = 'top_%s_tree_pairs_lined_with_all_single_models' % (PairNum)
OutputFileName = '%s_%s_submit_%s' % (strategy, OutputStrategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l2/submit' % DataBaseDir
if(os.path.exists(SubmitDir) == False):
    os.makedirs(SubmitDir)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.6f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

finish_time = datetime.datetime.now()
elapsed = (finish_time - start_time).seconds
print('\n======================')
print("CV score %.4f, Holdout score %.4f, Elapsed time: %.2fs" % (cv_score, holdout_score, elapsed))
print('======================\n')

fold 0 done, time elapsed 64.54142189025879s
fold 1 done, time elapsed 127.7113687992096s
fold 2 done, time elapsed 190.09888982772827s
fold 3 done, time elapsed 256.9738419055939s
fold 4 done, time elapsed 324.7881238460541s
----------------- CHECK -------------

           air_store_id visit_date  visitors  pair_24_48  pair_24_77  \
0  air_ba937bf13d40fb24 2016-01-15  3.401197    3.404708    3.355625   
1  air_ba937bf13d40fb24 2016-01-29  3.295837    3.498439    3.421176   
2  air_ba937bf13d40fb24 2016-01-30  1.945910    3.367031    3.294966   
3  air_ba937bf13d40fb24 2016-02-10  3.496508    3.285590    3.266842   
4  air_ba937bf13d40fb24 2016-02-13  2.197225    3.343023    3.307239   

   pair_48_59  pair_24_83  pair_59_77  pair_24_71  pair_71_77  ...        knn  \
0    3.401790    3.442946    3.352706    3.421346    3.401039  ...   2.874370   
1    3.481786    3.536892    3.404523    3.454114    3.409780  ...   2.986891   
2    3.320319    3.408763    3.248254    3.363968    3.3334

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fold 0: valid score 0.490528, holdout score 0.482246, valid length 45371
saving for 0th fold data done.
fold 1: valid score 0.481273, holdout score 0.482436, valid length 45371
saving for 1th fold data done.
fold 2: valid score 0.483521, holdout score 0.481571, valid length 45371
saving for 2th fold data done.
fold 3: valid score 0.487322, holdout score 0.481427, valid length 45370
saving for 3th fold data done.
fold 4: valid score 0.482613, holdout score 0.481596, valid length 45370
saving for 4th fold data done.
zip ../../data/MixedModels/l2/submit/en_top_199_tree_pairs_lined_with_all_single_models_submit_2018-01-10.zip ../../data/MixedModels/l2/submit/en_top_199_tree_pairs_lined_with_all_single_models_submit_2018-01-10.csv

CV score 0.4851, Holdout score 0.4819, Elapsed time: 472.00s

