In [3]:
import json
import pandas as pd
import dill as pickle

class DataUtil2:
    """"""
    @classmethod
    def load(cls, file, format, date_cols= None):
        """"""
        data = ''
        if(format== 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format== 'json'):
            with open(file, 'r') as i_file:
                data = json.load(file)
            i_file.close()
        elif(format== 'pkl'):
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
            i_file.close()
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key='undefined')

        return  data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """"""
        if(format == 'csv'):
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
            o_file.close()
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                pickle.dump(data, o_file, -1)
            o_file.close()
        elif(format== 'hdf'):
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')

        return

In [4]:
from catboost import CatBoostRegressor
import pandas as pd
import numpy as np
import os,sys,time,datetime
from sklearn import *

def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

drop_cols = ['id', 'visit_date', 'visitors', 'hpg_store_id', 'fold', 'air_store_id']

#cate_cols = ['store_id_encoded', 'area_name', 'city', 'genre_name']
#cate_feats = ['dow', 'hol_days', 'is_weekends', 'holiday_flg', 'month', 'is_up_corner']
#for mod in ['air', 'hpg']:
#    cate_feats.extend(['%s_%s' % (mod, c) for c in cate_cols])

DataBaseDir = '../../data'
InputDir = '%s/l0/kfold' % DataBaseDir
OutputDir = '%s/l1/kfold' % DataBaseDir
kfold = 5
strategy = 'cb_ef'
start_time = datetime.datetime.now()
#### load data
valid_dfs = []
holdout_dfs = []
test_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    holdout = pd.read_csv('%s/holdout.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    test = pd.read_csv('%s/test.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    valid['fold'] = fold
    valid_dfs.append(valid)
    holdout_dfs.append(holdout)
    test_dfs.append(test)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True) 

##### model selection with CV
# score
cv_score = .0
holdout_score = .0
# predict
y_test_pred = 0
# parameters

start = time.time()
for fold in range(kfold):
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'holdout': holdout_dfs[fold],
        'test': test_dfs[fold]
    }
    col = [c for c in FoldData['train'].columns if c not in drop_cols]
    # train
    model = CatBoostRegressor(learning_rate= 0.4, 
                              iterations= 200,
                              logging_level= 'Silent',
                              random_seed= 2017,
                              #bagging_temperature= 0.9,
                              depth= 12, 
                              loss_function='RMSE')
    model.fit(FoldData['train'][col].values, FoldData['train']['visitors'].values)
    # for valid
    FoldData['valid'][strategy] = model.predict(FoldData['valid'][col].values)
    rmsle_valid = RMSLE(FoldData['valid']['visitors'].values, FoldData['valid'][strategy])
    cv_score += rmsle_valid
    # for holdout
    FoldData['holdout'][strategy] = model.predict(FoldData['holdout'][col].values)
    rmsle_holdout = RMSLE(FoldData['holdout']['visitors'].values, FoldData['holdout'][strategy])
    holdout_score += rmsle_holdout
    # for test
    FoldData['test'][strategy] = model.predict(FoldData['test'][col].values)
    y_test_pred += FoldData['test'][strategy]

    print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (fold, rmsle_valid, rmsle_holdout, len(FoldData['valid'])))  
    #### output
    FoldOutputDir = '%s/%s' % (OutputDir, fold)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    for mod in FoldData.keys():
        if(mod == 'train'):
            continue
        OutCols = []
        if(mod == 'test'):
            OutCols.append('id')
        OutCols.extend(['air_store_id', 'visit_date', 'visitors', strategy])
        OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
        OutFoldData = FoldData[mod][OutCols]
        OutFoldData.to_csv(OutputFile, index= False)
    end = time.time()
    print('saving for %sth fold data done, time elapsed %.2fs.' % (fold, (end - start)))
    
y_test_pred /= kfold  # Average test set predictions
cv_score /= kfold # Average valid set predictions
holdout_score /= kfold # Average holdout set predictions

# Create submission file
sub = pd.DataFrame()
sub['id'] = test_dfs[0]['id']
sub['visitors'] = np.expm1(y_test_pred)
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l1/submit' % DataBaseDir
if(os.path.exists(SubmitDir) == False):
    os.makedirs(SubmitDir)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.6f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

finish_time = datetime.datetime.now()
elapsed = (finish_time - start_time).seconds
print('\n======================')
print("CV score %.4f, Holdout score %.4f, Elapsed time: %.2fs" % (cv_score, holdout_score, elapsed))
print('======================\n')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fold 0: valid score 0.508767, holdout score 0.498864, valid length 45371
saving for 0th fold data done, time elapsed 861.88s.
fold 1: valid score 0.500157, holdout score 0.497870, valid length 45371
saving for 1th fold data done, time elapsed 1754.15s.
fold 2: valid score 0.503304, holdout score 0.498024, valid length 45371
saving for 2th fold data done, time elapsed 2621.47s.
fold 3: valid score 0.509458, holdout score 0.499945, valid length 45370
saving for 3th fold data done, time elapsed 3528.02s.
fold 4: valid score 0.503285, holdout score 0.498950, valid length 45370
saving for 4th fold data done, time elapsed 4443.71s.
zip ../../data/l1/submit/cb_ef_submit_2018-01-02.zip ../../data/l1/submit/cb_ef_submit_2018-01-02.csv

CV score 0.5050, Holdout score 0.4987, Elapsed time: 4454.00s

