In [3]:
import numpy as np
import pandas as pd
import datetime
import time
import numba
import os,sys
import gc
import math

def LoadData(InputDir):
    """"""
    ## load raw data
    data = {
        'tra': pd.read_csv('%s/air_visit_data.csv' % InputDir, parse_dates= ['visit_date']),
        'as': pd.read_csv('%s/air_store_info.csv' % InputDir),
        'hs': pd.read_csv('%s/hpg_store_info.csv' % InputDir),
        'ar': pd.read_csv('%s/air_reserve.csv' % InputDir, parse_dates= ['visit_datetime', 'reserve_datetime']),
        'hr': pd.read_csv('%s/hpg_reserve.csv' % InputDir, parse_dates= ['visit_datetime', 'reserve_datetime']),
        'id': pd.read_csv('%s/store_id_relation.csv' % InputDir),
        'tes': pd.read_csv('%s/sample_submission.csv' % InputDir),
        'hol': pd.read_csv('%s/date_info.csv' % InputDir, parse_dates=['calendar_date']).rename(columns={'calendar_date': 'visit_date'})
    }
    return data

@numba.jit
def ApplyDayoff(VisitCols, ReserveCols):
    """"""
    n = len(VisitCols)
    result = np.zeros((n, 1), dtype= 'int8')
    for i in range(n):
        d = (VisitCols[i]- ReserveCols[i]).days
        if(d > 0):
            result[i] = d
    return result

reserve2id = {'ar': 'air', 'hr': 'hpg'}
reserve2store = {'ar': 'as', 'hr': 'hs'}# load data set
InputDir = '../../data/raw'
DataSet = LoadData(InputDir)
#### 
# date related features
print('\n============')
for mod in ['tra', 'tes']:
    start0 = time.time()
    if (mod == 'tes'):
        DataSet[mod]['visit_date'] = DataSet[mod]['id'].map(lambda x: str(x).split('_')[2])
        DataSet[mod]['air_store_id'] = DataSet[mod]['id'].map(lambda x: '_'.join(x.split('_')[:2]))
        DataSet[mod]['visit_date'] = pd.to_datetime(DataSet[mod]['visit_date'])
    DataSet[mod]['dow'] = DataSet[mod]['visit_date'].dt.dayofweek
    DataSet[mod]['year'] = DataSet[mod]['visit_date'].dt.year
    DataSet[mod]['month'] = DataSet[mod]['visit_date'].dt.month
    DataSet[mod]['visit_date'] = DataSet[mod]['visit_date'].dt.date
    end0 = time.time()
    print('%s data: unique stores %s, total %s, time elased %.2fs.' %
            (mod, len(DataSet[mod]['air_store_id'].unique()), len(DataSet[mod]['air_store_id']), (end0 - start0)))
print('============= process date related done.\n')
######## store data
# add city feature
for mod in ['ar', 'hr']:
    DataSet[reserve2store[mod]]['%s_city' % reserve2id[mod]] = DataSet[reserve2store[mod]]['%s_area_name' % reserve2id[mod]].str[:5]
    DataSet[reserve2store[mod]]['%s_area_name' % reserve2id[mod]] = DataSet[reserve2store[mod]]['%s_area_name' % reserve2id[mod]].map(lambda x: '_'.join(x.split(' ')[1:]))
print('add city feature done.')
# area (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_area_name' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_area_name' % reserve2id[mod]] = g
        ac['%s_area_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_area_name' % reserve2id[mod]])
# genre (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_genre_name' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_genre_name' % reserve2id[mod]] = g
        ac['%s_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_genre_name' % reserve2id[mod]])
#  area_genre (store) count 
for mod in ['ar', 'hr']:
    rec = []
    groupby_keys = ['%s_area_name' % reserve2id[mod], '%s_genre_name' % reserve2id[mod]]
    groupped = DataSet[reserve2store[mod]].groupby(groupby_keys)
    for g in groupped.groups:
        ac = {}
        ac['%s_area_name' % reserve2id[mod]] = g[0]
        ac['%s_genre_name' % reserve2id[mod]] = g[1]
        ac['%s_area_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= groupby_keys)
# city (store)count
for mod in ['ar', 'hr']:
    rec = []
    groupped = DataSet[reserve2store[mod]].groupby(['%s_city' % reserve2id[mod]])
    for g in groupped.groups:
        ac = {}
        ac['%s_city' % reserve2id[mod]] = g
        ac['%s_city_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        #ac['%s_area_store_ratio' % reserve2id[mod]] = ac['%s_area_store_count' % reserve2id[mod]]/len(DataSet[reserve2store[mod]])
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= ['%s_city' % reserve2id[mod]])
#  city_genre (store) count 
for mod in ['ar', 'hr']:
    rec = []
    groupby_keys = ['%s_city' % reserve2id[mod], '%s_genre_name' % reserve2id[mod]]
    groupped = DataSet[reserve2store[mod]].groupby(groupby_keys)
    for g in groupped.groups:
        ac = {}
        ac['%s_city' % reserve2id[mod]] = g[0]
        ac['%s_genre_name' % reserve2id[mod]] = g[1]
        ac['%s_city_genre_store_count' % reserve2id[mod]] = len(groupped.get_group(g)['%s_store_id' % reserve2id[mod]].unique())
        rec.append(ac)
    tmpdf = pd.DataFrame(data= rec, index= range(len(rec)))
    DataSet[reserve2store[mod]] = DataSet[reserve2store[mod]].merge(tmpdf, how= 'left', on= groupby_keys)
print(' ================ add count features done.\n')
######### holiday data
data = DataSet['hol']
### add holiday days
data['visit_date'] = data['visit_date'].dt.date
data = data.sort_values(by= 'visit_date')
def TagHoliday(df):
    ''''''
    n = len(df)
    result = ['' for x in range(n)]
    for i in range(n):
        if(i == 0):
            result[i] = 'hid_%s' % 0
        elif((df[i] - df[i-1]).days == 1):
            result[i] = result[i - 1]
        else:
            result[i] = 'hid_%s' % (int(result[i - 1].split('_')[1]) + 1)
    return result
holidays = data[data['holiday_flg'] == 1][['visit_date']]
holidays['hol_l0'] = TagHoliday(holidays['visit_date'].values)
groupped = holidays.groupby(['hol_l0'])
recs = []
for g in groupped.groups:
    hol_days = {}
    hol_days['hol_l0'] = g
    hol_days['hol_days'] = len(groupped.get_group(g))
    recs.append(hol_days)
tmpdf = pd.DataFrame(data= recs, index= range(len(recs)))
holidays = holidays.merge(tmpdf, how= 'left', on= 'hol_l0')
data = data.merge(holidays, how= 'left', on= 'visit_date')
data.drop(['hol_l0'], axis= 1, inplace= True)
data['hol_days'].fillna(0, inplace= True)
print('add holiday type done.')
### reset holiday
wkend_holidays = data.apply((lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
data['is_weekends'] = (data['day_of_week'] == 'Sunday') | (data['day_of_week'] == 'Saturday')
data.loc[wkend_holidays, 'holiday_flg'] = 0
DataSet['hol'] = data
print('========== reset holiday done.\n')
######## join 
# join holiday data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    data = data.merge(DataSet['hol'], how='left', on=['visit_date'])
    data.drop(['day_of_week', 'year'], axis=1, inplace=True)
    DataSet[mod] = data
# join store data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    for rtype in ['ar', 'hr']: 
        if((rtype == 'hr') & (('%s_store_id' % reserve2id[rtype]) not in data.columns)):
            data = data.merge(DataSet['id'], how= 'left', on= ['air_store_id'])
        data = data.merge(DataSet[reserve2store[rtype]], how= 'left', on= ['%s_store_id' % reserve2id[rtype]])
    DataSet[mod] = data
print('================ join holiday, store data done.')
######### reservation data
for mod in ['hr', 'ar']:
    start1 = time.time()
    DataSet[mod]['visit_date'] = DataSet[mod]['visit_datetime'].dt.date
    DataSet[mod]['reserve_date'] = DataSet[mod]['reserve_datetime'].dt.date
    DataSet[mod].drop(['reserve_datetime', 'visit_datetime'], axis= 1, inplace= True)
    tmpdf = pd.DataFrame(data=ApplyDayoff(DataSet[mod]['visit_date'].values, DataSet[mod]['reserve_date'].values),index=DataSet[mod].index, columns=['reserve_date_diff'])
    tmpdf = pd.concat([DataSet[mod], tmpdf], axis=1)
    tmpdf = tmpdf.groupby(['%s_store_id' % reserve2id[mod], 'visit_date'], as_index=False).agg({'reserve_visitors': sum, 'reserve_date_diff': ['mean', 'median']})
    tmpdf.columns = ['%s_store_id' % reserve2id[mod], 
                   'visit_date', 
                   '%s_reserved_visitors' % reserve2id[mod], 
                   '%s_reserved_dayoff_mean' % reserve2id[mod], 
                   '%s_reserved_dayoff_median' % reserve2id[mod]
                  ]
    end1 = time.time()
    DataSet[mod] = tmpdf
    DataSet[mod]['%s_reserved_visitors' % reserve2id[mod]] = np.log1p(DataSet[mod]['%s_reserved_visitors' % reserve2id[mod]])
print(' process reservation data done.\n')
# join reservation data
for mod in ['tra', 'tes']:
    data = DataSet[mod]
    for rtype in ['ar', 'hr']: 
        if((rtype == 'hr') & (('%s_store_id' % reserve2id[rtype]) not in data.columns)):
            data = data.merge(DataSet['id'], how= 'left', on= ['air_store_id'])
        data = data.merge(DataSet[rtype], how= 'left', on= ['%s_store_id' % reserve2id[rtype], 'visit_date'])
        # updated 2017/12/26 14:32
        data['%s_reserved_visitors' % reserve2id[rtype]].fillna(-1, inplace= True)
        data['%s_reserved_dayoff_mean' % reserve2id[rtype]].fillna(-1, inplace= True)
        data['%s_reserved_dayoff_median' % reserve2id[rtype]].fillna(-1, inplace= True)
    data['reserved_visitors'] = (data['air_reserved_visitors'] + data['hpg_reserved_visitors'])/2
    data['reserved_dayoff_mean'] = (data['air_reserved_dayoff_mean'] + data['hpg_reserved_dayoff_mean'])/2
    data['reserved_dayoff_median'] = (data['air_reserved_dayoff_median'] + data['hpg_reserved_dayoff_median'])/2
    DataSet[mod] = data
print('============= join reservation data done.\n')
####### time series related
s = time.time()

# mix train with test
DataSet['tra']['is_train'] = 1
DataSet['tes']['is_train'] = 0
AllData = pd.concat([DataSet['tra'], DataSet['tes']], axis= 0, ignore_index= True)
# !!! dividing into two pieces since 2016/7/1 is a corner point, update time 2017/12/22 15:45
DataParts = {
    '0': AllData[AllData['visit_date'] < datetime.date(2016, 7, 1)],
    '1': AllData[AllData['visit_date'] >= datetime.date(2016, 7, 1)]
}
for pidx in DataParts.keys():
    ## rolling sum by days
    groupped = DataParts[pidx].groupby(['air_store_id'])
    visitor_ticks = [39, 46, 53, 60, 67, 74, 81]#, 88, 95, 102, 109, 116, 123]  # for days
    print('total groups %s ' % len(groupped.groups))
    dfs = []
    for g in groupped.groups: 
        gdf = groupped.get_group(g).sort_values(by= ['visit_date'])
        for t in visitor_ticks:
            gdf['visitor_tick_sum_%s' % t] = np.log1p(gdf['visitors']).rolling(window= t).sum()
            gdf['visitor_tick_sum_%s' % t].fillna(0, inplace= True)
        dfs.append(gdf)
    # concate
    tmpdf = pd.concat(dfs, axis= 0, ignore_index= True)
    join_cols = ['air_store_id', 'visit_date']
    for i in range(len(visitor_ticks)):
        if(i == 0):
            continue
        # rolling mean for one week
        k_mean = 'visitor_rolling_%s_%s_mean' % (visitor_ticks[i], visitor_ticks[i - 1])
        tmpdf[k_mean] = (tmpdf['visitor_tick_sum_%s' % visitor_ticks[i]] - tmpdf['visitor_tick_sum_%s' % visitor_ticks[i - 1]]) / (visitor_ticks[i] - visitor_ticks[i - 1])
        tmpdf.loc[tmpdf[k_mean] < 0, k_mean] = -1  ## negative values exists, need to be set zero, updated 2016/12/22 20:30
        join_cols.append(k_mean)
    # merge
    tmpdf.drop(['visitor_tick_sum_%s' % col for col in visitor_ticks], axis= 1, inplace= True)
    DataParts[pidx] = DataParts[pidx].merge(tmpdf[join_cols], how= 'left', on= ['air_store_id', 'visit_date'])
    print('part %s rolling done.' % pidx)
# concat after all is done
AllData = pd.concat([DataParts['0'], DataParts['1']], axis= 0, ignore_index= True)
# restore
DataSet['tra'] = AllData[AllData['is_train'] == 1]
DataSet['tes'] = AllData[AllData['is_train'] == 0]
DataSet['tra'].drop(['is_train'], axis= 1, inplace= True)
DataSet['tes'].drop(['is_train'], axis= 1, inplace= True)
del AllData
gc.collect()
print('add time series features done.')
#### add date_int
for mod in ['tra', 'tes']:
    DataSet[mod]['date_int'] = DataSet[mod]['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    DataSet[mod]['date_int'] = DataSet[mod]['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
print('add date int features done.')
### add var_max_lat/var_max_long
for mod in ['tra', 'tes']:
    DataSet[mod]['lon_plus_lat_x'] = DataSet[mod]['longitude_x'] + DataSet[mod]['latitude_x'] 
    DataSet[mod]['var_max_long_x'] = DataSet[mod]['longitude_x'].max() - DataSet[mod]['longitude_x']
    DataSet[mod]['var_max_lat_x'] = DataSet[mod]['latitude_x'].max() - DataSet[mod]['latitude_x']
e = time.time()
print('time elapsed %ss' % ((e - s) * 60))
print(' ============= add time series related features done.\n')


tra data: unique stores 829, total 252108, time elased 0.79s.
tes data: unique stores 821, total 32019, time elased 0.14s.

add city feature done.

add holiday type done.

 process reservation data done.


total groups 316 
part 0 rolling done.
total groups 829 
part 1 rolling done.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


add time series features done.
add date int features done.
time elapsed 992.9583549499512s



In [4]:
### fill nulls, updated 2016/12/26 14:58
from sklearn import *
cate_feats = ['genre_name', 'area_name', 'city']
cate_cols = ['%s_%s' % (m, cf) for m in ['air', 'hpg'] for cf in cate_feats]
cate_cols.append('air_store_id')
for mod in ['tra', 'tes']:
    for col in DataSet[mod].columns:
        if(col in cate_cols):
            DataSet[mod][col].fillna('unknown', inplace= True)
        elif(col == 'latitude_y'):
            DataSet[mod][col].fillna(DataSet[mod]['latitude_x'], inplace= True)
        elif(col == 'longitude_y'):
            DataSet[mod][col].fillna(DataSet[mod]['longitude_x'], inplace= True)
        else:
            DataSet[mod][col].fillna(-1, inplace= True)
print('filling missings done.')



filling missings done.


In [5]:
#### Label encoding for categorial features
TrainData = DataSet['tra']
TestData = DataSet['tes']
# for col in cate_cols:
#     lbl = preprocessing.LabelEncoder()
#     TrainData[col] = lbl.fit_transform(TrainData[col])
#     TestData[col] = lbl.transform(TestData[col])
# print('encoding for categorial features done.')

In [10]:
import sys,os
from sklearn.preprocessing import LabelBinarizer 

def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

## !!! add corner tag since 2016/7/1 is a corner point, update time 2017/12/22 15:45
TrainData['is_up_corner'] = TrainData['visit_date'] < datetime.date(2016, 7, 1)
TestData['is_up_corner'] = TestData['visit_date'] < datetime.date(2016, 7, 1)
# split TrainData into train and holdout with random strategy
np.random.seed(2017)
msk = np.random.rand(len(TrainData)) < 0.1
holdout = TrainData[msk]
train = TrainData[~msk]
test = TestData
# Set up folds
K = 5
kf = model_selection.KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(1)
OutputDir = '../../data/l0'
if(os.path.exists('%s/kfold' % OutputDir) == False):
    os.makedirs('%s/kfold' % OutputDir)
cate_cols.append('air_store_id')
for i, (train_index, test_index) in enumerate(kf.split(train)):
    FoldTrain, FoldValid = train.iloc[train_index].copy(), train.iloc[test_index].copy()
    FoldHoldout = holdout.copy()
    FoldTest = test.copy()
    FoldTrain['visitors'] = np.log1p(FoldTrain['visitors'])
    FoldValid['visitors'] = np.log1p(FoldValid['visitors'])
    FoldHoldout['visitors'] = np.log1p(FoldHoldout['visitors'])
    FoldTest['visitors'] = np.log1p(FoldTest['visitors'])
    #### one hot encoding
    for col in cate_cols:
        label_binarizer = LabelBinarizer()
        label_binarizer.fit(FoldTrain[col])
        encoded_cols = ['%s:%s' % (col, c) for c in label_binarizer.classes_]
        # for valid
        encoded = pd.DataFrame(data= label_binarizer.transform(FoldValid[col]), index= FoldValid.index, columns= encoded_cols)
        FoldValid = pd.concat([FoldValid, encoded], axis= 1)
        FoldValid.drop([col], axis= 1, inplace= True)
        # for holdout
        encoded = pd.DataFrame(data= label_binarizer.transform(FoldHoldout[col]), index= FoldHoldout.index, columns= encoded_cols)
        FoldHoldout = pd.concat([FoldHoldout, encoded], axis= 1)
        FoldHoldout.drop([col], axis= 1, inplace= True)
        # for test
        encoded = pd.DataFrame(data= label_binarizer.transform(FoldTest[col]), index= FoldTest.index, columns= encoded_cols)
        FoldTest = pd.concat([FoldTest, encoded], axis= 1)
        FoldTest.drop([col], axis= 1, inplace= True)
        #### Label encoding for categorial features
#     for col in cate_cols:
#         lbl = preprocessing.LabelEncoder()
#         lbl.fit(FoldTrain[col])
#         FoldValid[col] = lbl.transform(FoldValid[col])
#         FoldHoldout[col] = lbl.transform(FoldHoldout[col])
#         FoldTest[col] = lbl.transform(FoldTest[col])
    ## Target encoding for categorial features
#     FoldTrain['log1p_visitors'] = np.log1p(FoldTrain['visitors'])
#     for col in cate_cols:
#         gkeys = [col, 'is_up_corner']
#         tmpdf = FoldTrain.groupby(gkeys, as_index= False).agg({'log1p_visitors': np.mean})
#         tmpcols = gkeys.copy()
#         tmpcols.extend(['mean_log1p_visitors'])
#         tmpdf.columns = tmpcols
#         FoldValid = FoldValid.merge(tmpdf, how= 'left', on= gkeys)
#         FoldHoldout = FoldHoldout.merge(tmpdf, how= 'left', on= gkeys)
#         FoldTest = FoldTest.merge(tmpdf, how= 'left', on= gkeys)
#         FoldValid[col] = FoldValid['mean_log1p_visitors']
#         FoldHoldout[col] = FoldHoldout['mean_log1p_visitors']
#         FoldTest[col] = FoldTest['mean_log1p_visitors']
#         FoldValid.drop(['mean_log1p_visitors'], axis= 1, inplace= True)
#         FoldHoldout.drop(['mean_log1p_visitors'], axis= 1, inplace= True)
#         FoldTest.drop(['mean_log1p_visitors'], axis= 1, inplace= True)
#         FoldValid[col].fillna(0, inplace= True)
#         FoldHoldout[col].fillna(0, inplace= True)
#         FoldTest[col].fillna(0, inplace= True)
#     FoldTrain.drop(['log1p_visitors'], axis= 1, inplace= True)
    
    print('encoding for fold %s done.' % i)
    #### dependent features which is extreemly subtle to data-leak
    # percentiles features
    tickles = ['mean', 'median', 'max', 'min', 'count']
    for feat in ['air_store_id']:
        gkeys = [feat, 'dow', 'is_up_corner']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'visitors': tickles})    
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        #FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys) #### data-leak, prone to be overfitted
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid.fillna(0, inplace= True)
        FoldHoldout.fillna(0, inplace= True)
        FoldTest.fillna(0, inplace= True)
    for feat in ['air_city']:
        gkeys = [feat, 'air_genre_name', 'is_up_corner']
        gprefix = '_'.join(gkeys)
        TmpDOW = FoldTrain.groupby(gkeys, as_index= False).agg({'visitors': tickles})    
        tmpcols = gkeys.copy()
        tmpcols.extend(['%s_%s' % (gprefix, m) for m in tickles])
        TmpDOW.columns = tmpcols
        #FoldTrain = FoldTrain.merge(TmpDOW, how= 'left', on=gkeys) #### data-leak, prone to be overfitted
        FoldValid = FoldValid.merge(TmpDOW, how= 'left', on=gkeys)
        FoldHoldout = FoldHoldout.merge(TmpDOW, how= 'left', on=gkeys)
        FoldTest = FoldTest.merge(TmpDOW, how= 'left', on=gkeys)
        FoldValid.fillna(0, inplace= True)
        FoldHoldout.fillna(0, inplace= True)
        FoldTest.fillna(0, inplace= True)
    print('add pencentile features for fold %s done.' % i)
    FoldOutputDir = '%s/kfold/%s' % (OutputDir, i)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    #FoldTrain.to_csv('%s/train.csv' % FoldOutputDir)
    FoldValid.to_csv('%s/valid.csv' % FoldOutputDir, index= False)
    FoldHoldout.to_csv('%s/holdout.csv' % FoldOutputDir, index= False)
    FoldTest.to_csv('%s/test.csv' % FoldOutputDir, index= False)
    
    print('Fold %s done.' % i)

encoding for fold 0 done.
add pencentile features for fold 0 done.
Fold 0 done.
encoding for fold 1 done.
add pencentile features for fold 1 done.
Fold 1 done.
encoding for fold 2 done.
add pencentile features for fold 2 done.
Fold 2 done.
encoding for fold 3 done.
add pencentile features for fold 3 done.
Fold 3 done.
encoding for fold 4 done.
add pencentile features for fold 4 done.
Fold 4 done.
