In [69]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Reshape, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import Merge, Flatten
from keras.callbacks import ModelCheckpoint
from keras.initializers import random_normal
import datetime,sys,os,time
import pandas as pd
import numpy as np
import sys,os
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from sklearn import preprocessing
from scipy.special import erfinv
from tensorflow import set_random_seed
set_random_seed(2018)

In [70]:
import json
import pandas as pd
import dill as pickle

class DataUtil2:
    """"""
    @classmethod
    def load(cls, file, format, date_cols= None):
        """"""
        data = ''
        if(format== 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format== 'json'):
            with open(file, 'r') as i_file:
                data = json.load(file)
            i_file.close()
        elif(format== 'pkl'):
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
            i_file.close()
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key='undefined')

        return  data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """"""
        if(format == 'csv'):
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
            o_file.close()
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                pickle.dump(data, o_file, -1)
            o_file.close()
        elif(format== 'hdf'):
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')

        return

In [71]:
cate_cols = ['store_id_encoded', 'area_name', 'city', 'genre_name']
cate_num_feats = ['dow', 'day', 'wom', 'woy', 'holidays_by_month', 'before_year', 'after_year', 'first_day_holiday', 
                  'last_day_holiday', 'hol_days', 'is_weekends', 'holiday_flg', 'month', 'is_up_corner', 
                  'prev_is_holiday', 'next_is_holiday']
cate_feats = cate_num_feats.copy()
for mod in ['air', 'hpg']:
    cate_feats.extend(['%s_%s' % (mod, c) for c in cate_cols])
## count related
count_cols = ['area_genre_store', 'area_store', 'city_genre_store', 'city_store', 'genre_store']
num_feats = []
for mod in ['air', 'hpg']:
    num_feats.extend(['count_%s_%s' % (mod, c) for c in count_cols])
## aggregate related
# agg_cols = ['dow', 'is_weekends', 'hol_days', 'prev_is_holiday', 'next_is_holiday']
# for a in agg_cols:
#     for b in ['visitors', 'reserve_visitors']:
#         for c in ['max', 'mean', 'median', 'min', 'count']:
#             num_feats.append('air_store_id_%s_is_up_corner_%s_%s' % (a, b, c))
# for a in agg_cols:
#     for b in ['max', 'mean', 'median', 'min', 'count']:
#         c1 = 'air_store_id_%s_is_up_corner_visitors_%s' % (a, b)
#         c2 = 'air_store_id_%s_is_up_corner_reserve_visitors_%s' % (a, b)
#         num_feats.append('%s_%s_diff' % (c1, c2))
# ## inter count related
# pairs = [('air_area_genre_store_count', 'air_area_store_count'), 
#          ('air_city_genre_store_count', 'air_city_store_count'), 
#          ('air_genre_store_count', 'air_area_genre_store_count'), 
#          ('air_genre_store_count', 'air_city_genre_store_count'), 
#          ('air_genre_store_count', 'air_city_store_count'), 
#          ('air_genre_store_count', 'air_area_store_count'), 
#          ('air_area_store_count', 'air_city_store_count'), 
#          ('hpg_area_genre_store_count', 'hpg_area_store_count'), 
#          ('hpg_city_genre_store_count', 'hpg_city_store_count'), 
#          ('hpg_genre_store_count', 'hpg_city_genre_store_count'), 
#          ('hpg_genre_store_count', 'hpg_area_genre_store_count')]
# for pair in pairs:
#     num_feats.append('inter_%s_%s_multiply' % (pair[0], pair[1]))
#     num_feats.append('inter_%s_%s_divide' % (pair[0], pair[1]))
# pairs = [('air_area_genre_store_count', 'hpg_area_genre_store_count'), 
#          ('air_area_store_count', 'hpg_area_store_count'), 
#          ('air_city_genre_store_count', 'hpg_city_genre_store_count'), 
#          ('air_city_store_count', 'hpg_city_store_count'), 
#          ('air_genre_store_count', 'hpg_genre_store_count')]
# for pair in pairs:
#     num_feats.append('inter_%s_%s_plus' % (pair[0], pair[1]))
#     num_feats.append('inter_%s_%s_divide' % (pair[0], pair[1]))
# ## rolling related
# rolling_cols = ['46_39', '53_46', '60_53', '67_60', '74_67']
# for idx in range(len(rolling_cols)):
#     for mod in ['visitors', 'reserve_visitors']:
#         num_feats.append('rolling_%s_%s' % (mod, rolling_cols[idx]))
#         if(idx < len(rolling_cols) - 1):
#             num_feats.append('rolling_%s_%s_rolling_%s_%s_var' % (mod, rolling_cols[idx], mod, rolling_cols[idx + 1]))
#     num_feats.append('rolling_visitors_%s_rolling_reserve_visitors_%s_diff' % (rolling_cols[idx], rolling_cols[idx]))
#     if(idx < len(rolling_cols) - 1):
#         c1 = 'rolling_visitors_%s_rolling_reserve_visitors_%s_diff' % (rolling_cols[idx], rolling_cols[idx])
#         c2 = 'rolling_visitors_%s_rolling_reserve_visitors_%s_diff' % (rolling_cols[idx + 1], rolling_cols[idx + 1])
#         num_feats.append('%s_%s_var' % (c1, c2))
        
def split_features(X):
    X_list = []
    ## for categorical feats
    for idx in range(len(cate_feats)):
        feat_vec = X[:, [idx]]
        X_list.append(feat_vec)
    ## for numeric feats
    for idx in range(len(num_feats)):
        feat_vec = X[:, [len(cate_feats) + idx]] 
        X_list.append(feat_vec)
    return X_list

class Model(object):

    def evaluate(self, X_val, y_val):
        assert(min(y_val) > 0)
        guessed_sales = self.guess(X_val)
        relative_err = np.absolute((y_val - guessed_sales) / y_val)
        result = np.sum(relative_err) / len(y_val)
        return result

class NN_with_EntityEmbedding(Model):
    def __init__(self, X_train, y_train, X_val, y_val, dim_dict):
        super().__init__()
        self.nb_epoch = 10
        self.input_dim_dict = dim_dict
        self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        #self.max_log_y = max(np.max(np.log(y_train)),  np.max(np.log(y_val)))
        self.max_log_y = max(np.max(y_train), np.max(y_val))
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        X_list = split_features(X)
        return X_list

    def __build_keras_model(self):
        models = []

        model_dow = Sequential()
        model_dow.add(Embedding(self.input_dim_dict['dow'], 6, input_length=1))
#         model_dow.add(Dropout(0.25))
        model_dow.add(Flatten())
        models.append(model_dow)

        model_day = Sequential()
        model_day.add(Embedding(self.input_dim_dict['day'], 4, input_length=1))
        model_day.add(Reshape(target_shape=(4,)))
        models.append(model_day)
        
        model_wom = Sequential()
        model_wom.add(Embedding(self.input_dim_dict['wom'], 3, input_length=1))
        model_wom.add(Reshape(target_shape=(3,)))
        models.append(model_wom)
        
        model_woy = Sequential()
        model_woy.add(Embedding(self.input_dim_dict['woy'], 4, input_length=1))
        model_woy.add(Reshape(target_shape=(4,)))
        models.append(model_woy)
        
        model_hbm = Sequential()
        model_hbm.add(Embedding(self.input_dim_dict['holidays_by_month'], 3, input_length=1))
        model_hbm.add(Reshape(target_shape=(3,)))
        models.append(model_hbm)
        
        model_by = Sequential()
        model_by.add(Embedding(self.input_dim_dict['before_year'], 2, input_length=1))
        model_by.add(Reshape(target_shape=(2,)))
        models.append(model_by)
        
        model_ay = Sequential()
        model_ay.add(Embedding(self.input_dim_dict['after_year'], 2, input_length=1))
        model_ay.add(Reshape(target_shape=(2,)))
        models.append(model_ay)
        
        model_fh = Sequential()
        model_fh.add(Embedding(self.input_dim_dict['first_day_holiday'], 2, input_length=1))
        model_fh.add(Reshape(target_shape=(2,)))
        models.append(model_fh)
        
        model_lh = Sequential()
        model_lh.add(Embedding(self.input_dim_dict['last_day_holiday'], 2, input_length=1))
        model_lh.add(Reshape(target_shape=(2,)))
        models.append(model_lh)
        
        model_holidays = Sequential()
        model_holidays.add(Embedding(self.input_dim_dict['hol_days'], 4, input_length=1))
#         model_holidays.add(Dropout(0.25))
        model_holidays.add(Flatten())
        models.append(model_holidays)

        model_isweekends = Sequential()
        model_isweekends.add(Dense(1, input_dim=1))
        models.append(model_isweekends)
        
        model_holidayflg = Sequential()
        model_holidayflg.add(Dense(1, input_dim=1))
        models.append(model_holidayflg)
           
        model_month = Sequential()
        model_month.add(Embedding(12, 6, input_length=1))
#         model_month.add(Dropout(0.25))
        model_month.add(Flatten())
        models.append(model_month)
        
        model_isupcorner = Sequential()
        model_isupcorner.add(Dense(1, input_dim=1))
        models.append(model_isupcorner)
        
        model_previsholiday = Sequential()
        model_previsholiday.add(Dense(1, input_dim=1))
        models.append(model_previsholiday)
        
        model_nextisholiday = Sequential()
        model_nextisholiday.add(Dense(1, input_dim=1))
        models.append(model_nextisholiday)
        
        model_airstoreid = Sequential()
        model_airstoreid.add(Embedding(self.input_dim_dict['air_store_id_encoded'], 10, input_length=1))
#         model_airstoreid.add(Dropout(0.25))
        model_airstoreid.add(Flatten())
        models.append(model_airstoreid)

        model_airareaname = Sequential()
        model_airareaname.add(Embedding(self.input_dim_dict['air_area_name'], 10, input_length=1))
#         model_airareaname.add(Dropout(0.25))
        model_airareaname.add(Flatten())
        models.append(model_airareaname)

        model_aircity = Sequential()
        model_aircity.add(Embedding(self.input_dim_dict['air_city'], 6, input_length=1))
#         model_aircity.add(Dropout(0.25))
        model_aircity.add(Flatten())
        models.append(model_aircity)
        
        model_airgenrename = Sequential()
        model_airgenrename.add(Embedding(self.input_dim_dict['air_genre_name'], 6, input_length=1))
#         model_airgenrename.add(Dropout(0.25))
        model_airgenrename.add(Flatten())
        models.append(model_airgenrename)
        
        model_hpgstoreid = Sequential()
        model_hpgstoreid.add(Embedding(self.input_dim_dict['hpg_store_id_encoded'], 10, input_length=1))
#         model_hpgstoreid.add(Dropout(0.25))
        model_hpgstoreid.add(Flatten())
        models.append(model_hpgstoreid)

        model_hpgareaname = Sequential()
        model_hpgareaname.add(Embedding(self.input_dim_dict['hpg_area_name'], 10, input_length=1))
#         model_hpgareaname.add(Dropout(0.25))
        model_hpgareaname.add(Flatten())
        models.append(model_hpgareaname)
        
        model_hpgcity = Sequential()
        model_hpgcity.add(Embedding(self.input_dim_dict['hpg_city'], 6, input_length=1))
#         model_hpgcity.add(Dropout(0.25))
        model_hpgcity.add(Flatten())
        models.append(model_hpgcity)
        
        model_hpggenrename = Sequential()
        model_hpggenrename.add(Embedding(self.input_dim_dict['hpg_genre_name'], 6, input_length=1))
#         model_hpggenrename.add(Dropout(0.25))
        model_hpggenrename.add(Flatten())
        models.append(model_hpggenrename)
        
        for i in range(len(num_feats)):
            num_model = Sequential()
            num_model.add(Dense(1, input_dim= 1))
            models.append(num_model)
        
        self.model = Sequential()
        self.model.add(Merge(models, mode='concat'))
        
        self.model.add(Dense(256,
                             kernel_initializer= random_normal(mean=0.0,stddev=0.05, seed=None), 
                             name= 'hidden1')
                      )
        self.model.add(PReLU())
        self.model.add(BatchNormalization())
#         self.model.add(Dropout(0.25))
        
        self.model.add(Dense(64, 
                             kernel_initializer= random_normal(mean=0.0,stddev=0.05, seed=None), 
                             name= 'hidden2'))
        self.model.add(PReLU())
        self.model.add(BatchNormalization())
#         self.model.add(Dropout(0.10))
        ## 
        self.model.add(Dense(1))
        self.model.add(Activation('relu'))
        
        self.model.compile(loss='mean_absolute_error', optimizer='adm')

    def _val_for_fit(self, val):
        val = val / self.max_log_y
        return val

    def _val_for_pred(self, val):
        return val * self.max_log_y

    def fit(self, X_train, y_train, X_val, y_val):
        self.model.fit(self.preprocessing(X_train), self._val_for_fit(y_train), validation_data=(self.preprocessing(X_val), self._val_for_fit(y_val)), nb_epoch= self.nb_epoch, batch_size= 128)
        # self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        features = self.preprocessing(features)
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)

In [72]:
from sklearn import preprocessing
from sklearn import metrics

def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

DataBaseDir = '../../data'
InputDir = '%s/l0/kfold' % DataBaseDir
OutputDir = '%s/meta/kfold' % DataBaseDir
kfold = 5
strategy = 'nn'
start_time = datetime.datetime.now()
#### load data
valid_dfs = []
holdout_dfs = []
test_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    holdout = pd.read_csv('%s/holdout.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    test = pd.read_csv('%s/test.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    valid['fold'] = fold
    valid_dfs.append(valid)
    holdout_dfs.append(holdout)
    ## TODO
#     test = test.sample(frac= 0.1)
    test_dfs.append(test)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True) 

# score
cv_score = .0
holdout_score = .0
# predict
y_test_pred = 0
# parameters
start = time.time()

## TODO
# TrainData = TrainData.sample(frac= 0.2)
for fold in range(kfold):
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'holdout': holdout_dfs[fold],
        'test': test_dfs[fold]
    }
    ## Guass Normalization
#     all_data = FoldData['train'][num_feats]
#     scaler = preprocessing.Normalizer()
#     scaler.fit(all_data)
#     FoldData['train'][num_feats] = scaler.transform(FoldData['train'][num_feats])
#     FoldData['valid'][num_feats] = scaler.transform(FoldData['valid'][num_feats])
#     FoldData['holdout'][num_feats] = scaler.transform(FoldData['holdout'][num_feats])
#     FoldData['test'][num_feats] = scaler.transform(FoldData['test'][num_feats])

    ## Guass Rank Normalization
    FoldData['train']['source'] = 'train'
    FoldData['valid']['source'] = 'valid'
    FoldData['holdout']['source'] = 'holdout'
    FoldData['test']['source'] = 'test'
    all_data = pd.concat([FoldData['train'], FoldData['valid'], FoldData['holdout'], FoldData['test']], ignore_index= True)
    for c in num_feats:
        rank = np.argsort(all_data[c], axis= 0)
        upper = np.max(rank)
        lower = np.min(rank)
        # linear normalization to 0-1
        all_data[c] = (all_data[c] - lower)/(upper - lower)
        # gauss normalization
        all_data[c] = erfinv(all_data[c])
        all_data[c] -= np.mean(all_data[c])
    FoldData['train'] = all_data[all_data['source'] == 'train'].drop(['source'], axis= 1)
    FoldData['valid'] = all_data[all_data['source'] == 'valid'].drop(['source'], axis= 1)
    FoldData['holdout'] = all_data[all_data['source'] == 'holdout'].drop(['source'], axis= 1)
    FoldData['test'] = all_data[all_data['source'] == 'test'].drop(['source'], axis= 1)

    #
    for c in cate_num_feats:#['hol_days', 'month']:
        lbl = preprocessing.LabelEncoder()
        FoldData['train'][c] = lbl.fit_transform(FoldData['train'][c])
        FoldData['valid'][c] = lbl.transform(FoldData['valid'][c])
        FoldData['holdout'][c] = lbl.transform(FoldData['holdout'][c])
        FoldData['test'][c] = lbl.transform(FoldData['test'][c])           
    # train
    input_dim_dict = {}
    for c in cate_feats:
        input_dim_dict[c] = len(FoldData['train'][c].unique())
    print(input_dim_dict)
    feats = cate_feats.copy()
    feats.extend(num_feats)
    
    print('feature size %s' % len(feats))
    
    model = NN_with_EntityEmbedding(FoldData['train'][feats].values, FoldData['train']['visitors'].values,FoldData['valid'][feats].values, FoldData['valid']['visitors'].values, input_dim_dict)
    # for valid
    FoldData['valid'][strategy] = model.guess(FoldData['valid'][feats].values)
    rmsle_valid = RMSLE(FoldData['valid']['visitors'].values, FoldData['valid'][strategy])
    cv_score += rmsle_valid
    # for holdout
    FoldData['holdout'][strategy] = model.guess(FoldData['holdout'][feats].values)
    rmsle_holdout = RMSLE(FoldData['holdout']['visitors'].values, FoldData['holdout'][strategy])
    holdout_score += rmsle_holdout
    # for test
    FoldData['test'][strategy] = model.guess(FoldData['test'][feats].values)
    y_test_pred += FoldData['test'][strategy]

    print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (fold, rmsle_valid, rmsle_holdout, len(FoldData['valid'])))  
    #### output
    FoldOutputDir = '%s/%s' % (OutputDir, fold)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    for mod in FoldData.keys():
        if(mod == 'train'):
            continue
        OutCols = []
        if(mod == 'test'):
            OutCols.append('id')
        OutCols.extend(['air_store_id', 'visit_date', 'visitors', strategy])
        OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
        OutFoldData = FoldData[mod][OutCols]
        OutFoldData.to_csv(OutputFile, index= False)
    end = time.time()
    print('saving for %sth fold data done, time elapsed %.2fs.' % (fold, (end - start)))
    
y_test_pred /= kfold  # Average test set predictions
cv_score /= kfold # Average valid set predictions
holdout_score /= kfold # Average holdout set predictions

# Create submission file
sub = pd.DataFrame()
sub['id'] = test_dfs[0]['id']
sub['visitors'] = np.expm1(y_test_pred)
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l1/submit' % DataBaseDir
if(os.path.exists(SubmitDir) == False):
    os.makedirs(SubmitDir)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.6f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

finish_time = datetime.datetime.now()
elapsed = (finish_time - start_time).seconds
print('\n======================')
print("CV score %.4f, Holdout score %.4f, Elapsed time: %.2fs" % (cv_score, holdout_score, elapsed))
print('======================\n')

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


{'dow': 7, 'day': 31, 'wom': 6, 'woy': 53, 'holidays_by_month': 7, 'before_year': 2, 'after_year': 2, 'first_day_holiday': 3, 'last_day_holiday': 3, 'hol_days': 5, 'is_weekends': 2, 'holiday_flg': 2, 'month': 12, 'is_up_corner': 2, 'prev_is_holiday': 2, 'next_is_holiday': 2, 'air_store_id_encoded': 829, 'air_area_name': 103, 'air_city': 9, 'air_genre_name': 14, 'hpg_store_id_encoded': 151, 'hpg_area_name': 34, 'hpg_city': 12, 'hpg_genre_name': 17}
feature size 34




Train on 181482 samples, validate on 45371 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.190866307702
fold 0: valid score 0.544207, holdout score 0.534376, valid length 45371
saving for 0th fold data done, time elapsed 257.62s.
{'dow': 7, 'day': 31, 'wom': 6, 'woy': 53, 'holidays_by_month': 7, 'before_year': 2, 'after_year': 2, 'first_day_holiday': 3, 'last_day_holiday': 3, 'hol_days': 5, 'is_weekends': 2, 'holiday_flg': 2, 'month': 12, 'is_up_corner': 2, 'prev_is_holiday': 2, 'next_is_holiday': 2, 'air_store_id_encoded': 829, 'air_area_name': 103, 'air_city': 9, 'air_genre_name': 14, 'hpg_store_id_encoded': 151, 'hpg_area_name': 34, 'hpg_city': 12, 'hpg_genre_name': 17}
feature size 34


KeyboardInterrupt: 