In [1]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Reshape
from keras.layers.embeddings import Embedding
from keras.layers import Merge
from keras.callbacks import ModelCheckpoint
import datetime,sys,os,time
import pandas as pd
import numpy as np
import sys,os

Using TensorFlow backend.


In [2]:
cate_cols = ['store_id_encoded', 'area_name', 'city', 'genre_name']
cate_feats = ['dow', 'hol_days', 'day', 'pom', 'wom', 'woy', 'prev_is_holiday', 'next_is_holiday', 'is_weekends', 'holiday_flg', 'month', 'is_up_corner']
for mod in ['air', 'hpg']:
    cate_feats.extend(['%s_%s' % (mod, c) for c in cate_cols])

def split_features(X):
    X_list = []
    for idx in range(len(cate_feats)):
        feat_vec = X[..., [idx]]
        X_list.append(feat_vec)        
    return X_list

class Model(object):

    def evaluate(self, X_val, y_val):
        assert(min(y_val) > 0)
        guessed_sales = self.guess(X_val)
        relative_err = np.absolute((y_val - guessed_sales) / y_val)
        result = np.sum(relative_err) / len(y_val)
        return result

class NN_with_EntityEmbedding(Model):
    def __init__(self, X_train, y_train, X_val, y_val, dim_dict):
        super().__init__()
        self.nb_epoch = 10
        self.input_dim_dict = dim_dict
        self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        #self.max_log_y = max(np.max(np.log(y_train)),  np.max(np.log(y_val)))
        self.max_log_y = max(np.max(y_train), np.max(y_val))
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        X_list = split_features(X)
        return X_list

    def __build_keras_model(self):
        models = []

        model_dow = Sequential()
        model_dow.add(Embedding(self.input_dim_dict['dow'], 6, input_length=1))
        model_dow.add(Reshape(target_shape=(6,)))
        models.append(model_dow)

        model_holidays = Sequential()
        model_holidays.add(Embedding(self.input_dim_dict['hol_days'], 4, input_length=1))
        model_holidays.add(Reshape(target_shape=(4,)))
        models.append(model_holidays)
        
        model_day = Sequential()
        model_day.add(Embedding(self.input_dim_dict['day'], 10, input_length=1))
        model_day.add(Reshape(target_shape=(10,)))
        models.append(model_day)
        
        model_pom = Sequential()
        model_pom.add(Embedding(self.input_dim_dict['pom'], 2, input_length=1))
        model_pom.add(Reshape(target_shape=(2,)))
        models.append(model_pom)
        
        model_wom = Sequential()
        model_wom.add(Embedding(self.input_dim_dict['wom'], 4, input_length=1))
        model_wom.add(Reshape(target_shape=(4,)))
        models.append(model_wom)
        
        model_woy = Sequential()
        model_woy.add(Embedding(self.input_dim_dict['woy'], 6, input_length=1))
        model_woy.add(Reshape(target_shape=(6,)))
        models.append(model_woy)
    
#         model_dow = Sequential()
#         model_dow.add(Dense(1, input_dim=1))
#         models.append(model_dow)

#         model_holidays = Sequential()
#         model_holidays.add(Dense(1, input_dim=1))
#         models.append(model_holidays)
        
#         model_day = Sequential()
#         model_day.add(Dense(1, input_dim=1))
#         models.append(model_day)
        
#         model_pom = Sequential()
#         model_pom.add(Dense(1, input_dim=1))
#         models.append(model_pom)
        
    
        model_previsholiday = Sequential()
        model_previsholiday.add(Dense(1, input_dim=1))
        models.append(model_previsholiday)
        
        model_nextisholiday = Sequential()
        model_nextisholiday.add(Dense(1, input_dim=1))
        models.append(model_nextisholiday)
        
        ##
        
        model_isweekends = Sequential()
        model_isweekends.add(Dense(1, input_dim=1))
        models.append(model_isweekends)
        
        model_holidayflg = Sequential()
        model_holidayflg.add(Dense(1, input_dim=1))
        models.append(model_holidayflg)
           
        model_month = Sequential()
        model_month.add(Embedding(12, 6, input_length=1))
        model_month.add(Reshape(target_shape=(6,)))
        models.append(model_month)
        
        model_isupcorner = Sequential()
        model_isupcorner.add(Dense(1, input_dim=1))
        models.append(model_isupcorner)
        
        model_airstoreid = Sequential()
        model_airstoreid.add(Embedding(self.input_dim_dict['air_store_id_encoded'], 10, input_length=1))
        model_airstoreid.add(Reshape(target_shape=(10,)))
        models.append(model_airstoreid)

        model_airareaname = Sequential()
        model_airareaname.add(Embedding(self.input_dim_dict['air_area_name'], 10, input_length=1))
        model_airareaname.add(Reshape(target_shape=(10,)))
        models.append(model_airareaname)

        model_aircity = Sequential()
        model_aircity.add(Embedding(self.input_dim_dict['air_city'], 4, input_length=1))
        model_aircity.add(Reshape(target_shape=(4,)))
        models.append(model_aircity)
        
        model_airgenrename = Sequential()
        model_airgenrename.add(Embedding(self.input_dim_dict['air_genre_name'], 4, input_length=1))
        model_airgenrename.add(Reshape(target_shape=(4,)))
        models.append(model_airgenrename)
        
        model_hpgstoreid = Sequential()
        model_hpgstoreid.add(Embedding(self.input_dim_dict['hpg_store_id_encoded'], 10, input_length=1))
        model_hpgstoreid.add(Reshape(target_shape=(10,)))
        models.append(model_hpgstoreid)

        model_hpgareaname = Sequential()
        model_hpgareaname.add(Embedding(self.input_dim_dict['hpg_area_name'], 10, input_length=1))
        model_hpgareaname.add(Reshape(target_shape=(10,)))
        models.append(model_hpgareaname)
        
        model_hpgcity = Sequential()
        model_hpgcity.add(Embedding(self.input_dim_dict['hpg_city'], 4, input_length=1))
        model_hpgcity.add(Reshape(target_shape=(4,)))
        models.append(model_hpgcity)
        
        model_hpggenrename = Sequential()
        model_hpggenrename.add(Embedding(self.input_dim_dict['hpg_genre_name'], 4, input_length=1))
        model_hpggenrename.add(Reshape(target_shape=(6,)))
        models.append(model_hpggenrename)
        
        self.model = Sequential()
        self.model.add(Merge(models, mode='concat'))
        self.model.add(Dense(1000, init='uniform'))
        self.model.add(Activation('relu'))
        self.model.add(Dense(500, init='uniform'))
        self.model.add(Activation('relu'))
        self.model.add(Dense(1))
        self.model.add(Activation('sigmoid'))

        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def _val_for_fit(self, val):
        val = val / self.max_log_y
        return val

    def _val_for_pred(self, val):
        return val * self.max_log_y

    def fit(self, X_train, y_train, X_val, y_val):
        self.model.fit(self.preprocessing(X_train), self._val_for_fit(y_train), validation_data=(self.preprocessing(X_val), self._val_for_fit(y_val)), nb_epoch= self.nb_epoch, batch_size= 128)
        # self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        features = self.preprocessing(features)
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)

In [3]:
import json
import pandas as pd
import dill as pickle

class DataUtil2:
    """"""
    @classmethod
    def load(cls, file, format, date_cols= None):
        """"""
        data = ''
        if(format== 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format== 'json'):
            with open(file, 'r') as i_file:
                data = json.load(file)
            i_file.close()
        elif(format== 'pkl'):
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
            i_file.close()
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key='undefined')

        return  data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """"""
        if(format == 'csv'):
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
            o_file.close()
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                pickle.dump(data, o_file, -1)
            o_file.close()
        elif(format== 'hdf'):
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')

        return

In [4]:
from sklearn import preprocessing
from sklearn import metrics

def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred) ** 0.5

DataBaseDir = '../../data'
InputDir = '%s/l0/kfold' % DataBaseDir
OutputDir = '%s/l1/kfold' % DataBaseDir
kfold = 5
strategy = 'NN_EF'
start_time = datetime.datetime.now()
#### load data
valid_dfs = []
holdout_dfs = []
test_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    holdout = pd.read_csv('%s/holdout.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    test = pd.read_csv('%s/test.csv' % FoldInputDir, parse_dates= ['visit_date']).reset_index(drop= True)
    valid['fold'] = fold
    valid_dfs.append(valid)
    holdout_dfs.append(holdout)
    test_dfs.append(test)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True) 

# score
cv_score = .0
holdout_score = .0
# predict
y_test_pred = 0
# parameters
start = time.time()
for fold in range(kfold):
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'holdout': holdout_dfs[fold],
        'test': test_dfs[fold]
    }
    #
    for c in ['hol_days', 'month']:
        lbl = preprocessing.LabelEncoder()
        FoldData['train'][c] = lbl.fit_transform(FoldData['train'][c])
        FoldData['valid'][c] = lbl.transform(FoldData['valid'][c])
        FoldData['holdout'][c] = lbl.transform(FoldData['holdout'][c])
        FoldData['test'][c] = lbl.transform(FoldData['test'][c])           
#         assert(len(FoldData['train'][c].unique()) == len(FoldData['valid'][c].unique()))
#         assert(len(FoldData['train'][c].unique()) == len(FoldData['holdout'][c].unique()))
#         assert(len(FoldData['train'][c].unique()) == len(FoldData['test'][c].unique()))
    # train
    input_dim_dict = {}
    for c in cate_feats:
        input_dim_dict[c] = len(FoldData['train'][c].unique())
    print(input_dim_dict)
    model = NN_with_EntityEmbedding(FoldData['train'][cate_feats].values, FoldData['train']['visitors'].values,FoldData['valid'][cate_feats].values, FoldData['valid']['visitors'].values, input_dim_dict)
    # for valid
    FoldData['valid'][strategy] = model.guess(FoldData['valid'][cate_feats].values)
    rmsle_valid = RMSLE(FoldData['valid']['visitors'].values, FoldData['valid'][strategy])
    cv_score += rmsle_valid
    # for holdout
    FoldData['holdout'][strategy] = model.guess(FoldData['holdout'][cate_feats].values)
    rmsle_holdout = RMSLE(FoldData['holdout']['visitors'].values, FoldData['holdout'][strategy])
    holdout_score += rmsle_holdout
    # for test
    FoldData['test'][strategy] = model.guess(FoldData['test'][cate_feats].values)
    y_test_pred += FoldData['test'][strategy]

    print('fold %s: valid score %.6f, holdout score %.6f, valid length %s' % (fold, rmsle_valid, rmsle_holdout, len(FoldData['valid'])))  
    #### output
    FoldOutputDir = '%s/%s' % (OutputDir, fold)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    for mod in FoldData.keys():
        if(mod == 'train'):
            continue
        OutCols = []
        if(mod == 'test'):
            OutCols.append('id')
        OutCols.extend(['air_store_id', 'visit_date', 'visitors', strategy])
        OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
        OutFoldData = FoldData[mod][OutCols]
        OutFoldData.to_csv(OutputFile, index= False)
    end = time.time()
    print('saving for %sth fold data done, time elapsed %.2fs.' % (fold, (end - start)))
    
y_test_pred /= kfold  # Average test set predictions
cv_score /= kfold # Average valid set predictions
holdout_score /= kfold # Average holdout set predictions

# Create submission file
sub = pd.DataFrame()
sub['id'] = test_dfs[0]['id']
sub['visitors'] = np.expm1(y_test_pred)
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l1/submit' % DataBaseDir
if(os.path.exists(SubmitDir) == False):
    os.makedirs(SubmitDir)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.6f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

finish_time = datetime.datetime.now()
elapsed = (finish_time - start_time).seconds
print('\n======================')
print("CV score %.4f, Holdout score %.4f, Elapsed time: %.2fs" % (cv_score, holdout_score, elapsed))
print('======================\n')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


{'dow': 7, 'hol_days': 5, 'day': 31, 'pom': 3, 'wom': 6, 'woy': 53, 'prev_is_holiday': 2, 'next_is_holiday': 2, 'is_weekends': 2, 'holiday_flg': 2, 'month': 12, 'is_up_corner': 2, 'air_store_id_encoded': 829, 'air_area_name': 103, 'air_city': 9, 'air_genre_name': 14, 'hpg_store_id_encoded': 151, 'hpg_area_name': 34, 'hpg_city': 12, 'hpg_genre_name': 17}




Train on 181482 samples, validate on 45371 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.19117816173


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


fold 0: valid score 0.522967, holdout score 0.513400, valid length 45371
saving for 0th fold data done, time elapsed 163.82s.
{'dow': 7, 'hol_days': 5, 'day': 31, 'pom': 3, 'wom': 6, 'woy': 53, 'prev_is_holiday': 2, 'next_is_holiday': 2, 'is_weekends': 2, 'holiday_flg': 2, 'month': 12, 'is_up_corner': 2, 'air_store_id_encoded': 829, 'air_area_name': 103, 'air_city': 9, 'air_genre_name': 14, 'hpg_store_id_encoded': 151, 'hpg_area_name': 34, 'hpg_city': 12, 'hpg_genre_name': 17}
Train on 181482 samples, validate on 45371 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.185138189432
fold 1: valid score 0.514006, holdout score 0.513164, valid length 45371
saving for 1th fold data done, time elapsed 334.68s.
{'dow': 7, 'hol_days': 5, 'day': 31, 'pom': 3, 'wom': 6, 'woy': 53, 'prev_is_holiday': 2, 'next_is_holiday': 2, 'is_weekends': 2, 'holiday_flg': 2, 'month': 12, 'is_up_corner': 2, 'air_st