In [20]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Reshape, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import Merge, Flatten
from keras.callbacks import ModelCheckpoint
from keras.initializers import random_normal
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

from sklearn import preprocessing
import datetime,sys,os,time
import pandas as pd
import numpy as np
import sys,os
from scipy.special import erfinv
from tensorflow import set_random_seed
from sklearn.metrics import log_loss
set_random_seed(2018)

kfold = 5
DataBaseDir = '../../data'
InputDir = '%s/l0' % DataBaseDir
OutputDir = '%s/l1' % DataBaseDir
strategy = 'meta_nn'
target = 'is_trade'

finish_time = datetime.datetime.now()
## loading data
valid_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/kfold/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir)
    if(fold == 0):
        TestData = pd.read_csv('%s/test.csv' % FoldInputDir)
    valid['fold'] = fold
    valid_dfs.append(valid)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True)
print('\nLoading data done.')
cate_feats = [c for c in TrainData.columns if(c.startswith('c_'))]
num_feats = [c for c in TrainData.columns if(c.startswith('n_'))]
print('\nCategory features: ')
print(cate_feats)
print('\nNumeric features: ')
print(num_feats)

print('\n Unique length for entire train data set: ')
for c in cate_feats:
    print(c, len(TrainData[c].unique()))


Loading data done.

Category features: 
['c_item_category_1', 'c_item_category_2', 'c_item_property_0', 'c_item_property_1', 'c_item_property_2', 'c_predict_category_0', 'c_predict_category_1', 'c_predict_category_2', 'c_predict_category_0_hit_0', 'c_predict_category_1_hit_0', 'c_predict_category_2_hit_0', 'c_item_id', 'c_user_id', 'c_shop_id', 'c_item_brand_id', 'c_item_city_id', 'c_item_category_list', 'c_item_price_level', 'c_item_sales_level', 'c_item_collected_level', 'c_item_pv_level', 'c_user_gender_id', 'c_user_age_level', 'c_user_occupation_id', 'c_user_star_level', 'c_shop_review_num_level', 'c_shop_star_level', 'c_context_page_id']

Numeric features: 
['n_hour', 'n_count_item_property_0_item_id', 'n_count_item_property_1_item_id', 'n_count_item_property_2_item_id', 'n_count_item_brand_id_item_id', 'n_count_item_city_id_item_id', 'n_count_item_price_level_item_id', 'n_count_item_sales_level_item_id', 'n_count_item_collected_level_item_id', 'n_count_item_pv_level_item_id', 'n

In [21]:
## Embedding Model Framework
class NN_with_EntityEmbedding:
    def __init__(self, X_train, y_train, X_val, y_val, epoch, batch_size, dim_dict):
        super().__init__()
        self.nb_epoch = epoch
        self.batch_size = batch_size
        self.input_dim_dict = dim_dict
        self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        X_list = []
        ## for categorical feats
        for idx in range(len(cate_feats)):
            feat_vec = X[:, [idx]]
            X_list.append(feat_vec)
        ## for numeric feats
        for idx in range(len(num_feats)):
            feat_vec = X[:, [len(cate_feats) + idx]] 
            X_list.append(feat_vec)
        return X_list

    def __build_keras_model(self):
        models = []

        # model_item_category_1, 13
        model_item_category_1 = Sequential()
        model_item_category_1.add(Embedding(self.input_dim_dict['c_item_category_1'], 6, input_length=1))
        model_item_category_1.add(Flatten())
        models.append(model_item_category_1)
        
        # c_item_category_2, 3
        model_item_category_2 = Sequential()
        model_item_category_2.add(Embedding(self.input_dim_dict['c_item_category_2'], 2, input_length=1))
        model_item_category_2.add(Flatten())
        models.append(model_item_category_2)
        
        #c_item_property_0 217
        model_item_property_0 = Sequential()
        model_item_property_0.add(Embedding(self.input_dim_dict['c_item_property_0'], 10, input_length=1))
        model_item_property_0.add(Flatten())
        models.append(model_item_property_0)
        
        # c_item_property_1 125
        model_item_property_1 = Sequential()
        model_item_property_1.add(Embedding(self.input_dim_dict['c_item_property_1'], 10, input_length=1))
        model_item_property_1.add(Flatten())
        models.append(model_item_property_1)
        
        # c_item_property_2 195
        model_item_property_2 = Sequential()
        model_item_property_2.add(Embedding(self.input_dim_dict['c_item_property_2'], 10, input_length=1))
        model_item_property_2.add(Flatten())
        models.append(model_item_property_2)
        
        # c_predict_category_0 383
        model_predict_category_0 = Sequential()
        model_predict_category_0.add(Embedding(self.input_dim_dict['c_predict_category_0'], 10, input_length=1))
        model_predict_category_0.add(Flatten())
        models.append(model_predict_category_0)

        # c_predict_category_1 397
        model_predict_category_1 = Sequential()
        model_predict_category_1.add(Embedding(self.input_dim_dict['c_predict_category_1'], 10, input_length=1))
        model_predict_category_1.add(Flatten())
        models.append(model_predict_category_1)

        # c_predict_category_2 459
        model_predict_category_2 = Sequential()
        model_predict_category_2.add(Embedding(self.input_dim_dict['c_predict_category_2'], 10, input_length=1))
        model_predict_category_2.add(Flatten())
        models.append(model_predict_category_2)
        
        # c_predict_category_0_hit_0 2
        model_predict_category_0_hit_0 = Sequential()
        model_predict_category_0_hit_0.add(Dense(1, input_dim=1))
        models.append(model_predict_category_0_hit_0)
        
        # c_predict_category_1_hit_0 2 
        model_predict_category_1_hit_0 = Sequential()
        model_predict_category_1_hit_0.add(Dense(1, input_dim=1))
        models.append(model_predict_category_1_hit_0)
        
        #c_predict_category_2_hit_0 2
        model_predict_category_2_hit_0 = Sequential()
        model_predict_category_2_hit_0.add(Dense(1, input_dim=1))
        models.append(model_predict_category_2_hit_0)

        # c_item_id 10075
        model_item_id = Sequential()
        model_item_id.add(Embedding(self.input_dim_dict['c_item_id'], 10, input_length=1))
        model_item_id.add(Flatten())
        models.append(model_item_id)
        
        # c_user_id 197694
        model_user_id = Sequential()
        model_user_id.add(Embedding(self.input_dim_dict['c_user_id'], 10, input_length=1))
        model_user_id.add(Flatten())
        models.append(model_user_id)
        
        # c_shop_id 3959
        model_shop_id = Sequential()
        model_shop_id.add(Embedding(self.input_dim_dict['c_shop_id'], 10, input_length=1))
        model_shop_id.add(Flatten())
        models.append(model_shop_id)

        #c_item_brand_id 2055
        model_item_brand_id = Sequential()
        model_item_brand_id.add(Embedding(self.input_dim_dict['c_item_brand_id'], 10, input_length=1))
        model_item_brand_id.add(Flatten())
        models.append(model_item_brand_id)
        
        #c_item_city_id 128
        model_item_city_id = Sequential()
        model_item_city_id.add(Embedding(self.input_dim_dict['c_item_city_id'], 6, input_length=1))
        model_item_city_id.add(Flatten())
        models.append(model_item_city_id)
        
        # c_item_category 14
        model_item_category_list = Sequential()
        model_item_category_list.add(Embedding(self.input_dim_dict['c_item_category_list'], 6, input_length=1))
        model_item_category_list.add(Flatten())
        models.append(model_item_category_list)
        
        #c_item_price_level 14
        model_item_price_level = Sequential()
        model_item_price_level.add(Embedding(self.input_dim_dict['c_item_price_level'], 6, input_length=1))
        model_item_price_level.add(Flatten())
        models.append(model_item_price_level)
        
        # c_item_sales_level 18
        model_item_sales_level = Sequential()
        model_item_sales_level.add(Embedding(self.input_dim_dict['c_item_sales_level'], 6, input_length=1))
        model_item_sales_level.add(Flatten())
        models.append(model_item_sales_level)
        
        # c_item_collected_level 18
        model_item_collected_level = Sequential()
        model_item_collected_level.add(Embedding(self.input_dim_dict['c_item_collected_level'], 6, input_length=1))
        model_item_collected_level.add(Flatten())
        models.append(model_item_collected_level)
        
        # c_item_pv_level 22
        model_item_pv_level = Sequential()
        model_item_pv_level.add(Embedding(self.input_dim_dict['c_item_pv_level'], 6, input_length=1))
        model_item_pv_level.add(Flatten())
        models.append(model_item_pv_level)
        
        # c_user_gender_id 4
        model_user_gender_id = Sequential()
        model_user_gender_id.add(Embedding(self.input_dim_dict['c_user_gender_id'], 2, input_length=1))
        model_user_gender_id.add(Flatten())
        models.append(model_user_gender_id)
        
        #c_user_age_level 9
        model_user_age_level = Sequential()
        model_user_age_level.add(Embedding(self.input_dim_dict['c_user_age_level'], 4, input_length=1))
        model_user_age_level.add(Flatten())
        models.append(model_user_age_level)
        
        # c_user_occupation_id 5
        model_user_occupation_id = Sequential()
        model_user_occupation_id.add(Embedding(self.input_dim_dict['c_user_occupation_id'], 4, input_length=1))
        model_user_occupation_id.add(Flatten())
        models.append(model_user_occupation_id)

        # c_user_star_level 12
        model_user_star_level = Sequential()
        model_user_star_level.add(Embedding(self.input_dim_dict['c_user_star_level'], 4, input_length=1))
        model_user_star_level.add(Flatten())
        models.append(model_user_star_level)

        # c_shop_review_num_level 25
        model_shop_review_num_level = Sequential()
        model_shop_review_num_level.add(Embedding(self.input_dim_dict['c_shop_review_num_level'], 6, input_length=1))
        model_shop_review_num_level.add(Flatten())
        models.append(model_shop_review_num_level)
        
        # c_shop_star_level 22
        model_shop_star_level = Sequential()
        model_shop_star_level.add(Embedding(self.input_dim_dict['c_shop_star_level'], 6, input_length=1))
        model_shop_star_level.add(Flatten())
        models.append(model_shop_star_level)
        
        # c_context_page_id 20
        model_context_page_id = Sequential()
        model_context_page_id.add(Embedding(self.input_dim_dict['c_context_page_id'], 6, input_length=1))
        model_context_page_id.add(Flatten())
        models.append(model_context_page_id)

        for i in range(len(num_feats)):
            num_model = Sequential()
            num_model.add(Dense(1, input_dim= 1))
            models.append(num_model)
        
        self.model = Sequential()
        self.model.add(Merge(models, mode='concat'))
        
        self.model.add(Dense(256,
                             kernel_initializer= random_normal(mean=0.0,stddev=0.05, seed=None), 
                             name= 'hidden1')
                      )
        self.model.add(PReLU())
        self.model.add(BatchNormalization())
        self.model.add(Dropout(0.25))
        
        self.model.add(Dense(64, 
                             kernel_initializer= random_normal(mean=0.0,stddev=0.05, seed=None), 
                             name= 'hidden2'))
        self.model.add(PReLU())
        self.model.add(BatchNormalization())
        self.model.add(Dropout(0.10))
        ## 
        self.model.add(Dense(1))
        self.model.add(Activation('sigmoid'))
        
        self.model.compile(loss='binary_crossentropy', optimizer= Adam(lr=1e-3), metrics=['accuracy'])

    def fit(self, X_train, y_train, X_val, y_val):
        self.model.fit(self.preprocessing(X_train), y_train, 
                       validation_data=(self.preprocessing(X_val), y_val), 
                       nb_epoch= self.nb_epoch, batch_size= self.batch_size)
        # self.model.load_weights('best_model_weights.hdf5')
        # print("Result on validation data: ", log_loss(self.guess(X_val), y_val))

    def predict(self, X):
        return self.model.predict(self.preprocessing(X), batch_size= 1024)[:,0]

In [25]:
## CV
batch_size = 256
epoch = 10
start = time.time()
drop_columns = ['fold']
drop_columns.append(target)
# score
cv_score = .0
# predict
y_test_pred = 0
for fold in range(kfold):
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'test': TestData
    }
    ## pre-processing

    # normalization for numeric features
    FoldData['train']['source'] = 'train'
    FoldData['valid']['source'] = 'valid'
    FoldData['test']['source'] = 'test'
    all_data = pd.concat([FoldData['train'], FoldData['valid'], FoldData['test']], ignore_index= True)
    for c in num_feats:
        rank = np.argsort(all_data[c], axis= 0)
        upper = np.max(rank)
        lower = np.min(rank)
        # linear normalization to 0-1
        all_data[c] = (all_data[c] - lower)/(upper - lower)
        # gauss normalization
        all_data[c] = erfinv(all_data[c])
        all_data[c] -= np.mean(all_data[c])
    FoldData['train'] = all_data[all_data['source'] == 'train'].drop(['source'], axis= 1)
    FoldData['valid'] = all_data[all_data['source'] == 'valid'].drop(['source'], axis= 1)
    FoldData['test'] = all_data[all_data['source'] == 'test'].drop(['source'], axis= 1)
    print('\nNormalization(GaussRank) done.')
    # encoding for category features
    for c in cate_feats:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(all_data[c])
        FoldData['train'][c] = lbl.transform(FoldData['train'][c])
        FoldData['valid'][c] = lbl.transform(FoldData['valid'][c])
        FoldData['test'][c] = lbl.transform(FoldData['test'][c])
    print('\nEncoding(LabelEncode) done.')
    print('\n Pre-processing done.')
    
    # train
    input_dim_dict = {}
    for c in cate_feats:
        input_dim_dict[c] = len(all_data[c].unique())
    print(input_dim_dict)
    feats = cate_feats.copy()
    feats.extend(num_feats)
    model = NN_with_EntityEmbedding(FoldData['train'][feats].values, 
                                    FoldData['train'][target].values,
                                    FoldData['valid'][feats].values, 
                                    FoldData['valid'][target].values, epoch, batch_size, input_dim_dict)
    print('\n Training done.')
    
    # for valid
    FoldData['valid'][strategy] = model.predict(FoldData['valid'][feats].values)[:,0]
    logloss_valid = log_loss(FoldData['valid'][target].values, FoldData['valid'][strategy])
    cv_score += logloss_valid
    # for test
    FoldData['test'][strategy] = model.predict(FoldData['test'][feats].values)[:,0]
    y_test_pred += FoldData['test'][strategy]

    print('\nEvaluation done. Fold %s: valid score %.8f, valid length %s' % (fold, logloss_valid, len(FoldData['valid'])))  
    #### output
    FoldOutputDir = '%s/%s' % (OutputDir, fold)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    for mod in FoldData.keys():
        if(mod == 'train'):
            continue
        OutCols = []
        if(mod == 'test'):
            OutCols.append('instance_id')
        OutCols.extend([strategy, target])
        OutputFile = '%s/%s_%s.csv' % (FoldOutputDir, mod, strategy)
        OutFoldData = FoldData[mod][OutCols]
        OutFoldData.to_csv(OutputFile, float_format='%.8f', index= False)
    print('Saving for %sth fold data done, time elapsed %.2fs.' % (fold, (end - start)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Normalization(GaussRank) done.

Encoding(LabelEncode) done.

 Pre-processing done.
{'c_item_category_1': 13, 'c_item_category_2': 3, 'c_item_property_0': 218, 'c_item_property_1': 125, 'c_item_property_2': 196, 'c_predict_category_0': 285, 'c_predict_category_1': 398, 'c_predict_category_2': 460, 'c_predict_category_0_hit_0': 2, 'c_predict_category_1_hit_0': 2, 'c_predict_category_2_hit_0': 2, 'c_item_id': 10076, 'c_user_id': 197695, 'c_shop_id': 3960, 'c_item_brand_id': 2056, 'c_item_city_id': 128, 'c_item_category_list': 14, 'c_item_price_level': 14, 'c_item_sales_level': 18, 'c_item_collected_level': 18, 'c_item_pv_level': 22, 'c_user_gender_id': 4, 'c_user_age_level': 9, 'c_user_occupation_id': 5, 'c_user_star_level': 12, 'c_shop_review_num_level': 25, 'c_shop_star_level': 22, 'c_context_page_id': 20}




Train on 382509 samples, validate on 95629 samples
Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [None]:
y_test_pred /= kfold  # Average test set predictions
cv_score /= kfold # Average valid set predictions

# Create submission file
sub = pd.DataFrame()
sub['instance_id'] = TestData['instance_id']
sub['predicted_score'] = np.expm1(y_test_pred)
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l1/submit' % DataBaseDir
if(os.path.exists(SubmitDir) == False):
    os.makedirs(SubmitDir)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.8f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

finish_time = datetime.datetime.now()
elapsed = (finish_time - start_time).seconds
print('\n======================')
print("CV score %.6f, Elapsed time: %.2fs" % (cv_score, elapsed))
print('======================\n')