https://github.com/fastai/fastai/blob/master/courses/dl1/lesson3-rossman.ipynb

https://github.com/entron/entity-embedding-rossmann

### 1、导入数据

In [1]:
#导入序列化数据的库，pickle可以把对象序列化，
#然后保存到磁盘；或把磁盘文件反序列化读入内存.
#pickle是Python独有，更一般的处理库有json
import pickle
import csv

In [2]:
#把csv文件转换为字典
def csv2dicts(csvfile):
    data = []
    keys = []
    for row_index, row in enumerate(csvfile):
        #把第一行标题打印出来
        if row_index == 0:
            keys = row
            print(row)
            continue
        
        data.append({key: value for key, value in zip(keys, row)})
    return data

#如果值为空，则用'0'填充
def set_nan_as_string(data, replace_str='0'):
    for i, x in enumerate(data):
        for key, value in x.items():
            if value == '':
                x[key] = replace_str
        data[i] = x


In [3]:
train_data = r".\data\train.csv"
store_data = r".\data\store.csv"
store_states = r'.\data\store_states.csv'

#把处理后的训练数据写入文件
with open(train_data) as csvfile:
    data = csv.reader(csvfile, delimiter=',')
    with open('train_data.pickle', 'wb') as f:
        data = csv2dicts(data)
        #头尾倒过来
        data = data[::-1]
        #序列化，把数据保存到文件中
        pickle.dump(data, f, -1)
        print(data[:3])

['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']
[{'Store': '1115', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1114', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1113', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}]


In [4]:
data[0]

{'Store': '1115',
 'DayOfWeek': '2',
 'Date': '2013-01-01',
 'Sales': '0',
 'Customers': '0',
 'Open': '0',
 'Promo': '0',
 'StateHoliday': 'a',
 'SchoolHoliday': '1'}

In [5]:
#把处理后的store_data，store_states数据写入文件store_data.pickle
with open(store_data) as csvfile, open(store_states) as csvfile2:
    data = csv.reader(csvfile, delimiter=',')
    state_data = csv.reader(csvfile2, delimiter=',')
    with open('store_data.pickle', 'wb') as f:
        data = csv2dicts(data)
        state_data = csv2dicts(state_data)
        set_nan_as_string(data)
        #把state加到store_data数据集中，然后保存生成的数据        
        for index, val in enumerate(data):
            state = state_data[index]
            val['State'] = state['State']
            data[index] = val
        pickle.dump(data, f, -1)
        print(data[:2])

['Store', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
['Store', 'State']
[{'Store': '1', 'StoreType': 'c', 'Assortment': 'a', 'CompetitionDistance': '1270', 'CompetitionOpenSinceMonth': '9', 'CompetitionOpenSinceYear': '2008', 'Promo2': '0', 'Promo2SinceWeek': '0', 'Promo2SinceYear': '0', 'PromoInterval': '0', 'State': 'HE'}, {'Store': '2', 'StoreType': 'a', 'Assortment': 'a', 'CompetitionDistance': '570', 'CompetitionOpenSinceMonth': '11', 'CompetitionOpenSinceYear': '2007', 'Promo2': '1', 'Promo2SinceWeek': '13', 'Promo2SinceYear': '2010', 'PromoInterval': 'Jan,Apr,Jul,Oct', 'State': 'TH'}]


### 2、数据预处理

In [6]:
import pickle
from datetime import datetime
from sklearn import preprocessing
import numpy as np
import random
random.seed(42)

#### 2.1 加载数据
读取pickle文件数据

In [7]:
with open('train_data.pickle', 'rb') as f:
    train_data = pickle.load(f)
    num_records = len(train_data)
with open('store_data.pickle', 'rb') as f:
    store_data = pickle.load(f)

In [8]:
print(len(train_data),len(store_data))

1017209 1115


In [9]:
train_data[0]

{'Store': '1115',
 'DayOfWeek': '2',
 'Date': '2013-01-01',
 'Sales': '0',
 'Customers': '0',
 'Open': '0',
 'Promo': '0',
 'StateHoliday': 'a',
 'SchoolHoliday': '1'}

In [10]:
store_data[0]

{'Store': '1',
 'StoreType': 'c',
 'Assortment': 'a',
 'CompetitionDistance': '1270',
 'CompetitionOpenSinceMonth': '9',
 'CompetitionOpenSinceYear': '2008',
 'Promo2': '0',
 'Promo2SinceWeek': '0',
 'Promo2SinceYear': '0',
 'PromoInterval': '0',
 'State': 'HE'}

#### 2.2 定义预处理训练数据函数

In [11]:
#对时间特征进行拆分和转换，是否促销promo等特征转换为整数
def feature_list(record):
    dt = datetime.strptime(record['Date'], '%Y-%m-%d')
    store_index = int(record['Store'])
    year = dt.year
    month = dt.month
    day = dt.day
    day_of_week = int(record['DayOfWeek'])
    try:
        store_open = int(record['Open'])
    except:
        store_open = 1

    promo = int(record['Promo'])
    #同时返回state对应的简称
    return [store_open,
            store_index,
            day_of_week,
            promo,
            year,
            month,
            day,
            store_data[store_index - 1]['State']
            ]



#### 2.3 生成训练数据集

In [12]:
train_data_X = []
train_data_y = []

for record in train_data:
    if record['Sales'] != '0' and record['Open'] != '':
        fl = feature_list(record)
        train_data_X.append(fl)
        train_data_y.append(int(record['Sales']))
print("销售记录数: ", len(train_data_y))

print("最小销售量:{}，最大销售量:{}".format(min(train_data_y), max(train_data_y)))


销售记录数:  844338
最小销售量:46，最大销售量:41551


In [13]:
#查看训练数据处理后的前3条记录
train_data_X[:3]

[[1, 1097, 2, 0, 2013, 1, 1, 'RP'],
 [1, 948, 2, 0, 2013, 1, 1, 'BW'],
 [1, 769, 2, 0, 2013, 1, 1, 'NW']]

In [14]:
np.array(train_data_X).shape[1]

8

In [15]:
#查看销量数据的前3条记录
train_data_y[0:3]

[5961, 4491, 5035]

#### 2.4 对特征数值化，并保存结果到文件中

In [16]:
full_X = np.array(train_data_X)
#full_X = np.array(full_X)
train_data_X = np.array(train_data_X)
les = []
#对每列进行处理，把类别转换为数值
for i in range(train_data_X.shape[1]):
    le = preprocessing.LabelEncoder()
    le.fit(full_X[:, i])
    les.append(le)
    train_data_X[:, i] = le.transform(train_data_X[:, i])

#处理后的数据写入pickle文件
with open('les.pickle', 'wb') as f:
    pickle.dump(les, f, -1)

#把训练数据转换为整数
train_data_X = train_data_X.astype(int)
train_data_y = np.array(train_data_y)

#保存数据到feature_train_data.pickle文件
with open('feature_train_data.pickle', 'wb') as f:
    pickle.dump((train_data_X, train_data_y), f, -1)
    print(train_data_X[0], train_data_y[0])


[  0 109   1   0   0   0   0   7] 5961


#### LabelEncoder可以将标签分配一个0—n_classes-1之间的编码

In [17]:
train_data_X[100:110]

array([[ 0, 37,  2,  0,  0,  0, 11,  6],
       [ 0, 36,  2,  0,  0,  0, 11, 10],
       [ 0, 34,  2,  0,  0,  0, 11, 10],
       [ 0, 33,  2,  0,  0,  0, 11,  6],
       [ 0, 32,  2,  0,  0,  0, 11,  2],
       [ 0, 31,  2,  0,  0,  0, 11,  6],
       [ 0, 30,  2,  0,  0,  0, 11,  7],
       [ 0, 29,  2,  0,  0,  0, 11,  8],
       [ 0, 28,  2,  0,  0,  0, 11,  9],
       [ 0, 27,  2,  0,  0,  0, 11,  6]])

In [18]:
set(train_data_X[:,3])

{0, 1}

### 3、构建模型

#### 3.1 导入库
导入的库包括对数据预处理的库，SVM,XGB等算法。
导入构建模型的tensorflow.keras库

In [55]:
import tensorflow as tf
import numpy
numpy.random.seed(123)
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn import neighbors
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
#import sys
#sys.setrecursionlimit(10000)

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model as KerasModel
from tensorflow.keras.layers import Input, Dense, Activation, Reshape,Flatten
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import ModelCheckpoint

#屏蔽警告信息
import warnings
warnings.filterwarnings("ignore")

In [22]:
## 定义一些超参数

In [56]:
train_ratio = 0.9
shuffle_data = False
one_hot_as_input = False
embeddings_as_input = False
save_embeddings = True
saved_embeddings_fname = "embeddings.pickle"  # set save_embeddings to True to create this file

## 3.2 导入训练数据

In [57]:
f = open('feature_train_data.pickle', 'rb')
(X, y) = pickle.load(f)

num_records = len(X)
train_size = int(train_ratio * num_records)

## 3.3 对特征进行预测处理
预处理主要包括选择特征，把特征转换为one-hot编码等

In [59]:
#打乱数据
if shuffle_data:
    print("Using shuffled data")
    sh = numpy.arange(X.shape[0])
    numpy.random.shuffle(sh)
    X = X[sh]
    y = y[sh]


#把数据集转换为oneHot编码
def one_hot(X):
    enc = OneHotEncoder(sparse=False)
    enc.fit(X)
    X1= enc.transform(X)
    return X1



把训练集转换为one-hot编码

In [60]:
one_hot_as_input = True
if one_hot_as_input:
    X1=one_hot(X)
    
#X2=X
#enc = OneHotEncoder(sparse=False)
#X1=enc.fit_transform(X2)
#X2 = enc.transform(X2)

MemoryError: 

In [61]:
X.shape

(844338, 8)

In [54]:
X[0]

array([  0, 109,   1,   0,   0,   0,   0,   7])

把数据集划分为训练集和验证集

In [38]:
X1_train = X1[:train_size]
X1_val = X1[train_size:]
y1_train = y[:train_size]
y1_val = y[train_size:]

NameError: name 'X1' is not defined

## 3.4 定义采样函数

In [27]:
def sample(X, y, n):
    '''random samples'''
    num_row = X.shape[0]
    indices = numpy.random.randint(num_row, size=n)
    return X[indices, :], y[indices]

In [28]:
X_train, y_train = sample(X_train, y_train, 200000)  # Simulate data sparsity
print("Number of samples used for training: " + str(y_train.shape[0]))

Number of samples used for training: 200000


## 3.5定义模型（不使用Embedding层）

In [30]:
class Model(object):

    def evaluate(self, X_val, y_val):
        assert(min(y_val) > 0)
        guessed_sales = self.guess(X_val)
        relative_err = numpy.absolute((y_val - guessed_sales) / y_val)
        result = numpy.sum(relative_err) / len(y_val)
        return result


class NN(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.epochs = 10
        self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        self.max_log_y = max(numpy.max(numpy.log(y_train)), numpy.max(numpy.log(y_val)))
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def __build_keras_model(self):
        self.model = Sequential()
        self.model.add(Dense(1000, kernel_initializer="uniform", input_dim=1183))
        #self.model.add(Dense(1000, kernel_initializer="uniform", input_dim=8))
        self.model.add(Activation('relu'))
        self.model.add(Dense(500, kernel_initializer="uniform"))
        self.model.add(Activation('relu'))
        self.model.add(Dense(1))
        self.model.add(Activation('sigmoid'))

        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def _val_for_fit(self, val):
        val = numpy.log(val) / self.max_log_y
        return val

    def _val_for_pred(self, val):
        return numpy.exp(val * self.max_log_y)

    def fit(self, X_train, y_train, X_val, y_val):
        self.model.fit(X_train, self._val_for_fit(y_train),
                       validation_data=(X_val, self._val_for_fit(y_val)),
                       epochs=self.epochs, batch_size=128,
                       # callbacks=[self.checkpointer],
                       )
        # self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)

## 3.6 训练模型

In [32]:
models = []
print("Fitting NN...")
for i in range(1):
     models.append(NN(X_train, y_train, X_val, y_val))

Fitting NN...


ValueError: Error when checking input: expected dense_input to have shape (1183,) but got array with shape (8,)

#### 3.2 定义特征处理函数

In [19]:
#从训练结果读取各特征的embedding向量，并用这些向量作为输入值
def embed_features(X, saved_embeddings_fname):
    # f_embeddings = open("embeddings_shuffled.pickle", "rb")
    f_embeddings = open(saved_embeddings_fname, "rb")
    embeddings = pickle.load(f_embeddings) 
    
    #因store_open,promo这两列，至多只有两个值，没有进行embedding，故需排除在外
    index_embedding_mapping = {1: 0, 2: 1, 4: 2, 5: 3, 6: 4, 7: 5}
    X_embedded = []

    (num_records, num_features) = X.shape
    for record in X:
        embedded_features = []
        for i, feat in enumerate(record):
            feat = int(feat)
            if i not in index_embedding_mapping.keys():
                embedded_features += [feat]
            else:
                embedding_index = index_embedding_mapping[i]
                embedded_features += embeddings[embedding_index][feat].tolist()

        X_embedded.append(embedded_features)

    return numpy.array(X_embedded)

#分别取出各特征,取出X中前8列数据，除第1列，
def split_features(X):
    X_list = []
    #获取X第2列数据
    store_index = X[..., [1]]
    X_list.append(store_index)
    #获取X第3列数据,以下类推
    day_of_week = X[..., [2]]
    X_list.append(day_of_week)

    promo = X[..., [3]]
    X_list.append(promo)

    year = X[..., [4]]
    X_list.append(year)

    month = X[..., [5]]
    X_list.append(month)

    day = X[..., [6]]
    X_list.append(day)

    State = X[..., [7]]
    X_list.append(State)

    return X_list

#### 3.3 构建传统机器学习模型

In [20]:
class Model(object):

    def evaluate(self, X_val, y_val):
        assert(min(y_val) > 0)
        guessed_sales = self.guess(X_val)
        relative_err = numpy.absolute((y_val - guessed_sales) / y_val)
        result = numpy.sum(relative_err) / len(y_val)
        return result


class LinearModel(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.clf = linear_model.LinearRegression()
        self.clf.fit(X_train, numpy.log(y_train))
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(feature))


class RF(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.clf = RandomForestRegressor(n_estimators=200, verbose=True, max_depth=35, min_samples_split=2,
                                         min_samples_leaf=1)
        self.clf.fit(X_train, numpy.log(y_train))
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(feature))


class SVM(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.X_train = X_train
        self.y_train = y_train
        self.__normalize_data()
        self.clf = SVR(kernel='linear', degree=3, gamma='auto', coef0=0.0, tol=0.001,
                       C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1)

        self.clf.fit(self.X_train, numpy.log(self.y_train))
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def __normalize_data(self):
        self.scaler = StandardScaler()
        self.X_train = self.scaler.fit_transform(self.X_train)

    def guess(self, feature):
        return numpy.exp(self.clf.predict(feature))


class XGBoost(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        dtrain = xgb.DMatrix(X_train, label=numpy.log(y_train))
        evallist = [(dtrain, 'train')]
        param = {'nthread': -1,
                 'max_depth': 7,
                 'eta': 0.02,
                 'silent': 1,
                 'objective': 'reg:linear',
                 'colsample_bytree': 0.7,
                 'subsample': 0.7}
        num_round = 3000
        self.bst = xgb.train(param, dtrain, num_round, evallist)
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, feature):
        dtest = xgb.DMatrix(feature)
        return numpy.exp(self.bst.predict(dtest))


class HistricalMedian(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.history = {}
        self.feature_index = [1, 2, 3, 4]
        for x, y in zip(X_train, y_train):
            key = tuple(x[self.feature_index])
            self.history.setdefault(key, []).append(y)
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        features = numpy.array(features)
        features = features[:, self.feature_index]
        guessed_sales = [numpy.median(self.history[tuple(feature)]) for feature in features]
        return numpy.array(guessed_sales)


class KNN(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)
        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
        self.clf.fit(self.normalizer.transform(X_train), numpy.log(y_train))
        print("Result on validation data: ", self.evaluate(self.normalizer.transform(X_val), y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(self.normalizer.transform(feature)))


#### 3.4 构建神经网络模型

In [35]:
class NN_with_EntityEmbedding(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.epochs = 10
        self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        self.max_log_y = max(numpy.max(numpy.log(y_train)), numpy.max(numpy.log(y_val)))
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        X_list = split_features01(X)
        return X_list

    def __build_keras_model(self):
        input_store = Input(shape=(1,))
        output_store = Embedding(1115, 10, name='store_embedding')(input_store)
        output_store = Reshape(target_shape=(10,))(output_store)

        input_dow = Input(shape=(1,))
        output_dow = Embedding(7, 6, name='dow_embedding')(input_dow)
        output_dow = Reshape(target_shape=(6,))(output_dow)

        input_promo = Input(shape=(1,))
        output_promo = Dense(1)(input_promo)

        input_year = Input(shape=(1,))
        output_year = Embedding(3, 2, name='year_embedding')(input_year)
        output_year = Reshape(target_shape=(2,))(output_year)

        input_month = Input(shape=(1,))
        output_month = Embedding(12, 6, name='month_embedding')(input_month)
        output_month = Reshape(target_shape=(6,))(output_month)

        input_day = Input(shape=(1,))
        output_day = Embedding(31, 10, name='day_embedding')(input_day)
        output_day = Reshape(target_shape=(10,))(output_day)

        input_germanstate = Input(shape=(1,))
        output_germanstate = Embedding(12, 6, name='state_embedding')(input_germanstate)
        output_germanstate = Reshape(target_shape=(6,))(output_germanstate)

        input_model = [input_store, input_dow, input_promo,
                       input_year, input_month, input_day, input_germanstate]

        output_embeddings = [output_store, output_dow, output_promo,
                             output_year, output_month, output_day, output_germanstate]

        output_model = Concatenate()(output_embeddings)
        output_model = Dense(1000, kernel_initializer="uniform")(output_model)
        output_model = Activation('relu')(output_model)
        output_model = Dense(500, kernel_initializer="uniform")(output_model)
        output_model = Activation('relu')(output_model)
        output_model = Dense(1)(output_model)
        output_model = Activation('sigmoid')(output_model)

        self.model = KerasModel(inputs=input_model, outputs=output_model)

        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def _val_for_fit(self, val):
        val = numpy.log(val) / self.max_log_y
        return val

    def _val_for_pred(self, val):
        return numpy.exp(val * self.max_log_y)

    def fit(self, X_train, y_train, X_val, y_val):
        self.model.fit(self.preprocessing(X_train), self._val_for_fit(y_train),
                       validation_data=(self.preprocessing(X_val), self._val_for_fit(y_val)),
                       epochs=self.epochs, batch_size=128,
                       # callbacks=[self.checkpointer],
                       )
        # self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        features = self.preprocessing(features)
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)


class NN(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.epochs = 10
        self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        self.max_log_y = max(numpy.max(numpy.log(y_train)), numpy.max(numpy.log(y_val)))
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def __build_keras_model(self):
        self.model = Sequential()
        self.model.add(Dense(1000, kernel_initializer="uniform", input_dim=1183))
        #self.model.add(Dense(1000, kernel_initializer="uniform", input_dim=8))
        self.model.add(Activation('relu'))
        self.model.add(Dense(500, kernel_initializer="uniform"))
        self.model.add(Activation('relu'))
        self.model.add(Dense(1))
        self.model.add(Activation('sigmoid'))

        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def _val_for_fit(self, val):
        val = numpy.log(val) / self.max_log_y
        return val

    def _val_for_pred(self, val):
        return numpy.exp(val * self.max_log_y)

    def fit(self, X_train, y_train, X_val, y_val):
        self.model.fit(X_train, self._val_for_fit(y_train),
                       validation_data=(X_val, self._val_for_fit(y_val)),
                       epochs=self.epochs, batch_size=128,
                       # callbacks=[self.checkpointer],
                       )
        # self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)

### 4 训练模型

#### 4.1 导入需要的库

In [22]:
import pickle
import numpy
numpy.random.seed(123)
from models import *
from sklearn.preprocessing import OneHotEncoder
import sys
sys.setrecursionlimit(10000)

train_ratio = 0.9
shuffle_data = False
one_hot_as_input = False
embeddings_as_input = False
save_embeddings = True
saved_embeddings_fname = "embeddings.pickle"  # set save_embeddings to True to create this file

#### 4.2 导入训练数据

In [23]:
f = open('feature_train_data.pickle', 'rb')
(X, y) = pickle.load(f)

num_records = len(X)
train_size = int(train_ratio * num_records)

In [24]:
num_records

844338

#### 4.3 划分训练与验证数据集

In [25]:
##### 把X转换为one-hot编码

In [26]:
#enc = OneHotEncoder(sparse=False)
#enc.fit(X)
#X= enc.transform(X)

In [27]:
X.shape

(844338, 8)

#### 4.3.1 打乱数据

In [28]:
#打乱数据
shuffle_data=True
if shuffle_data:
    print("Using shuffled data")
    sh = numpy.arange(X.shape[0])
    numpy.random.shuffle(sh)
    X = X[sh]
    y = y[sh]

Using shuffled data


In [32]:
X_train = X[:train_size]
X_val = X[train_size:]
y_train = y[:train_size]
y_val = y[train_size:]

In [33]:
X_train[0]

array([  0, 468,   4,   0,   1,   6,  17,   5])

In [34]:
X_val[0]

array([  0, 154,   2,   0,   0,   0,  11,  11])

#### 4.4 定义特征预处理函数

In [27]:
if shuffle_data:
    print("Using shuffled data")
    sh = numpy.arange(X.shape[0])
    numpy.random.shuffle(sh)
    X = X[sh]
    y = y[sh]

if embeddings_as_input:
    print("Using learned embeddings as input")
    X = embed_features(X, saved_embeddings_fname)

#对数据集X的各列进行oneHot编码
if one_hot_as_input:
    print("Using one-hot encoding as input")
    enc = OneHotEncoder(sparse=False)
    enc.fit(X)
    X = enc.transform(X)

#### 4.5 定义取样函数

In [41]:
def sample(X, y, n):
    '''random samples'''
    num_row = X.shape[0]
    indices = numpy.random.randint(num_row, size=n)
    return X[indices, :], y[indices]

In [36]:
X_train, y_train = sample(X_train, y_train, 200000)  # Simulate data sparsity
print("Number of samples used for training: " + str(y_train.shape[0]))


Number of samples used for training: 200000


### 4.5.1 训练xgb模型（不进行one-hot转换，也没有进行Embedding，数据没有shuffle）

In [37]:
models = []
print("Fitting XGBoost...")
models.append(XGBoost(X_train, y_train, X_val, y_val))

Fitting XGBoost...
[0]	train-rmse:8.10386
[1]	train-rmse:7.94217
[2]	train-rmse:7.78374
[3]	train-rmse:7.62846
[4]	train-rmse:7.47632
[5]	train-rmse:7.32718
[6]	train-rmse:7.18105
[7]	train-rmse:7.03787
[8]	train-rmse:6.89751
[9]	train-rmse:6.76001
[10]	train-rmse:6.62523
[11]	train-rmse:6.49317
[12]	train-rmse:6.3638
[13]	train-rmse:6.23696
[14]	train-rmse:6.11276
[15]	train-rmse:5.99098
[16]	train-rmse:5.87168
[17]	train-rmse:5.75472
[18]	train-rmse:5.64012
[19]	train-rmse:5.52783
[20]	train-rmse:5.41777
[21]	train-rmse:5.30995
[22]	train-rmse:5.20427
[23]	train-rmse:5.10075
[24]	train-rmse:4.9993
[25]	train-rmse:4.89989
[26]	train-rmse:4.80247
[27]	train-rmse:4.70702
[28]	train-rmse:4.6135
[29]	train-rmse:4.52187
[30]	train-rmse:4.43206
[31]	train-rmse:4.34404
[32]	train-rmse:4.25782
[33]	train-rmse:4.17331
[34]	train-rmse:4.09051
[35]	train-rmse:4.00938
[36]	train-rmse:3.92986
[37]	train-rmse:3.85198
[38]	train-rmse:3.77565
[39]	train-rmse:3.70087
[40]	train-rmse:3.62761
[41]	train

[325]	train-rmse:0.308843
[326]	train-rmse:0.308431
[327]	train-rmse:0.308193
[328]	train-rmse:0.308084
[329]	train-rmse:0.308029
[330]	train-rmse:0.307976
[331]	train-rmse:0.307908
[332]	train-rmse:0.307828
[333]	train-rmse:0.30772
[334]	train-rmse:0.307474
[335]	train-rmse:0.307428
[336]	train-rmse:0.306994
[337]	train-rmse:0.306771
[338]	train-rmse:0.306664
[339]	train-rmse:0.306381
[340]	train-rmse:0.306153
[341]	train-rmse:0.306043
[342]	train-rmse:0.305954
[343]	train-rmse:0.305915
[344]	train-rmse:0.305844
[345]	train-rmse:0.305794
[346]	train-rmse:0.305716
[347]	train-rmse:0.305642
[348]	train-rmse:0.30529
[349]	train-rmse:0.304906
[350]	train-rmse:0.304841
[351]	train-rmse:0.304748
[352]	train-rmse:0.304243
[353]	train-rmse:0.303882
[354]	train-rmse:0.303598
[355]	train-rmse:0.30357
[356]	train-rmse:0.303358
[357]	train-rmse:0.303137
[358]	train-rmse:0.303099
[359]	train-rmse:0.303067
[360]	train-rmse:0.303002
[361]	train-rmse:0.302675
[362]	train-rmse:0.302623
[363]	train-rms

[642]	train-rmse:0.262747
[643]	train-rmse:0.262599
[644]	train-rmse:0.262569
[645]	train-rmse:0.262559
[646]	train-rmse:0.262411
[647]	train-rmse:0.262358
[648]	train-rmse:0.262346
[649]	train-rmse:0.26219
[650]	train-rmse:0.26205
[651]	train-rmse:0.262031
[652]	train-rmse:0.262023
[653]	train-rmse:0.261977
[654]	train-rmse:0.261938
[655]	train-rmse:0.261886
[656]	train-rmse:0.261734
[657]	train-rmse:0.261514
[658]	train-rmse:0.261297
[659]	train-rmse:0.261052
[660]	train-rmse:0.260844
[661]	train-rmse:0.260691
[662]	train-rmse:0.260668
[663]	train-rmse:0.260518
[664]	train-rmse:0.26036
[665]	train-rmse:0.260222
[666]	train-rmse:0.260208
[667]	train-rmse:0.260188
[668]	train-rmse:0.260076
[669]	train-rmse:0.260064
[670]	train-rmse:0.259926
[671]	train-rmse:0.259883
[672]	train-rmse:0.259849
[673]	train-rmse:0.259831
[674]	train-rmse:0.259816
[675]	train-rmse:0.259674
[676]	train-rmse:0.25965
[677]	train-rmse:0.259639
[678]	train-rmse:0.259624
[679]	train-rmse:0.259607
[680]	train-rmse

[960]	train-rmse:0.231562
[961]	train-rmse:0.231469
[962]	train-rmse:0.231323
[963]	train-rmse:0.231309
[964]	train-rmse:0.231297
[965]	train-rmse:0.23129
[966]	train-rmse:0.231278
[967]	train-rmse:0.231186
[968]	train-rmse:0.231111
[969]	train-rmse:0.231066
[970]	train-rmse:0.230987
[971]	train-rmse:0.230972
[972]	train-rmse:0.230857
[973]	train-rmse:0.230699
[974]	train-rmse:0.230601
[975]	train-rmse:0.230409
[976]	train-rmse:0.230394
[977]	train-rmse:0.23038
[978]	train-rmse:0.230366
[979]	train-rmse:0.230357
[980]	train-rmse:0.230232
[981]	train-rmse:0.230124
[982]	train-rmse:0.230095
[983]	train-rmse:0.229861
[984]	train-rmse:0.229837
[985]	train-rmse:0.229684
[986]	train-rmse:0.22966
[987]	train-rmse:0.229652
[988]	train-rmse:0.229511
[989]	train-rmse:0.229492
[990]	train-rmse:0.229479
[991]	train-rmse:0.229266
[992]	train-rmse:0.229245
[993]	train-rmse:0.229226
[994]	train-rmse:0.229193
[995]	train-rmse:0.2291
[996]	train-rmse:0.22898
[997]	train-rmse:0.228857
[998]	train-rmse:0

[1267]	train-rmse:0.210812
[1268]	train-rmse:0.210799
[1269]	train-rmse:0.210795
[1270]	train-rmse:0.210724
[1271]	train-rmse:0.210713
[1272]	train-rmse:0.210707
[1273]	train-rmse:0.210597
[1274]	train-rmse:0.210525
[1275]	train-rmse:0.21052
[1276]	train-rmse:0.210457
[1277]	train-rmse:0.21034
[1278]	train-rmse:0.21022
[1279]	train-rmse:0.210166
[1280]	train-rmse:0.210118
[1281]	train-rmse:0.209967
[1282]	train-rmse:0.209947
[1283]	train-rmse:0.209927
[1284]	train-rmse:0.20987
[1285]	train-rmse:0.209853
[1286]	train-rmse:0.209847
[1287]	train-rmse:0.209713
[1288]	train-rmse:0.209699
[1289]	train-rmse:0.209528
[1290]	train-rmse:0.209435
[1291]	train-rmse:0.209427
[1292]	train-rmse:0.209356
[1293]	train-rmse:0.209349
[1294]	train-rmse:0.209237
[1295]	train-rmse:0.209229
[1296]	train-rmse:0.209224
[1297]	train-rmse:0.209188
[1298]	train-rmse:0.209093
[1299]	train-rmse:0.209043
[1300]	train-rmse:0.209036
[1301]	train-rmse:0.209018
[1302]	train-rmse:0.209007
[1303]	train-rmse:0.20893
[1304]

[1573]	train-rmse:0.193545
[1574]	train-rmse:0.193537
[1575]	train-rmse:0.193501
[1576]	train-rmse:0.19349
[1577]	train-rmse:0.193479
[1578]	train-rmse:0.193419
[1579]	train-rmse:0.193365
[1580]	train-rmse:0.193308
[1581]	train-rmse:0.193296
[1582]	train-rmse:0.1932
[1583]	train-rmse:0.193196
[1584]	train-rmse:0.193188
[1585]	train-rmse:0.19318
[1586]	train-rmse:0.193175
[1587]	train-rmse:0.193056
[1588]	train-rmse:0.193047
[1589]	train-rmse:0.192964
[1590]	train-rmse:0.192871
[1591]	train-rmse:0.192858
[1592]	train-rmse:0.192853
[1593]	train-rmse:0.19277
[1594]	train-rmse:0.192733
[1595]	train-rmse:0.192687
[1596]	train-rmse:0.19259
[1597]	train-rmse:0.192501
[1598]	train-rmse:0.192357
[1599]	train-rmse:0.192268
[1600]	train-rmse:0.192242
[1601]	train-rmse:0.19223
[1602]	train-rmse:0.19209
[1603]	train-rmse:0.191992
[1604]	train-rmse:0.191987
[1605]	train-rmse:0.191981
[1606]	train-rmse:0.191965
[1607]	train-rmse:0.191884
[1608]	train-rmse:0.191875
[1609]	train-rmse:0.191871
[1610]	tr

[1878]	train-rmse:0.180607
[1879]	train-rmse:0.180603
[1880]	train-rmse:0.180565
[1881]	train-rmse:0.180514
[1882]	train-rmse:0.180507
[1883]	train-rmse:0.180503
[1884]	train-rmse:0.180462
[1885]	train-rmse:0.180398
[1886]	train-rmse:0.180269
[1887]	train-rmse:0.180215
[1888]	train-rmse:0.1802
[1889]	train-rmse:0.180185
[1890]	train-rmse:0.180118
[1891]	train-rmse:0.180048
[1892]	train-rmse:0.180044
[1893]	train-rmse:0.179981
[1894]	train-rmse:0.17994
[1895]	train-rmse:0.179934
[1896]	train-rmse:0.179928
[1897]	train-rmse:0.179924
[1898]	train-rmse:0.179921
[1899]	train-rmse:0.179915
[1900]	train-rmse:0.179873
[1901]	train-rmse:0.179867
[1902]	train-rmse:0.179812
[1903]	train-rmse:0.179762
[1904]	train-rmse:0.179752
[1905]	train-rmse:0.179731
[1906]	train-rmse:0.179726
[1907]	train-rmse:0.179687
[1908]	train-rmse:0.179682
[1909]	train-rmse:0.17965
[1910]	train-rmse:0.179588
[1911]	train-rmse:0.179545
[1912]	train-rmse:0.17948
[1913]	train-rmse:0.179403
[1914]	train-rmse:0.179357
[1915]

[2183]	train-rmse:0.168672
[2184]	train-rmse:0.168609
[2185]	train-rmse:0.168606
[2186]	train-rmse:0.168601
[2187]	train-rmse:0.168569
[2188]	train-rmse:0.168567
[2189]	train-rmse:0.168556
[2190]	train-rmse:0.168553
[2191]	train-rmse:0.168548
[2192]	train-rmse:0.16847
[2193]	train-rmse:0.168463
[2194]	train-rmse:0.168385
[2195]	train-rmse:0.168358
[2196]	train-rmse:0.168298
[2197]	train-rmse:0.168293
[2198]	train-rmse:0.168177
[2199]	train-rmse:0.16805
[2200]	train-rmse:0.167999
[2201]	train-rmse:0.167936
[2202]	train-rmse:0.167929
[2203]	train-rmse:0.167896
[2204]	train-rmse:0.167858
[2205]	train-rmse:0.167802
[2206]	train-rmse:0.167683
[2207]	train-rmse:0.167678
[2208]	train-rmse:0.167625
[2209]	train-rmse:0.167593
[2210]	train-rmse:0.167539
[2211]	train-rmse:0.167538
[2212]	train-rmse:0.167509
[2213]	train-rmse:0.167493
[2214]	train-rmse:0.167443
[2215]	train-rmse:0.167368
[2216]	train-rmse:0.167363
[2217]	train-rmse:0.167284
[2218]	train-rmse:0.167218
[2219]	train-rmse:0.167183
[22

[2488]	train-rmse:0.159953
[2489]	train-rmse:0.159947
[2490]	train-rmse:0.15992
[2491]	train-rmse:0.159918
[2492]	train-rmse:0.15991
[2493]	train-rmse:0.159895
[2494]	train-rmse:0.159835
[2495]	train-rmse:0.15983
[2496]	train-rmse:0.159808
[2497]	train-rmse:0.159807
[2498]	train-rmse:0.159798
[2499]	train-rmse:0.159775
[2500]	train-rmse:0.159743
[2501]	train-rmse:0.159742
[2502]	train-rmse:0.159664
[2503]	train-rmse:0.159639
[2504]	train-rmse:0.159637
[2505]	train-rmse:0.159635
[2506]	train-rmse:0.159627
[2507]	train-rmse:0.159589
[2508]	train-rmse:0.159562
[2509]	train-rmse:0.159507
[2510]	train-rmse:0.159382
[2511]	train-rmse:0.15936
[2512]	train-rmse:0.159353
[2513]	train-rmse:0.159315
[2514]	train-rmse:0.159253
[2515]	train-rmse:0.159226
[2516]	train-rmse:0.159223
[2517]	train-rmse:0.15922
[2518]	train-rmse:0.159183
[2519]	train-rmse:0.159126
[2520]	train-rmse:0.1591
[2521]	train-rmse:0.159099
[2522]	train-rmse:0.159065
[2523]	train-rmse:0.159062
[2524]	train-rmse:0.159012
[2525]	t

[2794]	train-rmse:0.152274
[2795]	train-rmse:0.152254
[2796]	train-rmse:0.15225
[2797]	train-rmse:0.152222
[2798]	train-rmse:0.15216
[2799]	train-rmse:0.152154
[2800]	train-rmse:0.152114
[2801]	train-rmse:0.15206
[2802]	train-rmse:0.152058
[2803]	train-rmse:0.152057
[2804]	train-rmse:0.152
[2805]	train-rmse:0.151944
[2806]	train-rmse:0.151819
[2807]	train-rmse:0.151792
[2808]	train-rmse:0.151773
[2809]	train-rmse:0.151744
[2810]	train-rmse:0.151741
[2811]	train-rmse:0.151689
[2812]	train-rmse:0.151686
[2813]	train-rmse:0.151647
[2814]	train-rmse:0.151644
[2815]	train-rmse:0.15162
[2816]	train-rmse:0.151597
[2817]	train-rmse:0.151569
[2818]	train-rmse:0.151552
[2819]	train-rmse:0.151523
[2820]	train-rmse:0.151508
[2821]	train-rmse:0.151502
[2822]	train-rmse:0.151495
[2823]	train-rmse:0.151494
[2824]	train-rmse:0.151484
[2825]	train-rmse:0.15148
[2826]	train-rmse:0.151464
[2827]	train-rmse:0.151408
[2828]	train-rmse:0.151407
[2829]	train-rmse:0.151329
[2830]	train-rmse:0.151325
[2831]	tr

#### 评估模型

In [38]:
def evaluate_models(models, X, y):
    assert(min(y) > 0)
    guessed_sales = numpy.array([model.guess(X) for model in models])
    mean_sales = guessed_sales.mean(axis=0)
    relative_err = numpy.absolute((y - mean_sales) / y)
    result = numpy.sum(relative_err) / len(y)
    return result


print("Evaluate combined models...")
print("Training error...")
r_train = evaluate_models(models, X_train, y_train)
print(r_train)

print("Validation error...")
r_val = evaluate_models(models, X_val, y_val)
print(r_val)

Evaluate combined models...
Training error...
0.11259707148948464
Validation error...
0.11917678406883771


### 4.5.2 训练神经网络（不带Embedding层）

### 4.5.3 转换为one-hot

In [80]:
#打乱数据
shuffle_data=True
if shuffle_data:
    print("Using shuffled data")
    sh = numpy.arange(X.shape[0])
    numpy.random.shuffle(sh)
    X = X[sh]
    y = y[sh]


#把数据集转换为oneHot编码
def one_hot(X):
    enc = OneHotEncoder(sparse=False)
    enc.fit(X)
    X1= enc.transform(X)
    return X1

Using shuffled data


In [107]:
a=np.array([[1,2,3],[4,5,6]])
a1=a[...,[0]]

In [108]:
a1

array([[1],
       [4]])

In [109]:
one_hot(a1)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1., 0.],
       [0., 1.]])

### 4.5.4 数据划分

In [42]:
X1_train = X1[:train_size]
X1_val = X1[train_size:]
y_train = y[:train_size]
y_val = y[train_size:]

In [43]:
X1_train.shape

(759904, 1183)

### 4.5.5 取样数据

In [44]:
X2_train, y_train = sample(X1_train, y_train, 200000)  # Simulate data sparsity
print("Number of samples used for training: " + str(y_train.shape[0]))


Number of samples used for training: 200000


### 4.5.6 训练模型

In [46]:
models = []
print("Fitting NN...")
for i in range(1):
     models.append(NN(X2_train, y_train, X1_val, y_val))

Fitting NN...
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.08003406537139751


### 4.5.7 评估模型

In [47]:
print("Evaluate combined models...")
print("Training error...")
r_train = evaluate_models(models, X2_train, y_train)
print(r_train)

print("Validation error...")
r_val = evaluate_models(models, X1_val, y_val)
print(r_val)

Evaluate combined models...
Training error...
0.042231767600769236
Validation error...
0.08003406537139751


### 4.6 训练带Embedding层的神经网络模型

In [32]:
#打乱数据
shuffle_data=True
if shuffle_data:
    print("Using shuffled data")
    sh = numpy.arange(X.shape[0])
    numpy.random.shuffle(sh)
    X = X[sh]
    y = y[sh]


#把数据集中每列转换为oneHot编码
def one_hot(X):
    enc = OneHotEncoder(sparse=False)
    enc.fit(X)
    X1= enc.transform(X)
    return X1

Using shuffled data


In [36]:
#划分数据
X1_train = X[:train_size]
X1_val = X[train_size:]
y_train = y[:train_size]
y_val = y[train_size:]

In [37]:
###查看数据
#X1_train[..., [1]][:6]
dd=X1_train[0]
#sorted(dd, reverse=True)

In [39]:
X1_train.shape

(759904, 8)

In [42]:
#数据抽样
X1_train, y1_train = sample(X1_train, y_train, 200000)  # Simulate data sparsity
print("Number of samples used for training: " + str(y_train.shape[0]))


Number of samples used for training: 759904


In [75]:
a=np.array([[1,2,3,4,5,6,7,8],[9,10,11,12,13,14,15,16]])
b=split_features(a)

In [67]:
a=np.array([[1,2,3],[2,3,4]])
a1=np.array([[1,2,3,4]])

In [68]:
a1.shape

(1, 4)

In [103]:
def one_hot1(X):
    enc = OneHotEncoder()
    enc.fit(X)
    X1= enc.transform(X).toarray()
    return X1

In [100]:
X_list=[]
store_index = X1_train[..., [1]]
store_index=one_hot1(store_index)
X_list.append(store_index)
day_of_week = X1_train[..., [2]]
day_of_week=one_hot1(day_of_week)
X_list.append(day_of_week)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [107]:
X_list[0].shape

(200000, 1115)

In [58]:
b

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=[[2], [3]], sparse=True)

#### 4.6.1 从新定义split_features函数，添加one_hot的功能

In [85]:
def split_features01(X):
    X_list = []

    store_index = X[..., [1]]
    store_index=one_hot1(store_index)
    X_list.append(store_index)
    
    day_of_week = X[..., [2]]
    day_of_week=one_hot1(day_of_week)
    X_list.append(day_of_week)

    promo = X[..., [3]]
    promo=one_hot1(promo)
    X_list.append(promo)

    year = X[..., [4]]
    year=one_hot1(year)
    X_list.append(year)

    month = X[..., [5]]
    month=one_hot1(month)
    X_list.append(month)

    day = X[..., [6]]
    day=one_hot1(day)
    X_list.append(day)

    State = X[..., [7]]
    State=one_hot1(State)
    X_list.append(State)

    return X_list



In [112]:
class NN_with_EntityEmbedding01(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.epochs = 10
        self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        self.max_log_y = max(numpy.max(numpy.log(y_train)), numpy.max(numpy.log(y_val)))
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def preprocessing(self, X):
        X_list = split_features01(X)
        return X_list

    def __build_keras_model(self):
        input_store = Input(shape=(1115,))
        output_store = Embedding(1115, 10, name='store_embedding')(input_store)
        output_store = Reshape(target_shape=(10,))(output_store)

        input_dow = Input(shape=(7,))
        output_dow = Embedding(7, 6, name='dow_embedding')(input_dow)
        output_dow = Reshape(target_shape=(6,))(output_dow)

        input_promo = Input(shape=(1,))
        output_promo = Dense(1)(input_promo)

        input_year = Input(shape=(3,))
        output_year = Embedding(3, 2, name='year_embedding')(input_year)
        output_year = Reshape(target_shape=(2,))(output_year)

        input_month = Input(shape=(12,))
        output_month = Embedding(12, 6, name='month_embedding')(input_month)
        output_month = Reshape(target_shape=(6,))(output_month)

        input_day = Input(shape=(31,))
        output_day = Embedding(31, 10, name='day_embedding')(input_day)
        output_day = Reshape(target_shape=(10,))(output_day)

        input_germanstate = Input(shape=(12,))
        output_germanstate = Embedding(12, 6, name='state_embedding')(input_germanstate)
        output_germanstate = Reshape(target_shape=(6,))(output_germanstate)

        input_model = [input_store, input_dow, input_promo,
                       input_year, input_month, input_day, input_germanstate]

        output_embeddings = [output_store, output_dow, output_promo,
                             output_year, output_month, output_day, output_germanstate]

        output_model = Concatenate()(output_embeddings)
        output_model = Flatten()(output_model)
        output_model = Dense(1000, kernel_initializer="uniform")(output_model)
        output_model = Activation('relu')(output_model)
        output_model = Dense(500, kernel_initializer="uniform")(output_model)
        output_model = Activation('relu')(output_model)
        output_model = Dense(1)(output_model)
        output_model = Activation('sigmoid')(output_model)

        self.model = KerasModel(inputs=input_model, outputs=output_model)

        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def _val_for_fit(self, val):
        val = numpy.log(val) / self.max_log_y
        return val

    def _val_for_pred(self, val):
        return numpy.exp(val * self.max_log_y)

    def fit(self, X_train, y_train, X_val, y_val):
        self.model.fit(self.preprocessing(X_train), self._val_for_fit(y_train),
                       validation_data=(self.preprocessing(X_val), self._val_for_fit(y_val)),
                       epochs=self.epochs, batch_size=128,
                       # callbacks=[self.checkpointer],
                       )
        # self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        features = self.preprocessing(features)
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)

In [113]:
models = []

print("Fitting NN_with_EntityEmbedding...")
for i in range(2):
    models.append(NN_with_EntityEmbedding01(X1_train, y1_train, X1_val, y_val))


Fitting NN_with_EntityEmbedding...


ValueError: total size of new array must be unchanged

### 5. 保存并评估模型

#### 5.1 保存模型

In [31]:
if save_embeddings:
    model = models[0].model
    store_embedding = model.get_layer('store_embedding').get_weights()[0]
    dow_embedding = model.get_layer('dow_embedding').get_weights()[0]
    year_embedding = model.get_layer('year_embedding').get_weights()[0]
    month_embedding = model.get_layer('month_embedding').get_weights()[0]
    day_embedding = model.get_layer('day_embedding').get_weights()[0]
    german_states_embedding = model.get_layer('state_embedding').get_weights()[0]
    with open(saved_embeddings_fname, 'wb') as f:
        pickle.dump([store_embedding, dow_embedding, year_embedding,
                     month_embedding, day_embedding, german_states_embedding], f, -1)


#### 5.2 评估模型

In [32]:
def evaluate_models(models, X, y):
    assert(min(y) > 0)
    guessed_sales = numpy.array([model.guess(X) for model in models])
    mean_sales = guessed_sales.mean(axis=0)
    relative_err = numpy.absolute((y - mean_sales) / y)
    result = numpy.sum(relative_err) / len(y)
    return result


print("Evaluate combined models...")
print("Training error...")
r_train = evaluate_models(models, X_train, y_train)
print(r_train)

print("Validation error...")
r_val = evaluate_models(models, X_val, y_val)
print(r_val)

Evaluate combined models...
Training error...
0.36276543002692446
Validation error...
0.3290402526694366


#### 5.3 训练不带embedding层的神经网络

In [36]:
print("Fitting NN...")
for i in range(5):
     models.append(NN(X_train, y_train, X_val, y_val))


Fitting NN...
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  5.825855990372018
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  5.825855990372018
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  5.825855990372018
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  5.825855990372018
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoc

In [37]:
print("Evaluate combined models...")
print("Training error...")
r_train = evaluate_models(models, X_train, y_train)
print(r_train)

print("Validation error...")
r_val = evaluate_models(models, X_val, y_val)
print(r_val)

Evaluate combined models...
Training error...
3.112884150840002
Validation error...
2.9053194526974613


In [40]:
class NN1(Model):

    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.epochs = 10
        self.checkpointer = ModelCheckpoint(filepath="best_model_weights.hdf5", verbose=1, save_best_only=True)
        self.max_log_y = max(numpy.max(numpy.log(y_train)), numpy.max(numpy.log(y_val)))
        self.__build_keras_model()
        self.fit(X_train, y_train, X_val, y_val)

    def __build_keras_model(self):
        self.model = Sequential()
        #self.model.add(Dense(1000, kernel_initializer="uniform", input_dim=1183))
        self.model.add(Dense(100, input_dim=8))
        self.model.add(Activation('relu'))
        self.model.add(Dense(50))
        self.model.add(Activation('relu'))
        self.model.add(Dense(1))
        self.model.add(Activation('sigmoid'))

        self.model.compile(loss='mean_absolute_error', optimizer='adam')

    def _val_for_fit(self, val):
        val = numpy.log(val) / self.max_log_y
        return val

    def _val_for_pred(self, val):
        return numpy.exp(val * self.max_log_y)

    def fit(self, X_train, y_train, X_val, y_val):
        self.model.fit(X_train, self._val_for_fit(y_train),
                       validation_data=(X_val, self._val_for_fit(y_val)),
                       epochs=self.epochs, batch_size=128,
                       # callbacks=[self.checkpointer],
                       )
        # self.model.load_weights('best_model_weights.hdf5')
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        result = self.model.predict(features).flatten()
        return self._val_for_pred(result)

In [41]:
print("Fitting NN...")
for i in range(5):
     models.append(NN1(X_train, y_train, X_val, y_val))

Fitting NN...
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  5.825855549448704
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  5.819964447145853
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  5.825855015565381
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  5.825802734868867
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoc

In [42]:
print("Evaluate combined models...")
print("Training error...")
r_train = evaluate_models(models, X_train, y_train)
print(r_train)

print("Validation error...")
r_val = evaluate_models(models, X_val, y_val)
print(r_val)

Evaluate combined models...
Training error...
4.133171781147755
Validation error...
3.865164903940068


In [58]:
f = open('feature_train_data.pickle', 'rb')
(X, y) = pickle.load(f)

num_records = len(X)
train_size = int(train_ratio * num_records)

In [59]:
X2=X
enc = OneHotEncoder(sparse=False)
enc.fit(X2)
X2 = enc.transform(X2)


X1_train = X2[:train_size]
X1_val = X2[train_size:]
y_train = y[:train_size]
y_val = y[train_size:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [65]:
y_train.shape

(759904,)

In [60]:
X1_train, y1_train = sample(X1_train, y_train, 200000)  # Simulate data sparsity
print("Number of samples used for training: " + str(y_train.shape[0]))


Number of samples used for training: 759904


In [62]:
len(X1_val[0])

1183

In [52]:
V1=X_val
enc = OneHotEncoder(sparse=False)
enc.fit(V1)
V1 = enc.transform(V1)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [None]:
print("Fitting NN...")
for i in range(5):
     models.append(NN(X1_train, y1_train, X1_val, y_val))

Fitting NN...
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.10832588508976115
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.10354307748809172
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Result on validation data:  0.10932309884352688
Train on 200000 samples, validate on 84434 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

In [55]:
len(X1[0])

1183

In [56]:
len(V1[0])

1172