In [1]:
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import initializers as init

In [10]:
train_data = pd.read_csv('house_data/train.csv')
test_data = pd.read_csv('house_data/test.csv')

In [11]:
train_data.shape # 1460个样本，80个特征和1个标签

(1460, 81)

In [12]:
test_data.shape # 输出(1459，80)

(1459, 80)

In [13]:
# 前4个样本的前4个特征、后2个特征和标签
train_data.iloc[0:4,[0,1,2,3,-3,-2-1]]


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,SaleType,SaleType.1
0,1,60,RL,65.0,WD,WD
1,2,20,RL,80.0,WD,WD
2,3,60,RL,68.0,WD,WD
3,4,70,RL,60.0,WD,WD


In [17]:
# 将所有的训练数据和测试数据的79个特征按样本连结
all_features = pd.concat((train_data.iloc[:, 1: -1], test_data.iloc[:, 1:]))
all_features

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


## 3.16.3 预处理数据 ##

In [19]:
# 将特征的每个值先减去均值u再除以标准差得到标准化后的特征值
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std())
)
# 标准化后，每个特征的均值变为0，所以可以直接用0来代替缺失值
all_features = all_features.fillna(0)

In [22]:
# dummy_na = True将缺失值也当作合法的特征值并为其创建特征指标
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.shape # (2919, 354)
# 特征数从79增加到354

(2919, 354)

In [25]:
# 通过values属性得到numpy格式的数据，并转化成NDArray方便后面的训练
n_train = train_data.shape[0]
train_features = np.array(all_features[:n_train].values, dtype=np.float)
test_features = np.array(all_features[n_train:].values, dtype=np.float)
train_labels = np.array(train_data.SalePrice.values.reshape(-1, 1), dtype=np.float)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """


## 3.16.4 训练模型 ##

In [27]:
# 使用基本线性回归来训练模型
def get_net():
    net = keras.models.Sequential()
    net.add(keras.layers.Dense(1))
    return net

In [28]:
# 对数的均方差误差来评价模型
log_rmse = tf.keras.losses.mean_squared_logarithmic_error

## 3.16.5 K折交叉验证 ##

In [36]:
# 以下函数返回第i折交叉验证时所需要的训练和验证数据
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part  = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = tf.concat([X_train, X_part], axis=0)
    return X_train, y_train, X_valid, y_valid


In [38]:
# 在K折交叉验证中我们训练K次，并返回训练和验证的平均误差
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_1_sum, valid_1_su = 0, 0
    for i in range(k):
        # create model
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        # Compile model
        net.compile(loss=tf.keras.losses.mean_squared_logarithmic_error,
                    optimizer=tf.keras.optimizers.Adam(learning_rate))
        # Fit the model
        history=net.fit(data[0], data[1], validation_data=(data[2], data[3]), epochs=num_epochs, batch_size=batch_size, validation_freq=1,verbose=0)
        loss = history.history['loss']
        val_loss = history.history['loss']
        print('fold %d, train rmse %f valid rmse %f'
              % (i, loss[-1], val_loss[-1]))
    plt.subplot(1, 2, 2)
    plt.plot(loss, label='train')
    plt.plot(val_loss, label='valid')
    plt.legend(loc='upper right')
    plt.title("Training and Validation Loss")
    plt.show()

## 3.16.6 模型选择 ##

In [39]:
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
k_fold(k, train_features, train_labels, num_epochs, lr,weight_decay, batch_size)

ValueError: Do not pass inputs that mix Numpy arrays and TensorFlow tensors. You passed: x=tf.Tensor(
[[-1.67876685e-01 -4.36638724e-01  1.57333142e-01 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 6.73198800e-02  1.86279879e-16  8.22985789e-01 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-8.73466378e-01  5.01784504e-01 -7.20317411e-02 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 3.02516445e-01 -1.55111755e-01 -1.42781108e-01 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-8.73466378e-01 -6.12694327e-02 -5.71971965e-02 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-8.73466378e-01  2.67178697e-01 -2.93031808e-02 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]], shape=(1168, 354), dtype=float64); y=[[131000.]
 [235000.]
 [167000.]
 [142500.]
 [152000.]
 [239000.]
 [175000.]
 [158500.]
 [157000.]
 [267000.]
 [205000.]
 [149900.]
 [295000.]
 [305900.]
 [225000.]
 [ 89500.]
 [ 82500.]
 [360000.]
 [165600.]
 [132000.]
 [119900.]
 [375000.]
 [178000.]
 [188500.]
 [260000.]
 [270000.]
 [260000.]
 [187500.]
 [342643.]
 [354000.]
 [301000.]
 [126175.]
 [242000.]
 [ 87000.]
 [324000.]
 [145250.]
 [214500.]
 [ 78000.]
 [119000.]
 [139000.]
 [284000.]
 [207000.]
 [192000.]
 [228950.]
 [377426.]
 [214000.]
 [202500.]
 [155000.]
 [202900.]
 [ 82000.]
 [ 87500.]
 [266000.]
 [ 85000.]
 [140200.]
 [151500.]
 [157500.]
 [154000.]
 [437154.]
 [318061.]
 [190000.]
 [ 95000.]
 [105900.]
 [140000.]
 [177500.]
 [173000.]
 [134000.]
 [130000.]
 [280000.]
 [156000.]
 [145000.]
 [198500.]
 [118000.]
 [190000.]
 [147000.]
 [159000.]
 [165000.]
 [132000.]
 [162000.]
 [172400.]
 [134432.]
 [125000.]
 [123000.]
 [219500.]
 [ 61000.]
 [148000.]
 [340000.]
 [394432.]
 [179000.]
 [127000.]
 [187750.]
 [213500.]
 [ 76000.]
 [240000.]
 [192000.]
 [ 81000.]
 [125000.]
 [191000.]
 [426000.]
 [119000.]
 [215000.]
 [106500.]
 [100000.]
 [109000.]
 [129000.]
 [123000.]
 [169500.]
 [ 67000.]
 [241000.]
 [245500.]
 [164990.]
 [108000.]
 [258000.]
 [168000.]
 [150000.]
 [115000.]
 [177000.]
 [280000.]
 [339750.]
 [ 60000.]
 [145000.]
 [222000.]
 [115000.]
 [228000.]
 [181134.]
 [149500.]
 [239000.]
 [126000.]
 [142000.]
 [206300.]
 [215000.]
 [113000.]
 [315000.]
 [139000.]
 [135000.]
 [275000.]
 [109008.]
 [195400.]
 [175000.]
 [ 85400.]
 [ 79900.]
 [122500.]
 [181000.]
 [ 81000.]
 [212000.]
 [116000.]
 [119000.]
 [ 90350.]
 [110000.]
 [555000.]
 [118000.]
 [162900.]
 [172500.]
 [210000.]
 [127500.]
 [190000.]
 [199900.]
 [119500.]
 [120000.]
 [110000.]
 [280000.]
 [204000.]
 [210000.]
 [188000.]
 [175500.]
 [ 98000.]
 [256000.]
 [161000.]
 [110000.]
 [263435.]
 [155000.]
 [ 62383.]
 [188700.]
 [124000.]
 [178740.]
 [167000.]
 [146500.]
 [250000.]
 [187000.]
 [212000.]
 [190000.]
 [148000.]
 [440000.]
 [251000.]
 [132500.]
 [208900.]
 [380000.]
 [297000.]
 [ 89471.]
 [326000.]
 [374000.]
 [155000.]
 [164000.]
 [132500.]
 [147000.]
 [156000.]
 [175000.]
 [160000.]
 [ 86000.]
 [115000.]
 [133000.]
 [172785.]
 [155000.]
 [ 91300.]
 [ 34900.]
 [430000.]
 [184000.]
 [130000.]
 [120000.]
 [113000.]
 [226700.]
 [140000.]
 [289000.]
 [147000.]
 [124500.]
 [215000.]
 [208300.]
 [161000.]
 [124500.]
 [164900.]
 [202665.]
 [129900.]
 [134000.]
 [ 96500.]
 [402861.]
 [158000.]
 [265000.]
 [211000.]
 [234000.]
 [106250.]
 [150000.]
 [159000.]
 [184750.]
 [315750.]
 [176000.]
 [132000.]
 [446261.]
 [ 86000.]
 [200624.]
 [175000.]
 [128000.]
 [107500.]
 [ 39300.]
 [178000.]
 [107500.]
 [188000.]
 [111250.]
 [158000.]
 [272000.]
 [315000.]
 [248000.]
 [213250.]
 [133000.]
 [179665.]
 [229000.]
 [210000.]
 [129500.]
 [125000.]
 [263000.]
 [140000.]
 [112500.]
 [255500.]
 [108000.]
 [284000.]
 [113000.]
 [141000.]
 [108000.]
 [175000.]
 [234000.]
 [121500.]
 [170000.]
 [108000.]
 [185000.]
 [268000.]
 [128000.]
 [325000.]
 [214000.]
 [316600.]
 [135960.]
 [142600.]
 [120000.]
 [224500.]
 [170000.]
 [139000.]
 [118500.]
 [145000.]
 [164500.]
 [146000.]
 [131500.]
 [181900.]
 [253293.]
 [118500.]
 [325000.]]

## 3.16.7 预测并在Kaggle提交结果 ##

In [40]:
x_train = tf.convert_to_tensor(train_features, dtype=tf.float32)
y_train = tf.convert_to_tensor(train_labels, dtype=tf.float32)
x_test = tf.convert_to_tensor(test_features, dtype=tf.float32)
x_test = tf.convert_to_tensor(test_features, dtype=tf.float32)
model=tf.keras.models.Sequential([
    tf.keras.layers.Dense(1)
])
adam = tf.keras.optimizers.Adam(0.5)
model.compile(optimizer=adam,
              loss=tf.keras.losses.mean_squared_logarithmic_error)
model.fit(x_train, y_train, epochs=200, batch_size=32, verbose=0)
preds=np.array(model.predict(x_test))
test_data["SalePrice"] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data["Id"], test_data["SalePrice"]], axis=1)
submission.to_csv('submission.csv', index=False)