In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__,module.__version__)

tf.test.is_gpu_available()

2.0.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.17.4
pandas 0.25.3
sklearn 0.22
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


False

In [3]:
# 在打开网址时出现和ssl证书相关的问题时，需要加上这句话
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# 采用房屋预测模型数据
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [5]:
from sklearn.model_selection import train_test_split

x_train_all,x_test,y_train_all,y_test = train_test_split(
    housing.data,housing.target,random_state = 7)
x_train,x_valid,y_train,y_valid = train_test_split(
    x_train_all,y_train_all,random_state = 11)
print(x_train.shape,y_train.shape)
print(x_valid.shape,y_valid.shape)
print(x_test.shape,y_test.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [8]:
# 进行数据归一化处理
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.fit_transform(x_valid)
x_test_scaled = scaler.fit_transform(x_test)

# 首先必须了解metric的使用

In [13]:
# metric的使用
metric = keras.metrics.MeanSquaredError() # 均方差
print(metric([5.],[2.])) # 接受两个列表参数，求对应的元素差的平方，然后最后在求均值：tf.Tensor(9.0, shape=(), dtype=float32)
# metric默认数据会累加
print(metric([0.],[1.])) # [(5-2)^2 + (0-1)^2]/2 = 5
print(metric.result()) # 这是输出最终的结果

# 如果不想metric累加前面的结果需要调用reset_states()
metric.reset_states()
metric([1.],[3.])
print(metric.result())

tf.Tensor(9.0, shape=(), dtype=float32)
tf.Tensor(5.0, shape=(), dtype=float32)
tf.Tensor(5.0, shape=(), dtype=float32)
tf.Tensor(4.0, shape=(), dtype=float32)


In [19]:
test_idx = np.random.randint(0,100,size=32) # 在0-100之间随机取32个数
print(test_idx)
# print(x_train_scaled[test_idx]) # 会把每个对应索引的1维数据取出来组成一个新的

[29 26 33  2 75 37 95 88 57 73 13 27 17 40  4 10 21 73 45 61 88 18 19  0
 80 20 87 56  3 73 98 15]


In [20]:
# 先定义模型
model = keras.models.Sequential(
    [
        keras.layers.Dense(30,activation='relu',input_shape=x_train_scaled.shape[1:]),
        keras.layers.Dense(1),
    ])

# 要实现自定义的模型训练需要做以下几步操作：
# 1.按照batch 来去遍历训练集 metric
#   1.1 自动求导
# 2.epoch结束，验证集 metric

epochs = 100 # 需要迭代多少个epochs
batch_size = 32 # 每个batch的size
steps_per_epoch = len(x_train_scaled) // batch_size # 每迭代一个epoch需要进行多少次梯度下降迭代
optimizer = keras.optimizers.SGD() # 用于梯度下降的Optimizer
metric = keras.metrics.MeanSquaredError() # 用于计算累计loss的平均值

# 定义构建batch的方法
def random_batch(x,y,batch_size=32):
    idxs = np.random.randint(0,len(x),size=batch_size)
    return x[idxs],y[idxs]

# 下面我们来实现类似pytorch的自定义实现模型训练
for epoch in range(epochs):
    metric.reset_states()
    for step in range(steps_per_epoch):
        # step1: 数据初始化
        # 其实这里取batch数据的逻辑时有问题，的因为数据没有去重，且可能不能覆盖到所有数据
        x_batch,y_batch = random_batch(x_train_scaled,y_train,batch_size)
        
        # 计算loss+梯度下降
        with tf.GradientTape() as tape:
            y_pred = model(x_batch) # 得到预测数据
            
            # step2：计算loss
            loss = tf.reduce_mean(
                keras.losses.mean_squared_error(y_batch,y_pred))
            metric(y_batch,y_pred) # 累加计算均方差的平均值
        # step3：计算梯度
        grads = tape.gradient(loss,model.variables)
        grads_and_vars = zip(grads,model.variables)
        # step4：梯度下降，更新权重
        optimizer.apply_gradients(grads_and_vars)
        print("\rEpoch:",epoch,"; Train Mse:",metric.result().numpy(),end=" ")
    # 进行验证集的loss计算
    y_valid_pred = model(x_valid_scaled)
    valid_loss = tf.reduce_mean(keras.losses.mean_squared_error(y_valid,y_valid_pred))
    print("\tValid Mse:%s" % valid_loss.numpy())



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch: 0 ; Train Mse: 1.4885765 	Valid Mse:1.9255974
Epoch: 1 ; Train Mse: 1.6178323  ; Train Mse: 2.3839338 	Valid Mse:1.4302462
Epoch: 2 ; Train Mse: 1.2895281 	Valid Mse:1.3988471
Epoch: 3 ; Train Mse: 1.2692423 	Valid Mse:1.3938874
Epoch: 4 ; Train Mse: 1.2559029 	Valid Mse:1.3932227
Epoch: 5 ; Train Mse: 1.2676643 	Valid Mse:1.3906212
Epoch: 6 ; Train Mse: 1.2537959 	Valid Mse:1.3915281
Epoch: 7 ; Train Mse: 1.2563562 	Valid Mse:1.3900213
Epoch: 8 ; Train Mse: 1.242881 	Valid Mse:1.4016278
Epoch: 9 ; Train Mse: 1.2635403 	Valid Mse:1.3926986
Epoch: 10 ; Train Mse: 1.2719797 	Valid Mse:1.3896056
Epoch: 11 ; Train Mse: 1.2459768 	Valid Mse:1.3883191
Epoch: 12 ; Train Mse: 1.2481139 	Vali