In [2]:
import os
import sys
import time
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf

In [3]:
from sklearn.datasets import fetch_california_housing

In [4]:
housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [5]:
import pprint
pprint.pprint(housing.data[0:3])
pprint.pprint(housing.target[0:3])

array([[ 8.32520000e+00,  4.10000000e+01,  6.98412698e+00,
         1.02380952e+00,  3.22000000e+02,  2.55555556e+00,
         3.78800000e+01, -1.22230000e+02],
       [ 8.30140000e+00,  2.10000000e+01,  6.23813708e+00,
         9.71880492e-01,  2.40100000e+03,  2.10984183e+00,
         3.78600000e+01, -1.22220000e+02],
       [ 7.25740000e+00,  5.20000000e+01,  8.28813559e+00,
         1.07344633e+00,  4.96000000e+02,  2.80225989e+00,
         3.78500000e+01, -1.22240000e+02]])
array([4.526, 3.585, 3.521])


### 数据集划分

In [6]:
from sklearn.model_selection import train_test_split
#默认训练集：测试集 =3 ：1 
x_train_all,x_test,y_train_all,y_test = train_test_split(housing.data,housing.target,random_state = 7,test_size = 0.25)
x_train,x_valid,y_train,y_valid = train_test_split(x_train_all,y_train_all,random_state = 11,test_size = 0.25)
print(x_train.shape,y_train.shape)
print(x_valid.shape,y_valid.shape)
print(x_test.shape,y_test.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


### 数据归一化处理

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled  = scaler.transform(x_test)

In [17]:
#metric (性能指标) 使用
#metric 会自动记录之前的数据，使用metric.reset_states()可以重置metric
metric = tf.keras.metrics.MeanSquaredError()
print(metric([5.],[2.]))
print(metric([0.],[1.]))
print(metric.result())

tf.Tensor(9.0, shape=(), dtype=float32)
tf.Tensor(5.0, shape=(), dtype=float32)
tf.Tensor(5.0, shape=(), dtype=float32)


In [19]:
#自定义训练求导
#1. epoch开始，batch 遍历训练集 计算 metric
#   1.1 自动求导
#2. epoch结束 验证集 计算metric
epochs = 100
batch_size = 32
steps_per_epoch = len(x_train_scaled) // batch_size
optimizer = tf.keras.optimizers.SGD()
metric = tf.keras.metrics.MeanSquaredError()
#随机选择batch_size 的训练集合
def random_batch(x,y,batch_size):
    idx = np.random.randint(0,len(x),size = batch_size)
    return x[idx],y[idx]

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(30,activation = "relu",input_shape = x_train_scaled.shape[1:]),
    tf.keras.layers.Dense(1)
])

for epoch in range(epochs):
    metric.reset_states()
    for step in range(steps_per_epoch):
        x_batch,y_batch = random_batch(x_train_scaled,y_train,batch_size)
        with tf.GradientTape() as tape: 
            #预测值
            y_pred = model(x_batch)
            #损失(平方误差)
            loss = tf.reduce_mean(tf.keras.losses.mean_squared_error(y_batch,y_pred))
            #均方误差
            metric(y_batch,y_pred)
        #loss 关于参数求导
        grads = tape.gradient(loss,model.variables)
        grads_and_vars = zip(grads,model.variables)
        #参数更新
        optimizer.apply_gradients(grads_and_vars)
        print('\rEpoch',epoch,'train mse:',metric.result().numpy(),end="")
    #验证集 性能指标计算
    y_valid_pred = model(x_valid_scaled)
    valid_loss = tf.reduce_mean(tf.keras.losses.mean_squared_error(y_valid,y_valid_pred))
    print("\t","valid mse:",valid_loss.numpy())



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 0 train mse: 1.4347446	 valid mse: 1.46970435 0 train mse: 2.4985466 0 train mse: 1.9455135 0 train mse: 1.9156942 0 train mse: 1.6934938 0 train mse: 1.6105996 0 train mse: 1.5843027 0 train mse: 1.5828782 0 train mse: 1.5356575 0 train mse: 1.5089465 0 train mse: 1.4898674 0 train mse: 1.4305785 0 train mse: 1.4351405
Epoch 1 train mse: 1.3049934	 valid mse: 1.477582746 1 train mse: 1.2848732 1 train mse: 1.286467 1 train mse: 1.2788749 1 train mse: 1.2834331 train mse: 1.2888778 1 train mse: 1.3013705
Epoch 2 train mse: 1.2514185 2 train mse: 1.3562986 2 train mse: 1.2385421 2 train mse: 1.2410055	 valid mse: 1.4185176
Epoch 3 train mse: 1.9037963	 valid mse: 1.48489495 3 train mse

KeyboardInterrupt: 

In [1]:
# history = model.fit(x_train_scaled,y_train,validation_data=(x_valid_scaled,y_valid),epochs=100,callbacks=callbacks)

NameError: name 'model' is not defined

In [None]:
# def plot_learning_curves(history):
#     pd.DataFrame(history.history).plot(figsize=(8,5))
#     plt.grid(True)
#     plt.gca().set_ylim(0,1)
#     plt.show()
# plot_learning_curves(history)

In [None]:
# model.evaluate(x_test_scaled,y_test)