In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__,module.__version__)

tf.test.is_gpu_available()

2.0.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.17.4
pandas 0.25.3
sklearn 0.22
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


False

In [2]:
# 在打开网址时出现和ssl证书相关的问题时，需要加上这句话
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# 采用房屋预测模型数据
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [3]:
from sklearn.model_selection import train_test_split

x_train_all,x_test,y_train_all,y_test = train_test_split(
    housing.data,housing.target,random_state = 7)
x_train,x_valid,y_train,y_valid = train_test_split(
    x_train_all,y_train_all,random_state = 11)
print(x_train.shape,y_train.shape)
print(x_valid.shape,y_valid.shape)
print(x_test.shape,y_test.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [4]:
# 进行数据归一化处理
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.fit_transform(x_valid)
x_test_scaled = scaler.fit_transform(x_test)

### 生成csv文件

In [5]:
output_dir = "generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def save_to_csv(output_dir,
                data, # dataset
                name_prefix, # test,train,valid要分别生成对应的csv文件，用name_prefix前缀来做个区分
                header=None, # 
                n_parts=10 # n_parts切分成10个文件进行存储
               ):
    # 用path_format来去生成文件名 : 第一{}填name_prefix，第二个{:02d}要填写一个两位的整数用于表示是第几个part
    path_format = os.path.join(output_dir,"{}_{:02d}.csv")
    filenames = [] # 把文件名都返回回去
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)),n_parts)):
        part_csv = path_format.format(name_prefix,file_idx)
        filenames.append(part_csv)
        with open(part_csv,'wt',encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n') # 把header信息写进去
            for row_index in row_indices: # 遍历行索引，取出每一行数据，然后写入文件中
                    f.write(",".join(
                        repr(col) for col in data[row_index])) # repr(obj)函数将对象转化为供解释器读取的形式。
                    f.write("\n")
    return filenames

# 把x和y的数据进行merge
train_data = np.c_[x_train_scaled,y_train]
valid_data = np.c_[x_valid_scaled,y_valid]
test_data = np.c_[x_test_scaled,y_test]

# header里面都有什么可以从housing的数据集去获取:housing.feature_names
header_cols = housing.feature_names + ["MidianHouseValue"] 
header_str = ','.join(header_cols)

# 存储文件
train_filenames = save_to_csv(output_dir,train_data,"train",
                              header_str, n_parts = 20)
valid_filenames = save_to_csv(output_dir,valid_data,"valid",
                              header_str,n_parts = 10)
test_filenames = save_to_csv(output_dir,test_data,'test',
                             header_str,n_parts = 10)


In [6]:
print(housing.feature_names + ["MidianHouseValue"] )

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MidianHouseValue']


In [7]:
for idx , item  in enumerate(np.split(np.arange(len(train_data)),10)):
    print(idx)

0
1
2
3
4
5
6
7
8
9


# tf.io.decode_csv使用

In [9]:
import pprint
print("训练数据集文件名称：")
pprint.pprint(train_filenames)
print("验证数据集文件名称：")
pprint.pprint(valid_filenames)
print("测试数据集文件名称：")
pprint.pprint(test_filenames)

训练数据集文件名称：
['generate_csv/train_00.csv',
 'generate_csv/train_01.csv',
 'generate_csv/train_02.csv',
 'generate_csv/train_03.csv',
 'generate_csv/train_04.csv',
 'generate_csv/train_05.csv',
 'generate_csv/train_06.csv',
 'generate_csv/train_07.csv',
 'generate_csv/train_08.csv',
 'generate_csv/train_09.csv',
 'generate_csv/train_10.csv',
 'generate_csv/train_11.csv',
 'generate_csv/train_12.csv',
 'generate_csv/train_13.csv',
 'generate_csv/train_14.csv',
 'generate_csv/train_15.csv',
 'generate_csv/train_16.csv',
 'generate_csv/train_17.csv',
 'generate_csv/train_18.csv',
 'generate_csv/train_19.csv']
验证数据集文件名称：
['generate_csv/valid_00.csv',
 'generate_csv/valid_01.csv',
 'generate_csv/valid_02.csv',
 'generate_csv/valid_03.csv',
 'generate_csv/valid_04.csv',
 'generate_csv/valid_05.csv',
 'generate_csv/valid_06.csv',
 'generate_csv/valid_07.csv',
 'generate_csv/valid_08.csv',
 'generate_csv/valid_09.csv']
测试数据集文件名称：
['generate_csv/test_00.csv',
 'generate_csv/test_01.csv',
 'generat

# 如何将存储的csv文件变成dataset

In [11]:
# 1. filename -> dataset
# 2. read file -> dataset -> datatsets -> merge
# 3. parse csv:解析csv文件

filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'generate_csv/train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_00.csv', 

In [13]:
n_readers = 5
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1), # 按行读取然后生成一个dataset;skip(1)是为了不读取第一行的header
    cycle_length = n_readers
)
for line in dataset.take(15): # 取出15条出来看一下
    print(line)

tf.Tensor(b'-0.09719300311107498,-1.249743071766074,0.36232962250170797,0.026906080250728295,1.033811814747154,0.045881586971778555,1.3418334617377423,-1.6353869745909178,1.832', shape=(), dtype=string)
tf.Tensor(b'0.42408210084996534,0.9129633171802288,-0.04437481876046234,-0.15297213746739335,-0.24727627804141977,-0.10539166599677323,0.8612674255663844,-1.3357789003702432,3.955', shape=(), dtype=string)
tf.Tensor(b'-0.32652634129448693,0.43236189741438374,-0.09345459539684739,-0.08402991822890092,0.8460035745154013,-0.0266316482653991,-0.5617679242614233,0.1422875991184281,2.431', shape=(), dtype=string)
tf.Tensor(b'0.04971034572063198,-0.8492418886278699,-0.06214699417830008,0.17878747064657746,-0.8025354230744277,0.0005066066922077538,0.6466457006743215,-1.1060793768010604,2.286', shape=(), dtype=string)
tf.Tensor(b'-1.0591781535672364,1.393564736946074,-0.026331968874673636,-0.11006759528831847,-0.6138198966579805,-0.09695934953589447,0.3247131133362288,-0.037477245413977976,0.672

In [16]:
# 因为csv文件都是字符串，我们需要把字符转中对应的数据转换成我们想要的数据类型，
# 所以我们需要使用下面的方法来进行解析csv文件
# tf.io.decode_csv(str,record_defaults) # record_defaults是一个列表，用来指定每一个','分割的数据的数据类型和默认值
sample_str = '1,2,3,4,5'
# 通过record_defaults 指定每个要解析的数据的数据类型和默认值
# record_defaults = [tf.constant(0, dtype=tf.int32)] * 5 # 这种是默认都是整型
record_defaults = [
    tf.constant(0,dtype=tf.int32),
    0, # int32，默认为0
    np.nan, # float32，
    'hello', # string,默认'hello'
    tf.constant([]) # float32
]
parsed_filds = tf.io.decode_csv(sample_str,record_defaults)
print(parsed_filds)

sample_str2 = '1 2 3 4 5'
parsed_filds2 = tf.io.decode_csv(sample_str2,record_defaults,field_delim=' ',)
print(parsed_filds2)

[<tf.Tensor: id=160, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=161, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=162, shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: id=163, shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: id=164, shape=(), dtype=float32, numpy=5.0>]
[<tf.Tensor: id=169, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=170, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=171, shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: id=172, shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: id=173, shape=(), dtype=float32, numpy=5.0>]


In [18]:
# 如果给的值不对，会报错
try:
    parsed_fields = tf.io.decode_csv(',,,,',record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Field 4 is required but missing in record 0! [Op:DecodeCSV]


In [19]:
# 若干给的值数目不对的话，也会报错
try:
    parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7',record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]


# 现在将我们存储的csv文件读取出来，然后进行解析成dataset

In [25]:
def parse_csv_line(line_str,n_fields = 9 ):
    record_defaults = [tf.constant(np.nan)] * n_fields # float32
    parsed_fields = tf.io.decode_csv(line_str,record_defaults= record_defaults)
    x = tf.stack(parsed_fields[0:-1]) # 前八个数是x
    y = tf.stack(parsed_fields[-1:]) # 最后一个值时y, 注意这里一定要写成[-1:],不然返回的tensor是一个标量
    return x,y

parse_csv_line(b'-0.09719300311107498,-1.249743071766074,0.36232962250170797,0.026906080250728295,1.033811814747154,0.045881586971778555,1.3418334617377423,-1.6353869745909178,1.832',
               9)

(<tf.Tensor: id=223, shape=(8,), dtype=float32, numpy=
 array([-0.097193  , -1.2497431 ,  0.36232963,  0.02690608,  1.0338118 ,
         0.04588159,  1.3418335 , -1.635387  ], dtype=float32)>,
 <tf.Tensor: id=222, shape=(), dtype=float32, numpy=1.832>)

In [30]:
# 1. filename -> dataset
# 2. read file -> dataset -> datatsets -> merge
# 3. parse csv:解析csv文件
def csv_reader_dataset(filenames,n_readers=5,
                       batch_size = 32, n_parse_threads = 5,
                       shuffle_buffer_size = 10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat() # 不传参数就是重复无限次
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    dataset.shuffle(shuffle_buffer_size) # 混排，
    # dataset.map 就是把dataset中的每一个元素进行一个操作返回一个新的值:interleave是一对多,map是一对一
    dataset = dataset.map(parse_csv_line, # map只会给parse_csv_line传一个参数
                          num_parallel_calls = n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

train_set = csv_reader_dataset(train_filenames,batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print('x:')
    pprint.pprint(x_batch)
    print('y:')
    pprint.pprint(y_batch)

x:
<tf.Tensor: id=560, shape=(3, 8), dtype=float32, numpy=
array([[-8.2195884e-01,  1.8741661e+00,  1.8212350e-01, -3.1700194e-02,
        -6.0111791e-01, -1.4337493e-01,  1.0852206e+00, -8.6139947e-01],
       [-3.2652634e-01,  4.3236190e-01, -9.3454592e-02, -8.4029920e-02,
         8.4600359e-01, -2.6631648e-02, -5.6176794e-01,  1.4228760e-01],
       [ 4.9710345e-02, -8.4924191e-01, -6.2146995e-02,  1.7878747e-01,
        -8.0253541e-01,  5.0660671e-04,  6.4664572e-01, -1.1060793e+00]],
      dtype=float32)>
y:
<tf.Tensor: id=561, shape=(3,), dtype=float32, numpy=array([1.054, 2.431, 2.286], dtype=float32)>
x:
<tf.Tensor: id=562, shape=(3, 8), dtype=float32, numpy=
array([[ 0.8015443 ,  0.27216142, -0.11624393, -0.20231152, -0.5430516 ,
        -0.02103962, -0.5897621 , -0.08241846],
       [ 0.15782312,  0.4323619 ,  0.3379948 , -0.01588031, -0.37338907,
        -0.05305246,  0.80061346, -1.2359096 ],
       [-0.46794146, -0.92934215,  0.11909926, -0.06047011,  0.30344644,
        

In [33]:
batch_size = 32
train_set = csv_reader_dataset(train_filenames,batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,batch_size = batch_size)

In [38]:
train_set.take(1)

<DatasetV1Adapter shapes: ((None, 8), (None,)), types: (tf.float32, tf.float32)>

In [41]:
model = keras.models.Sequential([
    keras.layers.Dense(30,activation = 'relu',
                       input_shape = [8]),
    keras.layers.Dense(1),
])
model.summary()
model.compile(loss = 'mean_squared_error',optimizer = "sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience = 5, min_delta=1e-2)]

steps_per_epoch = 11160 // batch_size # 遍历一个epoch需要多少step，总的训练数据集的大小 // batch_size
history = model.fit(train_set,
                    validation_data = valid_set,
                    steps_per_epoch = steps_per_epoch,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 30)                270       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 31        
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________
Train for 348 steps, validate for 120 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


In [43]:
model.evaluate(test_set,steps = 5160 // batch_size)



0.4315478506480685