# Datasets 使用之 CSV

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import sys
import matplotlib.pyplot as plt

### 1. 准备数据

In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset. `housing.data` (features) and
# `housing.target` (target values) are what the split cell below consumes.
housing = fetch_california_housing()

In [3]:
from sklearn.model_selection import train_test_split

# Two-stage split: first carve the test set off the full dataset, then divide
# the remainder into training and validation sets. Both calls pin
# random_state so the partition is the same on every run.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11
)

# Print the (features, targets) shapes of the train, valid and test splits,
# in that order.
for split_x, split_y in (
    (x_train, y_train),
    (x_valid, y_valid),
    (x_test, y_test),
):
    print(split_x.shape, split_y.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then reuse that same fitted
# scaler to transform the validation and test features.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

**把数据保存为CSV格式**

In [10]:
# Directory (relative to the current working directory) that receives the
# generated CSV files.
output_dir = "../generate_csv"
# makedirs(exist_ok=True) replaces the exists()-then-mkdir() pair: it has no
# check-then-create race, succeeds when the directory is already there, and
# also creates missing parent directories.
os.makedirs(output_dir, exist_ok=True)

def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Files are named ``{name_prefix}_{index:02d}.csv`` inside ``output_dir``.
    Row indices are partitioned with ``np.array_split``, so every part is a
    contiguous run of rows.

    Args:
        output_dir: Directory the files are written into. It is not created
            here, so it must already exist.
        data: Indexable sequence of rows (e.g. a 2-D array); each row is an
            iterable of column values.
        name_prefix: Prefix used for every generated file name.
        header: Optional header line (without a trailing newline) written as
            the first line of every file.
        n_parts: Number of files to produce.

    Returns:
        List of the written file paths, in part order.
    """
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")

    def format_value(value):
        # NumPy >= 2 renders scalars under repr() as e.g. "np.float64(1.5)",
        # which would corrupt the CSV. str() yields the bare literal ("1.5").
        # Non-NumPy values keep the original repr() formatting.
        if isinstance(value, np.generic):
            return str(value)
        return repr(value)

    filenames = []
    for file_idx, row_indices in enumerate(
            np.array_split(np.arange(len(data)), n_parts)):
        path_csv = path_format.format(name_prefix, file_idx)
        filenames.append(path_csv)
        # Write this part's rows to its CSV file.
        with open(path_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            f.writelines(
                ",".join(format_value(col) for col in data[row_idx]) + "\n"
                for row_idx in row_indices
            )
    return filenames

# Merge features and target: append each split's target vector as a final
# column after its scaled feature columns, so one CSV row holds the features
# followed by the label.
train_data = np.concatenate([x_train_scaled, y_train[:, np.newaxis]], axis=1)
valid_data = np.concatenate([x_valid_scaled, y_valid[:, np.newaxis]], axis=1)
test_data = np.concatenate([x_test_scaled, y_test[:, np.newaxis]], axis=1)

# Header line for the CSV files: the dataset's feature names plus the target
# column name.
# NOTE(review): "MidianHouseValue" looks like a misspelling of "Median..."; it
# is kept unchanged because it is written verbatim into the generated files.
header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)

**写入数据到文件**

In [11]:
# Write every split to disk as a set of CSV files that all start with the same
# header line: 20 files for the training data, 10 each for validation and test.
train_filenames = save_to_csv(
    output_dir, train_data, "train", header=header_str, n_parts=20)
valid_filenames = save_to_csv(
    output_dir, valid_data, "valid", header=header_str, n_parts=10)
test_filenames = save_to_csv(
    output_dir, test_data, "test", header=header_str, n_parts=10)

In [13]:
import pprint

# Pretty-print the generated file paths of each split under its own heading.
for heading, split_filenames in (
    ("train filenames:", train_filenames),
    ("valid filenames:", valid_filenames),
    ("test filenames:", test_filenames),
):
    print(heading)
    pprint.pprint(split_filenames)

train filenames:
['../generate_csv/train_00.csv',
 '../generate_csv/train_01.csv',
 '../generate_csv/train_02.csv',
 '../generate_csv/train_03.csv',
 '../generate_csv/train_04.csv',
 '../generate_csv/train_05.csv',
 '../generate_csv/train_06.csv',
 '../generate_csv/train_07.csv',
 '../generate_csv/train_08.csv',
 '../generate_csv/train_09.csv',
 '../generate_csv/train_10.csv',
 '../generate_csv/train_11.csv',
 '../generate_csv/train_12.csv',
 '../generate_csv/train_13.csv',
 '../generate_csv/train_14.csv',
 '../generate_csv/train_15.csv',
 '../generate_csv/train_16.csv',
 '../generate_csv/train_17.csv',
 '../generate_csv/train_18.csv',
 '../generate_csv/train_19.csv']
valid filenames:
['../generate_csv/valid_00.csv',
 '../generate_csv/valid_01.csv',
 '../generate_csv/valid_02.csv',
 '../generate_csv/valid_03.csv',
 '../generate_csv/valid_04.csv',
 '../generate_csv/valid_05.csv',
 '../generate_csv/valid_06.csv',
 '../generate_csv/valid_07.csv',
 '../generate_csv/valid_08.csv',
 '../gene

**读取CSV文件数据**

In [14]:
# 1. Turn the list of file names into a dataset object: filename -> dataset
filename_dataset = tf.data.Dataset.list_files(train_filenames)
# Each element is a scalar string tensor holding one file path (see the
# printed output).
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'../generate_csv/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_17.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_10.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_09.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_08.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_01.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_16.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_11.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_15.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_00.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_19.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_12.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_05.csv', shape=(), dtype=string)
tf.Tensor(b'../generate_csv/train_13.csv', shape=(), dtype=str

In [16]:
# 2. Read the data behind each file name: read file -> dataset -> datasets -> merge
# Number of files read from concurrently (passed as cycle_length below).
n_readers = 5
dataset = filename_dataset.interleave(
    # Read each text file line by line with TextLineDataset and skip the
    # first line (the header).
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length=n_readers
)
# Preview the first 10 raw CSV lines of the merged dataset as byte strings.
for line in dataset.take(10):
    print(line.numpy())

b'-0.32652634129448693,0.43236189741438374,-0.09345459539684739,-0.08402991822890092,0.8460035745154013,-0.0266316482653991,-0.5617679242614233,0.1422875991184281,2.431'
b'0.42408210084996534,0.9129633171802288,-0.04437481876046234,-0.15297213746739335,-0.24727627804141977,-0.10539166599677323,0.8612674255663844,-1.3357789003702432,3.955'
b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147'
b'-1.0775077698160966,-0.44874070548966555,-0.5680568205591913,-0.14269262164909954,-0.09666677138213985,0.12326468238687088,-0.3144863716683942,-0.4818958888413162,0.978'
b'-1.0591781535672364,1.393564736946074,-0.026331968874673636,-0.11006759528831847,-0.6138198966579805,-0.09695934953589447,0.3247131133362288,-0.037477245413977976,0.672'
b'2.2754266257529974,-1.249743071766074,1.0294788075585177,-0.17124431895714504,-0.45413752815175606,0.10527151658164971,-0.9023632702857819,0.

In [20]:
# 3. Parse CSV text: parse csv
# tf.io.decode_csv(str, record_defaults)
sample_str = '1,2,3,4,5'
# One entry per CSV field. Each entry supplies the field's default value and
# also determines the dtype the field is parsed into (see the printed result).
record_defaults = [
    tf.constant(0, dtype=tf.int32),  # explicit int32 default
    0,  # Python int -> parsed as int32
    np.nan,  # float default -> parsed as float32
    "hello",  # string default -> parsed as string
    tf.constant([])  # empty float32 tensor: no default value, parsed as float32
]
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)

[<tf.Tensor: id=170, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=171, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=172, shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: id=173, shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: id=174, shape=(), dtype=float32, numpy=5.0>]




In [21]:
# Every field is empty here. The last field (index 4) has an empty default
# tensor, so decoding raises instead of substituting a default; the error is
# caught and printed as the demonstration.
try:
    parsed_fields = tf.io.decode_csv(',,,,', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Field 4 is required but missing in record 0! [Op:DecodeCSV]


In [22]:
# The record has 7 fields but record_defaults describes only 5, so decoding
# raises; the error is caught and printed as the demonstration.
try:
    parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]


In [30]:
# Parse the data one line at a time.
def parse_csv_line(line, n_fields=9):
    """Decode one CSV line into a ``(features, label)`` pair of tensors.

    Every field is parsed as a float with NaN as its default value. The last
    field becomes the label and all preceding fields become the features.

    Args:
        line: A single CSV record (string / bytes / string tensor).
        n_fields: Total number of comma-separated fields in the record.

    Returns:
        Tuple ``(x, y)``: ``x`` stacks the first ``n_fields - 1`` fields and
        ``y`` stacks the last field on its own.
    """
    float_defaults = [tf.constant(np.nan) for _ in range(n_fields)]
    columns = tf.io.decode_csv(line, record_defaults=float_defaults)
    *feature_columns, label_column = columns
    features = tf.stack(feature_columns)
    label = tf.stack([label_column])
    return features, label

# Try the parser on one raw record of 9 fields; the cell output shows the
# resulting (features, label) tensors.
parse_csv_line(b'-0.32652634129448693,0.43236189741438374,-0.09345459539684739,-0.08402991822890092,0.8460035745154013,-0.0266316482653991,-0.5617679242614233,0.1422875991184281,2.431',
              n_fields=9)

(<tf.Tensor: id=198, shape=(8,), dtype=float32, numpy=
 array([-0.32652634,  0.4323619 , -0.09345459, -0.08402992,  0.8460036 ,
        -0.02663165, -0.56176794,  0.1422876 ], dtype=float32)>,
 <tf.Tensor: id=199, shape=(1,), dtype=float32, numpy=array([2.431], dtype=float32)>)

In [31]:
# Full input pipeline:
# 1. filename -> dataset
# 2. read file -> dataset -> datasets -> merge
# 3. shuffle
# 4. parse csv
# 5. batch
def csv_reader_dataset(filenames, n_readers=5,
                       batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    """Build a batched, shuffled, endlessly repeating dataset from CSV files.

    Args:
        filenames: File paths (or patterns) passed to
            ``tf.data.Dataset.list_files``.
        n_readers: Number of files interleaved concurrently
            (``cycle_length`` of ``interleave``).
        batch_size: Number of parsed records per batch.
        n_parse_threads: ``num_parallel_calls`` used when mapping
            ``parse_csv_line`` over the lines.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(x_batch, y_batch)`` pairs as
        produced by ``parse_csv_line``. The file list is repeated without a
        count, so the dataset never ends on its own; callers must bound it
        (e.g. with ``take``).
    """
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        # Read each file line by line, skipping its header line.
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    # Dataset transformations return a new dataset rather than modifying the
    # receiver, so the result must be re-assigned. The previous bare
    # `dataset.shuffle(...)` call discarded it and shuffling never happened.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

# Build the training pipeline with a small batch size and preview the first
# two batches. take(2) bounds the iteration because the dataset repeats
# indefinitely.
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)

x:
<tf.Tensor: id=283, shape=(3, 8), dtype=float32, numpy=
array([[-0.66722274, -0.04823952,  0.34529406,  0.53826684,  1.8521839 ,
        -0.06112538, -0.8417093 ,  1.5204847 ],
       [ 0.09734604,  0.75276285, -0.20218964, -0.19547   , -0.40605137,
         0.00678553, -0.81371516,  0.6566148 ],
       [ 0.81150836, -0.04823952,  0.5187339 , -0.0293864 , -0.03406402,
        -0.05081595, -0.7157357 ,  0.91627514]], dtype=float32)>
y:
<tf.Tensor: id=284, shape=(3, 1), dtype=float32, numpy=
array([[1.59 ],
       [1.119],
       [2.147]], dtype=float32)>
x:
<tf.Tensor: id=285, shape=(3, 8), dtype=float32, numpy=
array([[ 0.15782312,  0.4323619 ,  0.3379948 , -0.01588031, -0.37338907,
        -0.05305246,  0.80061346, -1.2359096 ],
       [-0.82195884,  1.8741661 ,  0.1821235 , -0.03170019, -0.6011179 ,
        -0.14337493,  1.0852206 , -0.8613995 ],
       [ 1.5180511 , -0.52884096,  0.81024706, -0.1921417 ,  0.44135395,
         0.02733506, -0.81838083,  0.8563535 ]], dtype=float32)