In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the TensorFlow version, the interpreter version, and the version of
# every major library imported above.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print("{} {}".format(module.__name__, module.__version__))

# Last expression of the cell: displays whether TensorFlow can see a GPU.
tf.test.is_gpu_available()

2.0.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.17.4
pandas 0.25.3
sklearn 0.22
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


False

# 构建tfrecord数据

In [9]:
# TFRecord file layout:
#   tf.train.Example
#     -> tf.train.Features: {"key": tf.train.Feature}
#       -> tf.train.Feature: tf.train.BytesList / FloatList / Int64List

# BytesList: the titles are UTF-8 encoded into bytes before being wrapped.
favorite_books = [
    title.encode('utf-8') for title in ('machine learning', 'cc150')
]
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)

# FloatList
hours_floatlist = tf.train.FloatList(value=[12.05, 23.08, 9.07, 6.34])
print(hours_floatlist)

# Int64List
age_int64list = tf.train.Int64List(value=[12, 35, 8, 64, 28])
print(age_int64list)

# Wrap each list in a tf.train.Feature and collect them, keyed by name,
# into a single tf.train.Features message.
features = tf.train.Features(feature={
    "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list),
})
print(features)

value: "machine learning"
value: "cc150"

value: 12.050000190734863
value: 23.079999923706055
value: 9.069999694824219
value: 6.340000152587891

value: 12
value: 35
value: 8
value: 64
value: 28

feature {
  key: "age"
  value {
    int64_list {
      value: 12
      value: 35
      value: 8
      value: 64
      value: 28
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 12.050000190734863
      value: 23.079999923706055
      value: 9.069999694824219
      value: 6.340000152587891
    }
  }
}



In [10]:
# Wrap the features in a tf.train.Example message and show its text form.
example = tf.train.Example(features=features)
print(example)

# Serialize the message into an in-memory binary (bytes) string.
serialized_example = example.SerializeToString()
print(serialized_example)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 12
        value: 35
        value: 8
        value: 64
        value: 28
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 12.050000190734863
        value: 23.079999923706055
        value: 9.069999694824219
        value: 6.340000152587891
      }
    }
  }
}

b'\n`\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x10\n\x03age\x12\t\x1a\x07\n\x05\x0c#\x08@\x1c\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\xcd\xcc@A\xd7\xa3\xb8A\xb8\x1e\x11AH\xe1\xca@'


# 存储tfrecord文件数据

In [11]:
# Create the output directory if it is missing. os.makedirs(..., exist_ok=True)
# replaces the os.path.exists + os.mkdir pair: it has no check-then-create
# race and keeps the cell safe to re-run.
output_dir = 'tfrecord_basic'
os.makedirs(output_dir, exist_ok=True)

filename = "test.tfrecords"
filename_fullpath = os.path.join(output_dir, filename)

# Write the same serialized Example three times, so the file holds 3 records.
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)

# 读取tfrecord文件数据

In [12]:
# Read the TFRecord file back and print every element as-is; each element is
# a scalar string tensor that still holds the serialized bytes.
dataset = tf.data.TFRecordDataset(filenames=[filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)

tf.Tensor(b'\n`\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x10\n\x03age\x12\t\x1a\x07\n\x05\x0c#\x08@\x1c\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\xcd\xcc@A\xd7\xa3\xb8A\xb8\x1e\x11AH\xe1\xca@', shape=(), dtype=string)
tf.Tensor(b'\n`\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x10\n\x03age\x12\t\x1a\x07\n\x05\x0c#\x08@\x1c\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\xcd\xcc@A\xd7\xa3\xb8A\xb8\x1e\x11AH\xe1\xca@', shape=(), dtype=string)
tf.Tensor(b'\n`\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x10\n\x03age\x12\t\x1a\x07\n\x05\x0c#\x08@\x1c\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\xcd\xcc@A\xd7\xa3\xb8A\xb8\x1e\x11AH\xe1\xca@', shape=(), dtype=string)


# 解析二进制的tfrecord文件数据

In [19]:
# Parsing needs a schema, similar to declaring per-column default types when
# decoding a CSV file: each key maps to the kind and dtype of its feature.
expected_features = dict(
    favorite_books=tf.io.VarLenFeature(dtype=tf.string),  # variable-length strings
    hours=tf.io.VarLenFeature(dtype=tf.float32),          # variable-length float32
    age=tf.io.FixedLenFeature([5], dtype=tf.int64),       # fixed length of 5 int64
)
dataset = tf.data.TFRecordDataset(filenames=[filename_fullpath])
for serialized_example_tensor in dataset:
    # Decode one serialized record into a dict of tensors using the schema.
    example = tf.io.parse_single_example(
        serialized_example_tensor, expected_features)
    print(example)
    # The VarLenFeature entries come back as SparseTensor objects; convert the
    # book titles to a dense tensor so they can be iterated over.
    books = tf.sparse.to_dense(example["favorite_books"], default_value=b"")
    for book in books:
        # Show the tensor, then its raw bytes, then the decoded text.
        print(book)
        print(book.numpy())
        print(book.numpy().decode("UTF-8"))
    print()

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f9245fa7a20>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f91fdfc30f0>, 'age': <tf.Tensor: id=289, shape=(5,), dtype=int64, numpy=array([12, 35,  8, 64, 28])>}
tf.Tensor(b'machine learning', shape=(), dtype=string)
b'machine learning'
machine learning
tf.Tensor(b'cc150', shape=(), dtype=string)
b'cc150'
cc150

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f92195b66a0>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f92195b68d0>, 'age': <tf.Tensor: id=308, shape=(5,), dtype=int64, numpy=array([12, 35,  8, 64, 28])>}
tf.Tensor(b'machine learning', shape=(), dtype=string)
b'machine learning'
machine learning
tf.Tensor(b'cc150', shape=(), dtype=string)
b'cc150'
cc150

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x7f9245fa7748>, 'hours': <tensorflow.python.fr

# 将tfrecord文件存储为压缩文件

In [20]:
# Write the same three records again, this time with GZIP compression.
# NOTE: the path gets a '.zip' suffix, but the contents are GZIP-compressed.
filename_fullpath_zip = filename_fullpath + '.zip'
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter(filename_fullpath_zip, options=options) as writer:
    for i in range(3):
        writer.write(serialized_example)

# 读取压缩后的tfrecord文件

In [22]:
# Reading the compressed file only adds compression_type to the dataset
# constructor; parsing works the same as for the uncompressed file.
dataset_zip = tf.data.TFRecordDataset(
    [filename_fullpath_zip], compression_type="GZIP")
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(
        serialized_example_tensor, expected_features)
    books = tf.sparse.to_dense(example['favorite_books'], default_value=b"")
    # Print each book title as decoded text.
    for book in books:
        print(book.numpy().decode("UTF-8"))

machine learning
cc150
machine learning
cc150
machine learning
cc150
