In [1]:
# Notebook display setting: with ast_node_interactivity = "all", IPython shows
# the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [2]:
import numpy as np
import os
import gzip

import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [3]:
# TFRecord file layout:
# -> tf.train.Example
#    -> tf.train.Features  -> {"key": tf.train.Feature}
#       -> tf.train.Feature   -> tf.train.BytesList (strings) / FloatList / Int64List

# BytesList holds raw bytes, so each title is UTF-8 encoded before being stored.
book_titles = ['machine learing', 'cc150']
favorite_books_names = [title.encode('utf-8') for title in book_titles]

# One typed list per feature: bytes, floats and 64-bit integers.
favorite_books_bytelist = tf.train.BytesList(value=favorite_books_names)
hours_floatlist = tf.train.FloatList(value=[15.5, 9.5, 7.0, 8.0])
age_int64list = tf.train.Int64List(value=[42])

# Show the three typed lists in the order they were built.
for typed_list in (favorite_books_bytelist, hours_floatlist, age_int64list):
    print(typed_list)

# Wrap each typed list in a Feature and collect them under their keys.
feature_by_key = {
    "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list),
}
features = tf.train.Features(feature=feature_by_key)
print(features)

value: "machine learing"
value: "cc150"

value: 15.5
value: 9.5
value: 7.0
value: 8.0

value: 42

feature {
  key: "age"
  value {
    int64_list {
      value: 42
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learing"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 7.0
      value: 8.0
    }
  }
}



In [5]:
# Wrap the Features message built above in a tf.train.Example and show it.
example = tf.train.Example(features=features)
print(example)

# Serialize the Example to a byte string (this is what gets written to the TFRecord file below).
serialized_example = example.SerializeToString()
print(serialized_example)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 42
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learing"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}

b'\n[\n,\n\x0efavorite_books\x12\x1a\n\x18\n\x0fmachine learing\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*'


In [7]:
# Write the serialized Example to disk to produce an actual TFRecord file.
output_dir = 'tfrecord_basic'
# makedirs(exist_ok=True) is idempotent on re-runs and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(output_dir, exist_ok=True)
filename = 'test.tfrecords'
filename_fullpath = os.path.join(output_dir, filename)
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    # Store the same record three times so the reading cells have several rows.
    for i in range(3):
        writer.write(serialized_example)

In [8]:
# Read the file back; each dataset element is printed as-is, i.e. still in
# its serialized (string tensor) form.
dataset = tf.data.TFRecordDataset(filenames=[filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)

tf.Tensor(b'\n[\n,\n\x0efavorite_books\x12\x1a\n\x18\n\x0fmachine learing\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n[\n,\n\x0efavorite_books\x12\x1a\n\x18\n\x0fmachine learing\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n[\n,\n\x0efavorite_books\x12\x1a\n\x18\n\x0fmachine learing\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)


In [11]:
# Parsing schema: maps every feature key to the type expected when decoding.
expected_features = dict(
    favorite_books=tf.io.VarLenFeature(dtype=tf.string),
    hours=tf.io.VarLenFeature(dtype=tf.float32),
    # A fixed-length feature must declare its shape; [] is used here.
    age=tf.io.FixedLenFeature(shape=[], dtype=tf.int64),
)

# Decode every serialized record against the schema and show the result.
dataset = tf.data.TFRecordDataset(filenames=[filename_fullpath])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(
        serialized=serialized_example_tensor,
        features=expected_features)
    print(example)

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000002155F19B198>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000002155EC70EB8>, 'age': <tf.Tensor: id=46, shape=(), dtype=int64, numpy=42>}
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000002155EC70898>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000021561ADFFD0>, 'age': <tf.Tensor: id=55, shape=(), dtype=int64, numpy=42>}
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000002155F19B2B0>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x000002155EC70F60>, 'age': <tf.Tensor: id=64, shape=(), dtype=int64, numpy=42>}


In [12]:
# Parse each record and print the titles stored under "favorite_books".
dataset = tf.data.TFRecordDataset(filenames=[filename_fullpath])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(
        serialized=serialized_example_tensor,
        features=expected_features)
    # Pull out a single feature and convert it from sparse to dense.
    # The default fill value 0 cannot be used for a string tensor, so an
    # empty byte string is supplied instead.
    sparse_books = example["favorite_books"]
    books = tf.sparse.to_dense(sparse_books, default_value=b"")
    for book in books:
        # Each element holds raw bytes; decode before printing.
        title = book.numpy().decode('utf-8')
        print(title)

machine learing
cc150
machine learing
cc150
machine learing
cc150


In [13]:
# Write the same records again, this time as a compressed file.
# NOTE(review): the writer uses GZIP compression while the suffix is '.zip';
# the path is left unchanged here — consider '.gz' if nothing else depends on it.
filename_fullpath_zip = filename_fullpath + '.zip'
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter(filename_fullpath_zip, options=options) as writer:
    # Same three copies of the record as in the uncompressed file.
    for i in range(3):
        writer.write(serialized_example)

In [15]:
# Read the compressed file back, passing the same compression type that was
# used by the writer, then print the decoded book titles of every record.
dataset_zip = tf.data.TFRecordDataset(
    filenames=[filename_fullpath_zip], compression_type='GZIP')
for serialized_example_tensor in dataset_zip:
    example_zip = tf.io.parse_single_example(
        serialized=serialized_example_tensor,
        features=expected_features)
    sparse_books = example_zip["favorite_books"]
    books = tf.sparse.to_dense(sparse_books, default_value=b"")
    for book in books:
        title = book.numpy().decode('utf-8')
        print(title)
    

machine learing
cc150
machine learing
cc150
machine learing
cc150
