In [1]:
import tensorflow as tf
import numpy as np

## tf.train.Feature

In [3]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte.

    Args:
        value: A bytes object, or an eager tensor (same type as
            `tf.constant(0)`) whose `.numpy()` yields the bytes to store.

    Returns:
        A `tf.train.Feature` whose `bytes_list` holds the single value.
    """
    eager_tensor_type = type(tf.constant(0))
    # Eager tensors are converted to their underlying Python/NumPy value
    # before being placed in the BytesList; anything else is stored as-is.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _float_feature(value):
  """Wrap a single float / double in a `tf.train.Feature` backed by a FloatList.

  Args:
      value: The scalar to store.

  Returns:
      A `tf.train.Feature` whose `float_list` holds the single value.
  """
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a `tf.train.Feature` backed by an Int64List.

  Args:
      value: The scalar to store.

  Returns:
      A `tf.train.Feature` whose `int64_list` holds the single value.
  """
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

In [4]:
# Wrap a bytes literal in a Feature; the cell output shows the resulting proto.
_bytes_feature(b'test string')

bytes_list {
  value: "test string"
}

In [5]:
# Wrap e (~2.71828) in a float Feature; the displayed value carries only
# single-precision accuracy (see the cell output).
_float_feature(np.exp(1))

float_list {
  value: 2.7182817459106445
}

In [6]:
# Build the same float Feature and show its binary protobuf serialization.
fea = _float_feature(np.exp(1))
fea.SerializeToString()

b'\x12\x06\n\x04T\xf8-@'

## tf.train.Example
- SerializeToString
- FromString

In [10]:
# Seed the legacy global NumPy RNG so the synthetic data below (and every
# output derived from it) is identical after Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

# Number of synthetic observations.
num = int(1e4)
# Boolean feature: False or True, drawn uniformly.
feature0 = np.random.choice([False, True], num)
# Integer feature drawn from {0, 1, 2, 3, 4}.
feature1 = np.random.randint(0, 5, num)
# Bytes feature: the animal name selected by the integer feature.
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]
# Float feature: samples from a standard normal distribution.
feature3 = np.random.randn(num)

In [11]:
def serialize_example(feature0, feature1, feature2, feature3):
    """Build a `tf.train.Example` from one observation and serialize it.

    Args:
        feature0: Value stored under 'feature0' as an int64 feature.
        feature1: Value stored under 'feature1' as an int64 feature.
        feature2: Value stored under 'feature2' as a bytes feature.
        feature3: Value stored under 'feature3' as a float feature.

    Returns:
        The serialized `tf.train.Example`, as returned by `SerializeToString()`.
    """
    # Map each feature name to its type-appropriate tf.train.Feature wrapper.
    named_features = {
        'feature0': _int64_feature(feature0),
        'feature1': _int64_feature(feature1),
        'feature2': _bytes_feature(feature2),
        'feature3': _float_feature(feature3),
    }
    features_message = tf.train.Features(feature=named_features)
    return tf.train.Example(features=features_message).SerializeToString()

In [12]:
# Serialize one hand-written observation; the cell output is the raw protobuf bytes.
example = serialize_example(False, 4, b'goat', 0.9876)
example

b'\nR\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04[\xd3|?\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04'

In [14]:
# Decode the serialized bytes back into a tf.train.Example to inspect its features.
example_proto = tf.train.Example.FromString(example)
example_proto

features {
  feature {
    key: "feature0"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "feature1"
    value {
      int64_list {
        value: 4
      }
    }
  }
  feature {
    key: "feature2"
    value {
      bytes_list {
        value: "goat"
      }
    }
  }
  feature {
    key: "feature3"
    value {
      float_list {
        value: 0.9876000285148621
      }
    }
  }
}

## tf.data.Dataset
- from_tensor_slices
- from_generator

In [16]:
# Slice the four parallel arrays together so each dataset element is one
# (feature0, feature1, feature2, feature3) tuple of scalar tensors.
features_dataset = tf.data.Dataset.from_tensor_slices((feature0,feature1,feature2,feature3))
# Peek at the first element to check the dtype of each component.
for f0,f1,f2,f3 in features_dataset.take(1):
    print(f0, f1, f2, f3, sep='\n')

tf.Tensor(True, shape=(), dtype=bool)
tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(b'dog', shape=(), dtype=string)
tf.Tensor(0.6490147819064547, shape=(), dtype=float64)


In [17]:
def generator_func():
    """Yield one serialized `tf.train.Example` per element of `features_dataset`.

    Each dataset element is unpacked into the positional arguments of
    `serialize_example`.
    """
    yield from (serialize_example(*row) for row in features_dataset)

# Wrap the generator as a dataset whose elements are declared as scalar
# (shape ()) tf.string tensors, one serialized Example each.
# NOTE(review): newer TensorFlow releases document `output_signature` as the
# replacement for `output_types`/`output_shapes` — confirm against the
# installed version before switching.
serialized_features_dataset = tf.data.Dataset.from_generator(generator_func, output_types=tf.string, output_shapes=())

## TFRecord
- tf.data.experimental.TFRecordWriter
- tf.data.TFRecordDataset

In [18]:
# Path of the TFRecord file, relative to the working directory.
filename='test.tfrecord'
# Write every serialized Example from the dataset into that file.
# NOTE(review): later TensorFlow releases mark `tf.data.experimental.TFRecordWriter`
# as deprecated in favour of `tf.io.TFRecordWriter` — confirm against the
# installed version.
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(serialized_features_dataset)

In [19]:
# Read the file back as a dataset of scalar tf.string records (see the repr in the output).
raw_dataset = tf.data.TFRecordDataset([filename])
raw_dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

In [22]:
# Show the first two records; at this point they are still serialized byte strings.
for raw_record in raw_dataset.take(2):
    print(raw_record)

tf.Tensor(b'\nQ\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x01\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x01\n\x13\n\x08feature2\x12\x07\n\x05\n\x03dog\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\xd5%&?', shape=(), dtype=string)
tf.Tensor(b'\nQ\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x00\n\x13\n\x08feature2\x12\x07\n\x05\n\x03cat\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\x03\x99\xfe\xbf', shape=(), dtype=string)


In [23]:
# Schema used to decode each serialized Example: every feature is declared as
# a scalar (shape []) with an explicit dtype and default value.
# 'feature0' was written through _int64_feature, so it is read back as int64.
feature_description = {
    'feature0': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'feature1': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'feature2': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'feature3': tf.io.FixedLenFeature([], tf.float32, default_value=0.0),
}

def parse_function(example_proto):
    """Parse one serialized `tf.train.Example` using `feature_description`.

    Args:
        example_proto: A scalar string tensor holding a serialized Example.

    Returns:
        The result of `tf.io.parse_single_example`: a dict keyed by the
        feature names declared in `feature_description`.
    """
    parsed = tf.io.parse_single_example(
        serialized=example_proto,
        features=feature_description,
    )
    return parsed

In [24]:
# Map the parser over the raw records so each element becomes a dict of tensors.
parsed_dataset = raw_dataset.map(parse_function)

# Inspect the first two parsed records.
for parsed_record in parsed_dataset.take(2):
    print(parsed_record)

{'feature1': <tf.Tensor: id=50123, shape=(), dtype=int64, numpy=1>, 'feature3': <tf.Tensor: id=50125, shape=(), dtype=float32, numpy=0.6490148>, 'feature2': <tf.Tensor: id=50124, shape=(), dtype=string, numpy=b'dog'>, 'feature0': <tf.Tensor: id=50122, shape=(), dtype=int64, numpy=1>}
{'feature1': <tf.Tensor: id=50127, shape=(), dtype=int64, numpy=0>, 'feature3': <tf.Tensor: id=50129, shape=(), dtype=float32, numpy=-1.9890445>, 'feature2': <tf.Tensor: id=50128, shape=(), dtype=string, numpy=b'cat'>, 'feature0': <tf.Tensor: id=50126, shape=(), dtype=int64, numpy=0>}
