In [None]:
import tensorflow as tf

# Start

In [2]:
# Parsing function.
def parse_example(example_proto, feature_description):
    """Parse one serialized example into a dictionary of dense tensors.

    Args:
        example_proto: Serialized example, handed unchanged to
            ``tf.io.parse_single_example``.
        feature_description: Mapping from feature name to the parsing spec
            (e.g. ``tf.io.FixedLenFeature`` / ``tf.io.VarLenFeature``) that
            ``tf.io.parse_single_example`` should apply.

    Returns:
        dict: One entry per parsed feature. Values parsed as
        ``tf.SparseTensor`` are converted with ``tf.sparse.to_dense``; all
        other values are returned as parsed.
    """
    parsed_features = tf.io.parse_single_example(example_proto, feature_description)

    # Sparse results carry indices, values and dense_shape; downstream code
    # wants the plain values, so densify them.
    dense_features = {}
    for key, value in parsed_features.items():
        if isinstance(value, tf.SparseTensor):
            # No explicit default_value here: a hard-coded 0.0 only matches
            # float tensors and is rejected for string or int64 features.
            # Left unset, TensorFlow fills with the zero value of the
            # tensor's own dtype (0.0 for float32, as before).
            value = tf.sparse.to_dense(value)
        dense_features[key] = value

    return dense_features

In [3]:
# Path to the example TFRecord file used throughout this notebook.
#   The path is relative, so it depends on the directory the notebook runs from.
tfrecord_file = '../data/TFRecord_data/archean_data2_0-833.parquet_388.tfrecord'

## Reading TFRecords with known feature_description

In [4]:
# Feature description used to parse each serialized example.
#   To leave a feature out, comment out its name in the lists below.
known_feature_description = {
    # Variable-length float features.
    **{
        name: tf.io.VarLenFeature(tf.float32)
        for name in [
            'NOISY_ALBEDO_B-NIR',
            'NOISY_ALBEDO_B-UV',
            'NOISY_ALBEDO_B-Vis',
            'NOISY_ALBEDO_SS-NIR',
            'NOISY_ALBEDO_SS-UV',
            'NOISY_ALBEDO_SS-Vis',
        ]
    },
    # Scalar float features.
    **{
        name: tf.io.FixedLenFeature([], tf.float32)
        for name in [
            'OBJECT-DIAMETER',
            'OBJECT-GRAVITY',
            'ATMOSPHERE-TEMPERATURE',
            'ATMOSPHERE-PRESSURE',
        ]
    },
    # The only string-valued feature.
    'Earth_type': tf.io.FixedLenFeature([], tf.string),
    # Scalar float features, one per molecule name.
    **{
        name: tf.io.FixedLenFeature([], tf.float32)
        for name in [
            'C2H6',
            'CH4',
            'CO',
            'CO2',
            'H2O',
            'N2',
            'N2O',
            'O2',
            'O3',
        ]
    },
}

In [5]:
# Open the TFRecord file as a dataset of serialized records, then parse every
# record with the hand-written "known_feature_description".
dataset = tf.data.TFRecordDataset(tfrecord_file)
parsed_dataset = dataset.map(lambda x: parse_example(x, known_feature_description))

I0000 00:00:1736515944.864124 3412183 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 29 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:08:00.0, compute capability: 7.5


In [6]:
# Count the samples in the loaded file by walking the parsed dataset once.
#   "record" is deliberately left bound after the loop: it holds the last
#   parsed sample, which the following cell displays.
count = 0
for count, record in enumerate(parsed_dataset, start=1):
    pass

print(count)

388


2025-01-10 10:32:28.409954: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:370] TFRecordDataset `buffer_size` is unspecified, default to 262144
2025-01-10 10:32:28.542350: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [7]:
# Display the last "record" left over from the counting "for" loop above.
#   Each iteration binds "record" to one parsed sample: a dictionary that maps
#   feature names to tensors.
record

{'NOISY_ALBEDO_B-NIR': <tf.Tensor: shape=(49,), dtype=float32, numpy=
 array([-1.5090859e-01, -7.9321370e-02, -9.9460274e-02,  6.8972588e-02,
         8.6344421e-02, -8.0603845e-02,  2.2635971e-01,  1.1780709e-01,
         1.9929273e-01, -3.5475228e-02, -1.5643552e-01,  2.7021240e-02,
        -5.0722493e-04, -4.4033203e-02,  1.3979220e-01,  7.1746871e-02,
        -1.0774265e-01,  1.9115032e-01,  8.2663879e-02, -2.5778314e-01,
         5.5119794e-02,  1.5002474e-02,  1.5162656e-01,  9.4391093e-02,
        -1.8220806e-01, -7.0041400e-01,  5.1917404e-01,  4.4017008e-01,
        -4.5124421e-01,  6.5128571e-01, -5.6355965e-01,  1.1865753e-01,
        -9.3585229e-01,  1.1883131e-01, -1.8446150e-01,  7.5660400e-02,
        -1.1185178e+00,  8.5702521e-01, -1.1968061e+00,  1.8319455e+00,
        -9.3576169e-01, -1.8570943e-01, -1.3535773e+00, -3.2215238e+00,
        -1.4007714e+00, -5.9641528e+00,  3.0085087e+00, -1.3679731e+00,
        -1.9166774e+00], dtype=float32)>,
 'NOISY_ALBEDO_B-UV': <t

## Reading TFRecords with unknown feature_description

In [10]:
# Open the same TFRecord file as a dataset of raw records; nothing is parsed
# yet because no feature description is assumed in this section.
raw_dataset = tf.data.TFRecordDataset(tfrecord_file)

In [11]:
# Decode the first raw record into a tf.train.Example and print it, so the
# feature names and value kinds can be read off the dump.
for raw_record in raw_dataset.take(1):
    example = tf.train.Example.FromString(raw_record.numpy())
    print(example)  # Text dump of the decoded example
    break  # Stop here instead of asking the iterator for another element

features {
  feature {
    key: "OBJECT-GRAVITY"
    value {
      float_list {
        value: 12.3318214
      }
    }
  }
  feature {
    key: "OBJECT-DIAMETER"
    value {
      float_list {
        value: 14835.8418
      }
    }
  }
  feature {
    key: "O3"
    value {
      float_list {
        value: 0
      }
    }
  }
  feature {
    key: "O2"
    value {
      float_list {
        value: 0
      }
    }
  }
  feature {
    key: "NOISY_ALBEDO_SS-Vis"
    value {
      float_list {
        value: 0.0317671038
        value: 0.0355698131
        value: 0.0336367153
        value: 0.0306186229
        value: 0.0317233615
        value: 0.0340811871
        value: 0.0326634608
        value: 0.0295192
        value: 0.0270108581
        value: 0.0303595252
        value: 0.0298757534
        value: 0.0340774097
        value: 0.0262790676
        value: 0.0276228953
        value: 0.0288922042
        value: 0.0307105277
        value: 0.0311210379
        value: 0.0260297619
   

In [12]:
# Dynamically build "infered_feature_description" from the inspected "example".
#   Caveat: the description is inferred from a single record. A feature that
#   holds exactly one value in this record is treated as a scalar, even if
#   other records store more values under the same name.

# Parsing dtype for each value-list kind a feature can carry.
dtype_by_kind = {
    "bytes_list": tf.string,
    "float_list": tf.float32,
    "int64_list": tf.int64,
}

infered_feature_description = {}
for key, value in example.features.feature.items():
    # Name of the populated list field, or None when the feature is unset.
    kind = value.WhichOneof("kind")
    if kind is None:
        continue  # Nothing to infer from a feature without a value list

    dtype = dtype_by_kind[kind]
    n_values = len(getattr(value, kind).value)

    if n_values == 1:
        # Exactly one value: parse as a scalar.
        infered_feature_description[key] = tf.io.FixedLenFeature([], dtype)
    else:
        # Several values, or none at all. An empty list must not become a
        # scalar FixedLenFeature: without a default it cannot parse a record
        # that holds no value for the feature.
        infered_feature_description[key] = tf.io.VarLenFeature(dtype)

In [13]:
# Show the feature description inferred from the first record.
infered_feature_description

{'C2H6': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_B-NIR': VarLenFeature(dtype=tf.float32),
 'ATMOSPHERE-PRESSURE': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_SS-NIR': VarLenFeature(dtype=tf.float32),
 'NOISY_ALBEDO_B-Vis': VarLenFeature(dtype=tf.float32),
 'ATMOSPHERE-TEMPERATURE': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_B-UV': VarLenFeature(dtype=tf.float32),
 'OBJECT-GRAVITY': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_SS-UV': VarLenFeature(dtype=tf.float32),
 'CO': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'CO2': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'N2O': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'O2': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_SS-Vis': VarLenFeature(dtype=tf.float32),
 'OBJECT-DIAMETER': FixedLenFeatur

In [14]:
# Parse the raw records with the inferred feature description.
#   Note: this rebinds "parsed_dataset", which earlier in the notebook held the
#   dataset parsed with "known_feature_description".
parsed_dataset = raw_dataset.map(lambda x: parse_example(x, infered_feature_description))

# Iterate through the parsed dataset and print each parsed sample.
for parsed_record in parsed_dataset.take(5):  # Show first 5 records
    print(parsed_record)

{'NOISY_ALBEDO_B-NIR': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([ 5.1329737e-03,  1.0343651e-02,  1.1966910e-02,  1.1626874e-02,
        1.2541611e-02,  1.0323183e-02,  6.2615336e-03, -1.1718279e-03,
        1.0237716e-03, -6.7926588e-04,  1.2632142e-04, -1.6419437e-03,
        1.3113797e-03,  5.7027111e-05,  1.1839274e-02,  1.8304676e-02,
        1.1562044e-02,  1.0227684e-02,  8.5638510e-03,  1.4616835e-03,
       -2.7343656e-03,  1.3833891e-03,  1.2720912e-03, -2.1073665e-03,
       -1.5837782e-03,  2.8093117e-03,  5.2028028e-03,  2.0953410e-03,
        7.8824153e-03,  7.2892508e-03,  1.4237561e-02,  4.3530264e-03,
        2.3537036e-03,  1.9755804e-04,  3.2456394e-04, -1.0324447e-02,
       -3.7055538e-04,  2.7044495e-03,  8.8926237e-03, -1.0009882e-03,
        5.0642076e-03,  1.3183929e-03, -1.5636869e-02,  1.0505169e-02,
        5.2323565e-03,  1.6043246e-02, -6.5575987e-03,  9.6550407e-03,
       -2.3949001e-02], dtype=float32)>, 'NOISY_ALBEDO_B-UV': <tf.Tensor: shap

2025-01-10 10:33:50.814719: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
