# Setup

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from geexhp import modelfuncs as mf

In [2]:
# Parsing function.
def parse_example(example_proto, feature_description):
    # Parse the example.
    parsed_features = tf.io.parse_single_example(example_proto, feature_description)

    # Convert SparseTensors to DenseTensors.
    #   This is necessary because we loaded features with variable length.
    #   Although the parsed files have the desired data, they have other things 
    #   (indices, values and dense_shape). Neural networks must work with values
    #   in a more direct way.
    dense_features = {
        key: tf.sparse.to_dense(value, default_value=0.0)
        if isinstance(value, tf.SparseTensor) else value
        for key, value in parsed_features.items()
    }

    return dense_features

# Write each filtered DataFrame to TFRecord files

In [3]:
mf.tfrecord_conversion.create_tfrecords(root_folder="../parallel", save_root="../data/TFRecord_data")

🌍 Progress: |[36m██████████████████████████████[0m| 100% (972/972 files) ⏳ [1:06:55 elapsed]


In [None]:
# My pc's memory BLEW UP
# mf.datasetup.concatenate_all_tfrecords(root_folder="../data/TFRecord_data", 
#                                         output_tfrecord_file="../data/all_samples.tfrecord")

# Train/Validation/Test split

In [3]:
mf.datasetup.train_val_test_split(root_folder="../data/TFRecord_data")

Concatenated TFRecord file saved to '../data/train.tfrecord'
Concatenated TFRecord file saved to '../data/val.tfrecord'
Concatenated TFRecord file saved to '../data/test.tfrecord'


# Reading TFRecord file

In [3]:
# Load some example file
tfrecord_file = "../data/train.tfrecord"

## Reading TFRecords with known feature_description

In [5]:
# Define feature description for parsing.
#   Comment every feature that will not be used.
known_feature_description = {
    'ALBEDO_B-NIR': tf.io.VarLenFeature(tf.float32),
    'ALBEDO_B-UV': tf.io.VarLenFeature(tf.float32),
    'ALBEDO_B-Vis': tf.io.VarLenFeature(tf.float32),

    'ALBEDO_SS-NIR': tf.io.VarLenFeature(tf.float32),
    'ALBEDO_SS-UV': tf.io.VarLenFeature(tf.float32),
    'ALBEDO_SS-Vis': tf.io.VarLenFeature(tf.float32),

    # "NOISE_B-NIR": tf.io.VarLenFeature(tf.float32),
    # "NOISE_B-UV": tf.io.VarLenFeature(tf.float32),
    # "NOISE_B-Vis": tf.io.VarLenFeature(tf.float32),

    # "NOISE_SS-NIR": tf.io.VarLenFeature(tf.float32),
    # "NOISE_SS-UV": tf.io.VarLenFeature(tf.float32),
    # "NOISE_SS-Vis": tf.io.VarLenFeature(tf.float32),

    "OBJECT-RADIUS-REL-EARTH": tf.io.FixedLenFeature([], tf.float32),
    "OBJECT-GRAVITY": tf.io.FixedLenFeature([], tf.float32),
    "ATMOSPHERE-TEMPERATURE": tf.io.FixedLenFeature([], tf.float32),
    "ATMOSPHERE-PRESSURE": tf.io.FixedLenFeature([], tf.float32),

    'Earth_type': tf.io.FixedLenFeature([], tf.string),

    'log_C2H6': tf.io.FixedLenFeature([], tf.float32),
    'log_CH4': tf.io.FixedLenFeature([], tf.float32),
    'log_CO': tf.io.FixedLenFeature([], tf.float32),
    'log_CO2': tf.io.FixedLenFeature([], tf.float32),
    'log_H2O': tf.io.FixedLenFeature([], tf.float32),
    'log_N2': tf.io.FixedLenFeature([], tf.float32),
    'log_N2O': tf.io.FixedLenFeature([], tf.float32),
    'log_O2': tf.io.FixedLenFeature([], tf.float32),
    'log_O3': tf.io.FixedLenFeature([], tf.float32)
}

In [6]:
# Read TFRecord.
dataset = tf.data.TFRecordDataset(tfrecord_file)
parsed_dataset = dataset.map(lambda x: parse_example(x, known_feature_description))

In [7]:
# Count the number of samples in the loaded file.
count = 0
for record in parsed_dataset.take(1):
    count += 1

In [8]:
# See the last "record" of "parsed_dataset" from the "for" loop.
#   Each iteration creates a "record" dictionary.
print(record)

{'ALBEDO_B-NIR': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([0.07281743, 0.07283936, 0.07069632, 0.06788086, 0.06567328,
       0.07061101, 0.0695759 , 0.04995696, 0.04774973, 0.05310502,
       0.06654228, 0.06953804, 0.06991164, 0.07010435, 0.06882755,
       0.06236838, 0.04542492, 0.06632992, 0.06856628, 0.0648034 ,
       0.04571091, 0.00651796, 0.00939601, 0.01355263, 0.02682355,
       0.04015636, 0.05056648, 0.05437744, 0.06758716, 0.07138806,
       0.0723701 , 0.07058319, 0.07204764, 0.07123744, 0.07214401,
       0.07200659, 0.07211933, 0.07151064, 0.06998043, 0.06719914,
       0.05820468, 0.02921344, 0.00724806, 0.00828598, 0.01412445,
       0.01386271, 0.03034944, 0.0499021 , 0.03907574], dtype=float32)>, 'ALBEDO_B-UV': <tf.Tensor: shape=(8,), dtype=float32, numpy=
array([0.00260186, 0.00412218, 0.0028862 , 0.01598497, 0.09057041,
       0.08518539, 0.07999045, 0.07617822], dtype=float32)>, 'ALBEDO_B-Vis': <tf.Tensor: shape=(94,), dtype=float32, numpy=
array([0

## Read TFRecords with unknown feature_description

In [4]:
# Read the TFRecord.
raw_dataset = tf.data.TFRecordDataset(tfrecord_file)

In [5]:
# Inspect and parse a single record to infer feature names.
for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)  # Inspect the parsed example
    break

features {
  feature {
    key: "ALBEDO_B-NIR"
    value {
      float_list {
        value: 0.05025285854935646
        value: 0.0503925196826458
        value: 0.050566416233778
        value: 0.05050891265273094
        value: 0.05011831223964691
        value: 0.048100486397743225
        value: 0.040273815393447876
        value: 0.016050180420279503
        value: 0.012905932031571865
        value: 0.016228564083576202
        value: 0.0350910909473896
        value: 0.041457440704107285
        value: 0.042752113193273544
        value: 0.045443471521139145
        value: 0.049107301980257034
        value: 0.04998315870761871
        value: 0.04805140197277069
        value: 0.04848159849643707
        value: 0.042082276195287704
        value: 0.032324276864528656
        value: 0.01254535187035799
        value: 0.00010377536818850785
        value: 3.900953379343264e-05
        value: 0.0001281941367778927
        value: 0.0019388102227821946
        value: 0.00635401206091

In [6]:
# Dynamically build an infered_feature_description
# Parse the inspected "example" to infer the feature description
infered_feature_description = {}
for key, value in example.features.feature.items():
    # Determine the type of the feature
    if value.HasField("bytes_list"):
        if len(value.bytes_list.value) > 1:  # Check for multiple values
            infered_feature_description[key] = tf.io.VarLenFeature(tf.string)
        else:
            infered_feature_description[key] = tf.io.FixedLenFeature([], tf.string)
    elif value.HasField("float_list"):
        if len(value.float_list.value) > 1:  # Check for multiple values
            infered_feature_description[key] = tf.io.VarLenFeature(tf.float32)
        else:
            infered_feature_description[key] = tf.io.FixedLenFeature([], tf.float32)
    elif value.HasField("int64_list"):
        if len(value.int64_list.value) > 1:  # Check for multiple values
            infered_feature_description[key] = tf.io.VarLenFeature(tf.int64)
        else:
            infered_feature_description[key] = tf.io.FixedLenFeature([], tf.int64)

In [7]:
infered_feature_description

{'ALBEDO_SS-NIR': VarLenFeature(dtype=tf.float32),
 'log_H2O': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'log_CH4': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_B-UV': VarLenFeature(dtype=tf.float32),
 'NOISE_SS-NIR': VarLenFeature(dtype=tf.float32),
 'OBJECT-DIAMETER': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'log_C2H6': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'ALBEDO_SS-Vis': VarLenFeature(dtype=tf.float32),
 'NOISY_ALBEDO_B-NIR': VarLenFeature(dtype=tf.float32),
 'OBJECT-RADIUS-REL-EARTH': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISE_B-NIR': VarLenFeature(dtype=tf.float32),
 'log_O2': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'ALBEDO_B-NIR': VarLenFeature(dtype=tf.float32),
 'ATMOSPHERE-TEMPERATURE': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISE_SS-Vis': VarLenFeature(dtype=tf.float32),
 'NOISE_B-

In [24]:
parsed_dataset = raw_dataset.map(lambda x: parse_example(x, infered_feature_description))

# Step 5: Iterate through the parsed dataset
for parsed_record in parsed_dataset.take(1):  # Show the first record
    print(parsed_record)

{'ALBEDO_B-NIR': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([5.0252859e-02, 5.0392520e-02, 5.0566416e-02, 5.0508913e-02,
       5.0118312e-02, 4.8100486e-02, 4.0273815e-02, 1.6050180e-02,
       1.2905932e-02, 1.6228564e-02, 3.5091091e-02, 4.1457441e-02,
       4.2752113e-02, 4.5443472e-02, 4.9107302e-02, 4.9983159e-02,
       4.8051402e-02, 4.8481598e-02, 4.2082276e-02, 3.2324277e-02,
       1.2545352e-02, 1.0377537e-04, 3.9009534e-05, 1.2819414e-04,
       1.9388102e-03, 6.3540121e-03, 1.2887077e-02, 1.7751804e-02,
       3.7387267e-02, 4.5945153e-02, 4.9076941e-02, 4.3262489e-02,
       4.8043266e-02, 4.5236491e-02, 4.8465434e-02, 4.8232760e-02,
       4.8538815e-02, 4.6852231e-02, 4.3312185e-02, 3.8365752e-02,
       2.4018871e-02, 3.0140276e-03, 4.3719177e-05, 1.0011658e-05,
       3.8204911e-05, 7.3290459e-05, 3.0695111e-03, 1.5446892e-02,
       1.4309292e-02], dtype=float32)>, 'ALBEDO_B-UV': <tf.Tensor: shape=(8,), dtype=float32, numpy=
array([0.00810695, 0.00095465, 