# Setup

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from geexhp import modelfuncs as mf

In [2]:
# Parsing function.
def parse_example(example_proto, feature_description):
    # Parse the example.
    parsed_features = tf.io.parse_single_example(example_proto, feature_description)

    # Convert SparseTensors to DenseTensors.
    #   This is necessary because we loaded features with variable length.
    #   Although the parsed files have the desired data, they have other things 
    #   (indices, values and dense_shape). Neural networks must work with values
    #   in a more direct way.
    dense_features = {
        key: tf.sparse.to_dense(value, default_value=0.0)
        if isinstance(value, tf.SparseTensor) else value
        for key, value in parsed_features.items()
    }

    return dense_features

# Write each filtered DataFrame to TFRecord files

In [3]:
mf.tfrecord_conversion.create_tfrecords(root_folder="../parallel", save_root="../data/TFRecord_data")

🌍 Progress: |[36m██████████████████████████████[0m| 100% (972/972 files) ⏳ [1:06:55 elapsed]


In [None]:
# My pc's memory BLEW UP
# mf.datasetup.concatenate_all_tfrecords(root_folder="../data/TFRecord_data", 
#                                         output_tfrecord_file="../data/all_samples.tfrecord")

# Train/Validation/Test split

In [3]:
mf.datasetup.train_val_test_split(root_folder="../data/TFRecord_data")

Concatenated TFRecord file saved to '../data/train.tfrecord'
Concatenated TFRecord file saved to '../data/val.tfrecord'
Concatenated TFRecord file saved to '../data/test.tfrecord'


# Reading TFRecord file

In [4]:
# Load some example file
tfrecord_file = "../data/val.tfrecord"

## Reading TFRecords with known feature_description

In [5]:
# Define feature description for parsing.
#   Comment every feature that will not be used.
known_feature_description = {
    'ALBEDO_B-NIR': tf.io.VarLenFeature(tf.float32),
    'ALBEDO_B-UV': tf.io.VarLenFeature(tf.float32),
    'ALBEDO_B-Vis': tf.io.VarLenFeature(tf.float32),

    'ALBEDO_SS-NIR': tf.io.VarLenFeature(tf.float32),
    'ALBEDO_SS-UV': tf.io.VarLenFeature(tf.float32),
    'ALBEDO_SS-Vis': tf.io.VarLenFeature(tf.float32),

    # "NOISE_B-NIR": tf.io.VarLenFeature(tf.float32),
    # "NOISE_B-UV": tf.io.VarLenFeature(tf.float32),
    # "NOISE_B-Vis": tf.io.VarLenFeature(tf.float32),

    # "NOISE_SS-NIR": tf.io.VarLenFeature(tf.float32),
    # "NOISE_SS-UV": tf.io.VarLenFeature(tf.float32),
    # "NOISE_SS-Vis": tf.io.VarLenFeature(tf.float32),

    "OBJECT-RADIUS-REL-EARTH": tf.io.FixedLenFeature([], tf.float32),
    "OBJECT-GRAVITY": tf.io.FixedLenFeature([], tf.float32),
    "ATMOSPHERE-TEMPERATURE": tf.io.FixedLenFeature([], tf.float32),
    "ATMOSPHERE-PRESSURE": tf.io.FixedLenFeature([], tf.float32),

    'Earth_type': tf.io.FixedLenFeature([], tf.string),

    'log_C2H6': tf.io.FixedLenFeature([], tf.float32),
    'log_CH4': tf.io.FixedLenFeature([], tf.float32),
    'log_CO': tf.io.FixedLenFeature([], tf.float32),
    'log_CO2': tf.io.FixedLenFeature([], tf.float32),
    'log_H2O': tf.io.FixedLenFeature([], tf.float32),
    'log_N2': tf.io.FixedLenFeature([], tf.float32),
    'log_N2O': tf.io.FixedLenFeature([], tf.float32),
    'log_O2': tf.io.FixedLenFeature([], tf.float32),
    'log_O3': tf.io.FixedLenFeature([], tf.float32)
}

In [6]:
# Read TFRecord.
dataset = tf.data.TFRecordDataset(tfrecord_file)
parsed_dataset = dataset.map(lambda x: parse_example(x, known_feature_description))

In [7]:
# Count the number of samples in the loaded file.
count = 0
for record in parsed_dataset.take(1):
    count += 1

In [8]:
# See the last "record" of "parsed_dataset" from the "for" loop.
#   Each iteration creates a "record" dictionary.
print(record)

{'ALBEDO_B-NIR': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([0.07281743, 0.07283936, 0.07069632, 0.06788086, 0.06567328,
       0.07061101, 0.0695759 , 0.04995696, 0.04774973, 0.05310502,
       0.06654228, 0.06953804, 0.06991164, 0.07010435, 0.06882755,
       0.06236838, 0.04542492, 0.06632992, 0.06856628, 0.0648034 ,
       0.04571091, 0.00651796, 0.00939601, 0.01355263, 0.02682355,
       0.04015636, 0.05056648, 0.05437744, 0.06758716, 0.07138806,
       0.0723701 , 0.07058319, 0.07204764, 0.07123744, 0.07214401,
       0.07200659, 0.07211933, 0.07151064, 0.06998043, 0.06719914,
       0.05820468, 0.02921344, 0.00724806, 0.00828598, 0.01412445,
       0.01386271, 0.03034944, 0.0499021 , 0.03907574], dtype=float32)>, 'ALBEDO_B-UV': <tf.Tensor: shape=(8,), dtype=float32, numpy=
array([0.00260186, 0.00412218, 0.0028862 , 0.01598497, 0.09057041,
       0.08518539, 0.07999045, 0.07617822], dtype=float32)>, 'ALBEDO_B-Vis': <tf.Tensor: shape=(94,), dtype=float32, numpy=
array([0

## Read TFRecords with unknown feature_description

In [9]:
# Read the TFRecord.
raw_dataset = tf.data.TFRecordDataset(tfrecord_file)

In [10]:
# Inspect and parse a single record to infer feature names.
for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)  # Inspect the parsed example
    break

features {
  feature {
    key: "ALBEDO_B-NIR"
    value {
      float_list {
        value: 0.07281742990016937
        value: 0.07283936440944672
        value: 0.07069631665945053
        value: 0.06788086146116257
        value: 0.06567327678203583
        value: 0.07061101496219635
        value: 0.06957589834928513
        value: 0.04995695501565933
        value: 0.047749731689691544
        value: 0.05310502275824547
        value: 0.06654228270053864
        value: 0.06953804194927216
        value: 0.06991163641214371
        value: 0.07010434567928314
        value: 0.0688275545835495
        value: 0.062368378043174744
        value: 0.0454249233007431
        value: 0.06632991880178452
        value: 0.06856627762317657
        value: 0.06480339914560318
        value: 0.045710913836956024
        value: 0.00651795556768775
        value: 0.009396013803780079
        value: 0.013552633114159107
        value: 0.02682354860007763
        value: 0.04015635699033737
        v

In [11]:
# Dynamically build an infered_feature_description
# Parse the inspected "example" to infer the feature description
infered_feature_description = {}
for key, value in example.features.feature.items():
    # Determine the type of the feature
    if value.HasField("bytes_list"):
        if len(value.bytes_list.value) > 1:  # Check for multiple values
            infered_feature_description[key] = tf.io.VarLenFeature(tf.string)
        else:
            infered_feature_description[key] = tf.io.FixedLenFeature([], tf.string)
    elif value.HasField("float_list"):
        if len(value.float_list.value) > 1:  # Check for multiple values
            infered_feature_description[key] = tf.io.VarLenFeature(tf.float32)
        else:
            infered_feature_description[key] = tf.io.FixedLenFeature([], tf.float32)
    elif value.HasField("int64_list"):
        if len(value.int64_list.value) > 1:  # Check for multiple values
            infered_feature_description[key] = tf.io.VarLenFeature(tf.int64)
        else:
            infered_feature_description[key] = tf.io.FixedLenFeature([], tf.int64)

In [12]:
infered_feature_description

{'log_CH4': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'log_N2': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'ATMOSPHERE-PRESSURE': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'OBJECT-RADIUS-REL-EARTH': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_SS-NIR': VarLenFeature(dtype=tf.float32),
 'NOISE_SS-NIR': VarLenFeature(dtype=tf.float32),
 'ATMOSPHERE-TEMPERATURE': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISE_B-Vis': VarLenFeature(dtype=tf.float32),
 'log_O3': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'ALBEDO_B-Vis': VarLenFeature(dtype=tf.float32),
 'NOISY_ALBEDO_SS-UV': VarLenFeature(dtype=tf.float32),
 'ALBEDO_SS-UV': VarLenFeature(dtype=tf.float32),
 'ALBEDO_B-UV': VarLenFeature(dtype=tf.float32),
 'log_O2': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISE_SS-UV': VarLenFeature(dtype=tf.float32),
 'NOISY_ALB

In [13]:
parsed_dataset = raw_dataset.map(lambda x: parse_example(x, infered_feature_description))

# Step 5: Iterate through the parsed dataset
for parsed_record in parsed_dataset.take(1):  # Show the first record
    print(parsed_record)

{'ALBEDO_B-NIR': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([0.07281743, 0.07283936, 0.07069632, 0.06788086, 0.06567328,
       0.07061101, 0.0695759 , 0.04995696, 0.04774973, 0.05310502,
       0.06654228, 0.06953804, 0.06991164, 0.07010435, 0.06882755,
       0.06236838, 0.04542492, 0.06632992, 0.06856628, 0.0648034 ,
       0.04571091, 0.00651796, 0.00939601, 0.01355263, 0.02682355,
       0.04015636, 0.05056648, 0.05437744, 0.06758716, 0.07138806,
       0.0723701 , 0.07058319, 0.07204764, 0.07123744, 0.07214401,
       0.07200659, 0.07211933, 0.07151064, 0.06998043, 0.06719914,
       0.05820468, 0.02921344, 0.00724806, 0.00828598, 0.01412445,
       0.01386271, 0.03034944, 0.0499021 , 0.03907574], dtype=float32)>, 'ALBEDO_B-UV': <tf.Tensor: shape=(8,), dtype=float32, numpy=
array([0.00260186, 0.00412218, 0.0028862 , 0.01598497, 0.09057041,
       0.08518539, 0.07999045, 0.07617822], dtype=float32)>, 'ALBEDO_B-Vis': <tf.Tensor: shape=(94,), dtype=float32, numpy=
array([0