# Setup

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from geexhp import modelfuncs as mf

In [2]:
# Parsing function.
def parse_example(example_proto, feature_description):
    """Parse one serialized example and densify its sparse features.

    Args:
        example_proto: Serialized example, passed straight to
            ``tf.io.parse_single_example``.
        feature_description: Mapping of feature name to the feature spec
            (e.g. ``tf.io.FixedLenFeature`` / ``tf.io.VarLenFeature``) used
            for parsing.

    Returns:
        dict: One entry per parsed feature. ``tf.SparseTensor`` values are
        converted with ``tf.sparse.to_dense``; every other value is returned
        unchanged.
    """
    parsed_features = tf.io.parse_single_example(example_proto, feature_description)

    # Variable-length features are parsed as SparseTensors (indices, values,
    # dense_shape); neural networks need the values in a more direct form.
    #   No explicit default_value is passed on purpose: a hard-coded 0.0 only
    #   matches float features, while the documented default ("zero") follows
    #   the dtype of the tensor being converted, so string and int64
    #   variable-length features can be densified as well.
    dense_features = {}
    for key, value in parsed_features.items():
        if isinstance(value, tf.SparseTensor):
            value = tf.sparse.to_dense(value)
        dense_features[key] = value

    return dense_features

# Write each filtered DataFrame to TFRecord files

In [2]:
# Slow step: the run logged below this cell processed 972 files in about 1h15.
mf.tfrecord_conversion.create_tfrecords(
    root_folder="../parallel",
    save_root="../data/TFRecord_data",
)

🌍 Progress: |[36m██████████████████████████████[0m| 100% (972/972 files) ⏳ [1:15:12 elapsed]


In [None]:
# Disabled: running this concatenation made my PC run out of memory.
# mf.datasetup.concatenate_all_tfrecords(root_folder="../data/TFRecord_data", 
#                                         output_tfrecord_file="../data/all_samples.tfrecord")

# Train/Validation/Test split

In [2]:
# Writes the train/val/test TFRecord files (paths are listed in the output below this cell).
mf.datasetup.train_val_test_split(root_folder="../data/TFRecord_data")

Concatenated TFRecord file saved to '../data/train.tfrecord'
Concatenated TFRecord file saved to '../data/val.tfrecord'
Concatenated TFRecord file saved to '../data/test.tfrecord'


# Reading TFRecord file

In [8]:
# Path of the TFRecord file used as the example in the cells below.
#   Nothing is read here; the file is opened later via tf.data.TFRecordDataset.
tfrecord_file = "../data/train.tfrecord"

## Reading TFRecords with known feature_description

In [9]:
# Hand-written feature description used to parse the TFRecord.
#   Comment out every feature that will not be used.
known_feature_description = {
    # Variable-length float features.
    "NOISY_ALBEDO_B-NIR_norm": tf.io.VarLenFeature(tf.float32),
    "NOISY_ALBEDO_B-UV_norm": tf.io.VarLenFeature(tf.float32),
    "NOISY_ALBEDO_B-Vis_norm": tf.io.VarLenFeature(tf.float32),

    "NOISY_ALBEDO_SS-NIR_norm": tf.io.VarLenFeature(tf.float32),
    "NOISY_ALBEDO_SS-UV_norm": tf.io.VarLenFeature(tf.float32),
    "NOISY_ALBEDO_SS-Vis_norm": tf.io.VarLenFeature(tf.float32),

    # Scalar float features.
    "OBJECT-RADIUS-REL-EARTH": tf.io.FixedLenFeature([], tf.float32),
    "LOG-OBJECT-GRAVITY": tf.io.FixedLenFeature([], tf.float32),
    "LOG-ATMOSPHERE-TEMPERATURE": tf.io.FixedLenFeature([], tf.float32),
    "BAR-ATMOSPHERE-PRESSURE": tf.io.FixedLenFeature([], tf.float32),

    # Scalar string feature.
    "Earth_type": tf.io.FixedLenFeature([], tf.string),

    # Scalar float features, one per "log_*" key.
    "log_C2H6": tf.io.FixedLenFeature([], tf.float32),
    "log_CH4": tf.io.FixedLenFeature([], tf.float32),
    "log_CO": tf.io.FixedLenFeature([], tf.float32),
    "log_CO2": tf.io.FixedLenFeature([], tf.float32),
    "log_H2O": tf.io.FixedLenFeature([], tf.float32),
    "log_N2": tf.io.FixedLenFeature([], tf.float32),
    "log_N2O": tf.io.FixedLenFeature([], tf.float32),
    "log_O2": tf.io.FixedLenFeature([], tf.float32),
    "log_O3": tf.io.FixedLenFeature([], tf.float32),
}

In [10]:
# Open the TFRecord file and parse every serialized record with the
# hand-written "known_feature_description".
dataset = tf.data.TFRecordDataset(tfrecord_file)
parsed_dataset = dataset.map(
    lambda serialized: parse_example(serialized, known_feature_description)
)

In [11]:
# Count the samples by iterating over the whole parsed dataset once.
#   "count" starts at 0 so it is still defined when the dataset is empty, and
#   "record" stays bound to the last element because a later cell prints it.
count = 0
for count, record in enumerate(parsed_dataset, start=1):
    pass

count

818862

In [15]:
# Show the last "record" produced by the "for" loop over "parsed_dataset".
#   Each iteration of that loop binds "record" to one parsed dictionary.
#   NOTE(review): "record" only exists if the counting loop ran earlier in this kernel.
print(record)

{'NOISY_ALBEDO_B-NIR_norm': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([0.19290747, 0.4705955 , 0.92336786, 0.9780375 , 1.        ,
       0.8519754 , 0.6444621 , 0.11573742, 0.08544008, 0.08230511,
       0.06641923, 0.04766941, 0.08281381, 0.35581365, 0.49564826,
       0.8360259 , 0.95209205, 0.86280465, 0.23325546, 0.10728307,
       0.09719153, 0.08755577, 0.09053305, 0.11302297, 0.10270455,
       0.09419525, 0.06915818, 0.15174624, 0.44844738, 0.70710295,
       0.91512257, 0.4217691 , 0.55867785, 0.17784967, 0.09373618,
       0.09798878, 0.03880819, 0.13935286, 0.04087469, 0.05424478,
       0.02453145, 0.09475626, 0.13082005, 0.        , 0.06039432,
       0.05327587, 0.15827681, 0.13541523, 0.22643006], dtype=float32)>, 'NOISY_ALBEDO_B-UV_norm': <tf.Tensor: shape=(8,), dtype=float32, numpy=
array([0.38739812, 1.        , 0.7866821 , 0.56027097, 0.35810304,
       0.19895948, 0.07906287, 0.        ], dtype=float32)>, 'NOISY_ALBEDO_B-Vis_norm': <tf.Tensor: shape=(94,

## Reading TFRecords with unknown feature_description

In [16]:
# Open the TFRecord file as a dataset of raw records.
#   No feature description is applied here; records are decoded in the cells below.
raw_dataset = tf.data.TFRecordDataset(tfrecord_file)

In [17]:
# Decode the first raw record as a tf.train.Example and print it, so the
# stored feature names and value lists can be read off the output.
#   take(1) yields at most one record, so the loop body runs at most once and
#   "example" remains available to the following cells.
for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)

features {
  feature {
    key: "ATMOSPHERE-PRESSURE"
    value {
      float_list {
        value: 1469.404541015625
      }
    }
  }
  feature {
    key: "ATMOSPHERE-TEMPERATURE"
    value {
      float_list {
        value: 298.20367431640625
      }
    }
  }
  feature {
    key: "BAR-ATMOSPHERE-PRESSURE"
    value {
      float_list {
        value: 1.4694045782089233
      }
    }
  }
  feature {
    key: "Earth_type"
    value {
      bytes_list {
        value: "modern"
      }
    }
  }
  feature {
    key: "LOG-ATMOSPHERE-TEMPERATURE"
    value {
      float_list {
        value: 2.474513053894043
      }
    }
  }
  feature {
    key: "LOG-OBJECT-GRAVITY"
    value {
      float_list {
        value: 1.0664476156234741
      }
    }
  }
  feature {
    key: "NOISY_ALBEDO_B-NIR"
    value {
      float_list {
        value: 0.131900355219841
        value: 0.13372254371643066
        value: 0.14016485214233398
        value: 0.13960550725460052
        value: 0.1371560543775

In [18]:
# Dynamically build "infered_feature_description" from the inspected "example".
#   Each feature stores its values in exactly one of the three list fields
#   below, which fixes the dtype. A feature holding exactly one value is
#   described as a scalar FixedLenFeature; any other length (including an empty
#   list) becomes a VarLenFeature, because a scalar FixedLenFeature without a
#   default value cannot parse an empty value list.
#   NOTE(review): the lengths come from a single record -- confirm that they
#   are representative of the whole file.
_DTYPE_BY_LIST_FIELD = (
    ("bytes_list", tf.string),
    ("float_list", tf.float32),
    ("int64_list", tf.int64),
)

infered_feature_description = {}
for key, value in example.features.feature.items():
    for list_field, dtype in _DTYPE_BY_LIST_FIELD:
        if not value.HasField(list_field):
            continue
        if len(getattr(value, list_field).value) == 1:
            infered_feature_description[key] = tf.io.FixedLenFeature([], dtype)
        else:
            infered_feature_description[key] = tf.io.VarLenFeature(dtype)
        break  # Only one list field is checked per feature, as in an if/elif chain.

In [19]:
# Display the feature description inferred from the inspected example.
infered_feature_description

{'OBJECT-GRAVITY': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_B-UV_norm': VarLenFeature(dtype=tf.float32),
 'NOISY_ALBEDO_B-Vis_norm': VarLenFeature(dtype=tf.float32),
 'log_C2H6': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_SS-NIR': VarLenFeature(dtype=tf.float32),
 'LOG-ATMOSPHERE-TEMPERATURE': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'log_CO2': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_SS-Vis': VarLenFeature(dtype=tf.float32),
 'log_O2': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'log_O3': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'ATMOSPHERE-TEMPERATURE': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'log_CH4': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'OBJECT-RADIUS-REL-EARTH': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'log_H2O': Fixe

In [21]:
# Parse the raw records with the inferred feature description.
parsed_dataset = raw_dataset.map(
    lambda serialized: parse_example(serialized, infered_feature_description)
)

# Print the first parsed record to check the result.
for parsed_record in parsed_dataset.take(1):
    print(parsed_record)

{'NOISY_ALBEDO_B-NIR': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([ 0.13190036,  0.13372254,  0.14016485,  0.1396055 ,  0.13715605,
        0.13061677,  0.11485048,  0.06300882,  0.05628837,  0.06195988,
        0.10928124,  0.11839666,  0.12006875,  0.1267577 ,  0.13167953,
        0.13820629,  0.1399527 ,  0.13899311,  0.11778532,  0.10889317,
        0.04351987,  0.00291775, -0.00448341, -0.00363231,  0.01141433,
        0.02516089,  0.05630337,  0.0775315 ,  0.11791225,  0.12504801,
        0.14137083,  0.14869632,  0.1621281 ,  0.13733484,  0.14457983,
        0.1234774 ,  0.12172243,  0.13029328,  0.13106982,  0.13412544,
        0.088451  ,  0.00223016,  0.0130842 , -0.02863907, -0.02366712,
        0.01563601,  0.0059543 ,  0.09792983,  0.10931743], dtype=float32)>, 'NOISY_ALBEDO_B-NIR_norm': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([0.8415463 , 0.85109824, 0.8848688 , 0.88193667, 0.86909664,
       0.8348178 , 0.75217104, 0.48041746, 0.44518894, 0.4749189