# Setup

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from geexhp import modelfuncs as mf

In [2]:
# Parsing function.
def parse_example(example_proto, feature_description):
    """Parse one serialized example into a dictionary of dense tensors.

    Args:
        example_proto: A single serialized example, e.g. one element of a
            ``tf.data.TFRecordDataset``.
        feature_description: Dict mapping feature names to
            ``tf.io.FixedLenFeature`` / ``tf.io.VarLenFeature`` specs; passed
            unchanged to ``tf.io.parse_single_example``.

    Returns:
        Dict with one entry per parsed feature. Values that come back as
        ``tf.SparseTensor`` are converted to dense tensors; every other value
        is returned as parsed.
    """
    # Parse the example.
    parsed_features = tf.io.parse_single_example(example_proto, feature_description)

    # Convert SparseTensors to DenseTensors.
    #   Variable-length features are parsed as SparseTensors, which carry
    #   indices, values and dense_shape. Neural networks need the values in a
    #   more direct (dense) form.
    dense_features = {}
    for key, value in parsed_features.items():
        if isinstance(value, tf.SparseTensor):
            # Do not hard-code a float fill value: to_dense defaults to a zero
            # of the tensor's own dtype, so string and int64 variable-length
            # features convert as well instead of failing on a dtype mismatch.
            value = tf.sparse.to_dense(value)
        dense_features[key] = value

    return dense_features

# Write each filtered DataFrame to TFRecord files

In [3]:
# Create the TFRecord files: input is read from "../parallel" and the
#   results are saved under "../data/TFRecord_data".
#   NOTE: the recorded run of this cell took 1:17:19 for 1188 files.
mf.tfrecord_conversion.create_tfrecords(
    root_folder="../parallel",
    save_root="../data/TFRecord_data",
)

🌍 Progress: |[36m██████████████████████████████[0m| 100% (1188/1188 files) ⏳ [1:17:19 elapsed]


In [4]:
# Concatenate the TFRecord files found under "../data/TFRecord_data" into a
#   single file, "../data/all_samples.tfrecord".
mf.datasetup.concatenate_all_tfrecords(
    root_folder="../data/TFRecord_data",
    output_tfrecord_file="../data/all_samples.tfrecord",
)

Concatenated TFRecord file saved to '../data/all_samples.tfrecord'


# Train/Validation/Test split

In [3]:
# Split the TFRecord data under "../data/TFRecord_data" into train,
#   validation and test sets. The recorded run saved train.tfrecord,
#   val.tfrecord and test.tfrecord under "../data".
mf.datasetup.train_val_test_split(
    root_folder="../data/TFRecord_data",
)

Concatenated TFRecord file saved to '../data/train.tfrecord'
Concatenated TFRecord file saved to '../data/val.tfrecord'
Concatenated TFRecord file saved to '../data/test.tfrecord'


# Reading TFRecord file

In [3]:
# Path of the TFRecord file used as the example in the cells below.
#   Nothing is loaded here; the file is opened when a dataset is built from it.
tfrecord_file = "../data/train.tfrecord"

## Reading TFRecords with known feature_description

In [None]:
# Feature description used to parse each example.
#   Features are grouped by kind: variable-length float lists first, then
#   scalar floats, the scalar string label, and the scalar float abundances.
#   Comment out every feature name that will not be used.
known_feature_description = {
    # Variable-length float features.
    **dict.fromkeys(
        [
            "NOISY_ALBEDO_B-NIR",
            "NOISY_ALBEDO_B-UV",
            "NOISY_ALBEDO_B-Vis",

            "NOISY_ALBEDO_SS-NIR",
            "NOISY_ALBEDO_SS-UV",
            "NOISY_ALBEDO_SS-Vis",

            # "ALBEDO_B-NIR",
            # "ALBEDO_B-UV",
            # "ALBEDO_B-Vis",

            # "ALBEDO_SS-NIR",
            # "ALBEDO_SS-UV",
            # "ALBEDO_SS-Vis",

            # "NOISE_B-NIR",
            # "NOISE_B-UV",
            # "NOISE_B-Vis",

            # "NOISE_SS-NIR",
            # "NOISE_SS-UV",
            # "NOISE_SS-Vis",
        ],
        tf.io.VarLenFeature(tf.float32),
    ),

    # Scalar float features.
    **dict.fromkeys(
        [
            "OBJECT-RADIUS-REL-EARTH",
            "OBJECT-GRAVITY",
            "ATMOSPHERE-TEMPERATURE",
            "ATMOSPHERE-PRESSURE",
        ],
        tf.io.FixedLenFeature([], tf.float32),
    ),

    # Scalar string feature.
    "Earth_type": tf.io.FixedLenFeature([], tf.string),

    # Scalar float features named after molecules.
    **dict.fromkeys(
        ["C2H6", "CH4", "CO", "CO2", "H2O", "N2", "N2O", "O2", "O3"],
        tf.io.FixedLenFeature([], tf.float32),
    ),
}

In [5]:
# Open the TFRecord file and map the parser over it, using the
#   hand-written feature description.
dataset = tf.data.TFRecordDataset(tfrecord_file)
parsed_dataset = dataset.map(
    lambda serialized: parse_example(serialized, known_feature_description)
)

In [6]:
# Walk through the loaded file once to count its samples.
#   "count" stays 0 if the dataset is empty; after the loop, "record" still
#   holds the last parsed sample.
count = 0
for count, record in enumerate(parsed_dataset, start=1):
    pass

In [7]:
# Display the number of samples counted in the previous cell.
count

892073

In [None]:
# Sample counts of the three splits that make up our complete dataset.
train, val, test = 892073, 107422, 87419

# Size of the complete dataset.
sum((train, val, test))

1086914

In [None]:
# Sum of all samples simulated: add up the listed counts, then multiply
#   the total by 3.
3 * sum((
    500, 10000, 15000, 20000, 2500, 2500, 10000, 7500, 25000, 5000,
    5000, 10000, 40000, 50000, 10000, 10000, 25000, 60000, 70000,
    15000, 10000, 50000, 75000, 75000, 400000, 50000, 80000,
    10000, 35000, 5000, 30000, 20000, 25000,
))

3774000

In [None]:
# Fraction of all simulated samples that ended up in the final dataset
#   (a value between 0 and 1; multiply by 100 for a percentage).
1086914 / 3774000

0.2880005299417064

In [23]:
# Show the last "record" of "parsed_dataset", left over from the counting "for" loop.
#   Each iteration binds "record" to a dictionary of parsed features.
#   NOTE: this cell only works after the counting loop has been run.
print(record)

{'NOISY_ALBEDO_B-NIR': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([ 0.03120991,  0.02348019,  0.08156133,  0.08919134,  0.03829219,
        0.05038192,  0.05889713,  0.01329395,  0.00676298,  0.0138255 ,
       -0.0450606 ,  0.01303876, -0.00962674, -0.00400618,  0.00307687,
        0.00193075,  0.07204247,  0.06221881,  0.02312129,  0.06723561,
        0.03035603,  0.0078778 , -0.02315002, -0.05558172,  0.01798608,
       -0.02576451,  0.01677396,  0.0666002 ,  0.06341548, -0.0170542 ,
       -0.04238626,  0.07946615,  0.04889201,  0.04129051, -0.00141776,
       -0.04917079,  0.01374301, -0.04828964,  0.14550075, -0.05395419,
        0.01380909,  0.03263249, -0.30435345,  0.12736703, -0.20743378,
        0.21739912,  0.69267404, -0.39433643, -0.13628742], dtype=float32)>, 'NOISY_ALBEDO_B-UV': <tf.Tensor: shape=(8,), dtype=float32, numpy=
array([0.5803183 , 0.16433576, 0.10102464, 0.08263027, 0.05980235,
       0.05027221, 0.04765819, 0.04148417], dtype=float32)>, 'NOISY_ALB

## Reading TFRecords with unknown feature_description

In [24]:
# Read the TFRecord as raw records.
#   No feature description is applied here, so each element is still a
#   serialized example.
raw_dataset = tf.data.TFRecordDataset(tfrecord_file)

In [25]:
# Decode the first raw record into a tf.train.Example and print it, so the
#   stored feature names and value kinds can be read off.
#   take(1) yields at most one record, so the loop body runs at most once.
for raw_record in raw_dataset.take(1):
    example = tf.train.Example.FromString(raw_record.numpy())
    print(example)  # Inspect the parsed example

features {
  feature {
    key: "ALBEDO_B-NIR"
    value {
      float_list {
        value: 0.035285308957099915
        value: 0.035266026854515076
        value: 0.019947567954659462
        value: 0.009435092099010944
        value: 0.005379306152462959
        value: 0.023336881771683693
        value: 0.03449138253927231
        value: 0.027705520391464233
        value: 0.026704657822847366
        value: 0.02855076640844345
        value: 0.03278277814388275
        value: 0.03361545875668526
        value: 0.03138037025928497
        value: 0.02486272156238556
        value: 0.013438167050480843
        value: 0.004078489728271961
        value: 0.0008636921411380172
        value: 0.01162283681333065
        value: 0.02565583400428295
        value: 0.030765628442168236
        value: 0.025681309401988983
        value: 0.006950224284082651
        value: 0.009253139607608318
        value: 0.011361568234860897
        value: 0.018072450533509254
        value: 0.022849673405

In [26]:
# Dynamically build "infered_feature_description" from the inspected "example".
#   Caveat: the description is inferred from this ONE record only. A feature
#   that holds exactly one value here is treated as a scalar, even if other
#   records store more values under the same name.

# Value-list field of a tf.train.Feature -> dtype used to parse it.
LIST_FIELD_DTYPES = {
    "bytes_list": tf.string,
    "float_list": tf.float32,
    "int64_list": tf.int64,
}

infered_feature_description = {}
for key, value in example.features.feature.items():
    for list_field, dtype in LIST_FIELD_DTYPES.items():
        if not value.HasField(list_field):
            continue
        n_values = len(getattr(value, list_field).value)
        if n_values == 1:
            # Exactly one value: parse as a scalar.
            infered_feature_description[key] = tf.io.FixedLenFeature([], dtype)
        else:
            # Several values, or none at all: parse as variable length.
            # An empty list must not become a scalar FixedLenFeature, because
            # that cannot be parsed when no value is present.
            infered_feature_description[key] = tf.io.VarLenFeature(dtype)
        break
    # A feature with none of the three lists set is skipped.

In [27]:
# Display the inferred feature description (feature name -> parsing spec).
infered_feature_description

{'ALBEDO_SS-UV': VarLenFeature(dtype=tf.float32),
 'CO': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'N2O': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'ALBEDO_SS-NIR': VarLenFeature(dtype=tf.float32),
 'NOISY_ALBEDO_B-UV': VarLenFeature(dtype=tf.float32),
 'O3': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'ALBEDO_B-Vis': VarLenFeature(dtype=tf.float32),
 'NOISE_SS-NIR': VarLenFeature(dtype=tf.float32),
 'ALBEDO_B-UV': VarLenFeature(dtype=tf.float32),
 'Earth_type': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
 'OBJECT-GRAVITY': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'OBJECT-RADIUS-REL-EARTH': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'NOISY_ALBEDO_B-NIR': VarLenFeature(dtype=tf.float32),
 'CH4': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 'ATMOSPHERE-TEMPERATURE': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
 

In [28]:
# Parse the raw records with the inferred feature description.
#   NOTE: this rebinds "parsed_dataset", which earlier held the dataset
#   parsed with "known_feature_description".
parsed_dataset = raw_dataset.map(lambda x: parse_example(x, infered_feature_description))

# Iterate through the parsed dataset
for parsed_record in parsed_dataset.take(1):  # Show the first record
    print(parsed_record)

{'ALBEDO_B-NIR': <tf.Tensor: shape=(49,), dtype=float32, numpy=
array([0.03528531, 0.03526603, 0.01994757, 0.00943509, 0.00537931,
       0.02333688, 0.03449138, 0.02770552, 0.02670466, 0.02855077,
       0.03278278, 0.03361546, 0.03138037, 0.02486272, 0.01343817,
       0.00407849, 0.00086369, 0.01162284, 0.02565583, 0.03076563,
       0.02568131, 0.00695022, 0.00925314, 0.01136157, 0.01807245,
       0.02284967, 0.02810887, 0.02938152, 0.03359644, 0.03429016,
       0.03450478, 0.03184685, 0.03386741, 0.03248034, 0.03224425,
       0.03121728, 0.03276105, 0.03250477, 0.0323303 , 0.03248924,
       0.02942535, 0.01825682, 0.00732374, 0.00895857, 0.01300012,
       0.01146523, 0.01681874, 0.01777527, 0.01193611], dtype=float32)>, 'ALBEDO_B-UV': <tf.Tensor: shape=(8,), dtype=float32, numpy=
array([4.8044568e-04, 9.6254626e-05, 3.0884516e-05, 1.7581064e-04,
       1.8381573e-02, 5.8300469e-02, 4.2208802e-02, 1.3562340e-02],
      dtype=float32)>, 'ALBEDO_B-Vis': <tf.Tensor: shape=(94,), 