# Setup

In [1]:
import json
import numpy as np
import tensorflow as tf

2025-03-07 22:57:11.228573: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-07 22:57:11.239800: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-07 22:57:11.325387: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-07 22:57:11.409143: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-07 22:57:11.492300: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

In [7]:
# Path to the training-split TFRecord. Normalization statistics are computed
# from the training data only.
tfrecord_file = "../data/train.tfrecord"

# Reading TFRecord file

In [3]:
def parse_fn(example):
    """Decode one serialized example into a dict of dense float32 tensors.

    The six ``NOISY_ALBEDO_*`` spectra are declared as variable-length
    features and converted from sparse to dense (missing entries become 0.0).
    The planetary parameters and chemical abundances are declared as scalar
    fixed-length features and are returned as parsed.

    Args:
        example: A serialized example, as yielded by ``tf.data.TFRecordDataset``.

    Returns:
        dict mapping every feature name to a dense tensor.
    """
    # Inputs: variable-length spectra.
    spectrum_keys = (
        'NOISY_ALBEDO_B-NIR', 'NOISY_ALBEDO_B-UV', 'NOISY_ALBEDO_B-Vis',
        'NOISY_ALBEDO_SS-NIR', 'NOISY_ALBEDO_SS-UV', 'NOISY_ALBEDO_SS-Vis',
    )
    # Targets: planetary parameters followed by chemical abundances.
    # ('C2H6' is deliberately left out of the spec.)
    scalar_keys = (
        'OBJECT-RADIUS-REL-EARTH', 'OBJECT-GRAVITY',
        'ATMOSPHERE-TEMPERATURE', 'ATMOSPHERE-PRESSURE',
        'CH4', 'CO', 'CO2', 'H2O', 'N2', 'N2O', 'O2', 'O3',
    )

    spec = {}
    for name in spectrum_keys:
        spec[name] = tf.io.VarLenFeature(tf.float32)
    for name in scalar_keys:
        spec[name] = tf.io.FixedLenFeature([], tf.float32)

    record = tf.io.parse_single_example(example, spec)

    # Only the VarLen features come back sparse; everything else passes through.
    densified = {}
    for name, tensor in record.items():
        if isinstance(tensor, tf.SparseTensor):
            densified[name] = tf.sparse.to_dense(tensor, default_value=0.0)
        else:
            densified[name] = tensor
    return densified

# Stream serialized records from the training file and decode each one with parse_fn.
dataset = tf.data.TFRecordDataset(tfrecord_file).map(parse_fn)

In [None]:
# `best_n` value per target, copied from the previous notebook
# (7-search_better_normalization.ipynb).
best_n_values = {
    # 'C2H6': 13.650000000000002,
    'CH4':                    6.65,
    'CO':                     6.8,
    'CO2':                    3.65,
    'H2O':                    4.8,
    'N2':                     0.35000000000000003,
    'N2O':                    16.150000000000002,
    'O2':                     2.35,
    'O3':                     13.8,
    'ATMOSPHERE-TEMPERATURE': 3.0,
}

# Computing normalization statistics into a dictionary

In [5]:
def compute_normalization_stats(train_tfrecord_path, best_n_values,
                                output_path='../data/normalization_stats.json'):
    """Compute normalization statistics from a TFRecord file and save them as JSON.

    The file is streamed once in batches. Three kinds of statistics are produced:

    * ``inputs``: for each of the six ``NOISY_ALBEDO_*`` spectra, the mean and
      (population) standard deviation over every value stored in the file.
    * ``outputs`` / planetary parameters: the observed ``min`` and ``max`` plus
      a ``best_n`` value. ``best_n`` is taken from ``best_n_values`` for
      ``ATMOSPHERE-TEMPERATURE`` and is 1 for the other three parameters.
    * ``outputs`` / chemical abundances: only ``best_n``, copied from
      ``best_n_values``.

    Args:
        train_tfrecord_path: Path of the TFRecord file to read.
        best_n_values: Mapping with a ``best_n`` entry for each of the eight
            chemicals and for ``'ATMOSPHERE-TEMPERATURE'``.
        output_path: Where the resulting dictionary is written as JSON.

    Returns:
        dict with the keys ``'inputs'`` and ``'outputs'`` as described above
        (plain Python floats, so it is JSON-serializable).

    Raises:
        KeyError: If ``best_n_values`` lacks a required entry.
        ValueError: If the file yields no spectrum values for some region
            (for example, when the TFRecord is empty).
    """
    input_regions = ('B-UV', 'B-Vis', 'B-NIR', 'SS-UV', 'SS-Vis', 'SS-NIR')
    planetary_params = ('OBJECT-RADIUS-REL-EARTH', 'OBJECT-GRAVITY',
                        'ATMOSPHERE-TEMPERATURE', 'ATMOSPHERE-PRESSURE')
    chemicals = ('CH4', 'CO', 'CO2', 'H2O', 'N2', 'N2O', 'O2', 'O3')
    # The only planetary parameter whose best_n comes from best_n_values.
    params_with_best_n = ('ATMOSPHERE-TEMPERATURE',)

    # best_n does not depend on the data, so resolve it once up front. This also
    # surfaces a missing key before the (potentially long) pass over the file.
    best_n = {param: 1.0 for param in planetary_params}
    for name in params_with_best_n + chemicals:
        best_n[name] = float(best_n_values[name])

    def parse_fn(example):
        """Parse one serialized example; spectra stay sparse, targets are scalars."""
        features = {
            f'NOISY_ALBEDO_{region}': tf.io.VarLenFeature(tf.float32)
            for region in input_regions
        }
        for name in planetary_params + chemicals:
            features[name] = tf.io.FixedLenFeature([], tf.float32)
        return tf.io.parse_single_example(example, features)

    dataset = tf.data.TFRecordDataset(train_tfrecord_path).map(parse_fn)

    moments = {region: {'sum': 0.0, 'sq_sum': 0.0, 'count': 0}
               for region in input_regions}
    ranges = {param: {'min': np.inf, 'max': -np.inf}
              for param in planetary_params}

    for batch in dataset.batch(1000):  # Process in chunks
        # =========== 1) Inputs (spectra) ===========
        for region in input_regions:
            # Use the stored values of the batched sparse tensor rather than a
            # densified copy: densifying zero-pads shorter spectra, and those
            # padding zeros would be counted in the mean/std.
            # Accumulate in float64 to limit round-off over many values.
            values = batch[f'NOISY_ALBEDO_{region}'].values.numpy().astype(np.float64)
            acc = moments[region]
            acc['sum'] += float(values.sum())
            acc['sq_sum'] += float(np.square(values).sum())
            acc['count'] += int(values.size)

        # =========== 2) Planetary parameters ===========
        for param in planetary_params:
            data = batch[param].numpy()
            ranges[param]['min'] = min(ranges[param]['min'], float(np.min(data)))
            ranges[param]['max'] = max(ranges[param]['max'], float(np.max(data)))

    # =========== 3) Final ===========
    final_stats = {'inputs': {}, 'outputs': {}}

    # 3a) Input stats
    for region in input_regions:
        acc = moments[region]
        if acc['count'] == 0:
            raise ValueError(
                f"No values found for NOISY_ALBEDO_{region} in "
                f"{train_tfrecord_path!r}; cannot compute normalization statistics."
            )
        mean = acc['sum'] / acc['count']
        # E[x^2] - E[x]^2 can come out marginally negative through round-off;
        # clamp so the square root never yields NaN.
        var = max(acc['sq_sum'] / acc['count'] - mean ** 2, 0.0)
        final_stats['inputs'][region] = {
            'mean': float(mean),
            'std':  float(np.sqrt(var))
        }

    # 3b) Planetary output stats
    for param in planetary_params:
        final_stats['outputs'][param] = {
            'min': float(ranges[param]['min']),
            'max':  float(ranges[param]['max']),
            'best_n':  best_n[param]
        }

    # 3c) Chemical output stats
    for chem in chemicals:
        final_stats['outputs'][chem] = {'best_n': best_n[chem]}

    with open(output_path, 'w') as f:
        json.dump(final_stats, f)

    return final_stats

In [8]:
# Run the computation on the training TFRecord. The function writes the JSON
# file and returns the statistics dict, which is displayed as the cell output.
compute_normalization_stats(tfrecord_file, best_n_values)

2025-03-07 23:02:22.691343: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


{'inputs': {'B-UV': {'mean': 0.05338878821926193, 'std': 0.1786631797483721},
  'B-Vis': {'mean': 0.038637963955897844, 'std': 0.034284439700595255},
  'B-NIR': {'mean': 0.021570751695650828, 'std': 0.17476710099282122},
  'SS-UV': {'mean': 0.05403417486509094, 'std': 0.050288985466582385},
  'SS-Vis': {'mean': 0.041268854632586255, 'std': 0.03669293001236297},
  'SS-NIR': {'mean': 0.024403290984017945, 'std': 0.2002496433797539}},
 'outputs': {'OBJECT-RADIUS-REL-EARTH': {'min': 0.5808368921279907,
   'max': 1.2299998998641968,
   'best_n': 1.0},
  'OBJECT-GRAVITY': {'min': 4.208383560180664,
   'max': 13.940491676330566,
   'best_n': 1.0},
  'ATMOSPHERE-TEMPERATURE': {'min': 273.1500244140625,
   'max': 383.37969970703125,
   'best_n': 3.0},
  'ATMOSPHERE-PRESSURE': {'min': 164.91307067871094,
   'max': 2046.6563720703125,
   'best_n': 1.0},
  'CH4': {'best_n': 6.65},
  'CO': {'best_n': 6.8},
  'CO2': {'best_n': 3.65},
  'H2O': {'best_n': 4.8},
  'N2': {'best_n': 0.35000000000000003},