In [1]:
import pandas as pd
from geexhp.model import datasetup as dset
import os
import tensorflow as tf
from tqdm import tqdm

2025-01-10 09:50:37.216851: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736513437.235191 3396623 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736513437.240720 3396623 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-10 09:50:37.260205: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Auxiliary Functions

In [2]:
# Helper functions to create tf.train.Features
def _bytes_feature(value):
    """Returns a bytes_list Feature from a string / bytes value.

    Args:
        value: A ``str`` (encoded with the default ``str.encode()`` codec,
            UTF-8) or a ``bytes`` object (stored unchanged).

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` holds the single value.
    """
    # Only text needs encoding; calling .encode() on bytes would raise
    # AttributeError even though bytes input is part of the documented contract.
    if isinstance(value, str):
        value = value.encode()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Wraps a single float in a Feature holding a one-element float_list."""
    # The scalar is placed in a list because FloatList stores repeated values.
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _float_feature_list(value):
    """Wraps a collection of floats in a Feature holding a float_list."""
    # Unlike _float_feature, the collection is handed to FloatList as-is.
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

In [3]:
# Function to serialize a row into tf.train.Example
def serialize_sample(row):
    """Encodes one sample as a serialized tf.train.Example.

    Args:
        row: Mapping indexed by column name. It must provide every key listed
            in the three tuples below plus 'Earth_type'.

    Returns:
        The result of ``SerializeToString()`` on the assembled Example.
    """
    # Columns stored as float lists.
    spectrum_keys = (
        'NOISY_ALBEDO_B-NIR',
        'NOISY_ALBEDO_B-UV',
        'NOISY_ALBEDO_B-Vis',
        'NOISY_ALBEDO_SS-NIR',
        'NOISY_ALBEDO_SS-UV',
        'NOISY_ALBEDO_SS-Vis',
    )
    # Columns stored as single floats.
    planet_keys = (
        'OBJECT-DIAMETER',
        'OBJECT-GRAVITY',
        'ATMOSPHERE-TEMPERATURE',
        'ATMOSPHERE-PRESSURE',
    )
    molecule_keys = ('C2H6', 'CH4', 'CO', 'CO2', 'H2O', 'N2', 'N2O', 'O2', 'O3')

    # Keys are inserted in the same order as the original literal dict.
    feature = {key: _float_feature_list(row[key]) for key in spectrum_keys}
    feature.update((key, _float_feature(row[key])) for key in planet_keys)
    feature['Earth_type'] = _bytes_feature(row['Earth_type'])
    feature.update((key, _float_feature(row[key])) for key in molecule_keys)

    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features).SerializeToString()

# Process: convert Parquet files to TFRecord

In [4]:
# Columns copied from every Parquet file into the TFRecord samples.
columns_of_interest = [
    # Serialized as float lists by serialize_sample
    'NOISY_ALBEDO_B-NIR', 'NOISY_ALBEDO_B-UV', 'NOISY_ALBEDO_B-Vis',
    'NOISY_ALBEDO_SS-NIR', 'NOISY_ALBEDO_SS-UV', 'NOISY_ALBEDO_SS-Vis',
    # Serialized as single floats by serialize_sample
    'OBJECT-DIAMETER', 'OBJECT-GRAVITY',
    'ATMOSPHERE-TEMPERATURE', 'ATMOSPHERE-PRESSURE',
    # Added to each frame from the file name during processing
    'Earth_type',
]

# Abundance columns written for every sample; a molecule that is absent from a
# file is filled with zeros during processing.
molecules = ['C2H6', 'CH4', 'CO', 'CO2', 'H2O', 'N2', 'N2O', 'O2', 'O3']

In [None]:
# Root directory whose first-level sub-folders hold the '.parquet' inputs.
root_folder = '../parallel'

# Sorted so the processing order is reproducible (os.listdir order is arbitrary).
folders = sorted(os.listdir(root_folder))

# Count exactly the files the conversion loop will process: '.parquet' files
# located directly inside a first-level sub-folder of root_folder. A recursive
# os.walk would also count files in the root itself or in deeper sub-folders,
# which the loop never reads, leaving the progress bar total too high.
file_count = sum(
    entry.endswith('.parquet')
    for folder in folders
    if os.path.isdir(os.path.join(root_folder, folder))
    for entry in os.listdir(os.path.join(root_folder, folder))
)

In [7]:
# Destination folder for all TFRecord files. It is created once up front:
# the path never changes, and exist_ok avoids the separate exists() check.
save_root = '../data/TFRecord_data'
os.makedirs(save_root, exist_ok=True)

# A row is dropped when any value inside any of its "NOISE_" columns exceeds this.
NOISE_THRESHOLD = 10

with tqdm(
    total=file_count,
    desc="🌍 Progress",
    dynamic_ncols=True,
    colour='cyan',
    bar_format="{desc}: |{bar:30}| {percentage:3.0f}% ({n_fmt}/{total_fmt} files) ⏳ [{elapsed} elapsed]"
) as pbar:

    for folder in folders:
        folder_path = os.path.join(root_folder, folder)

        if not os.path.isdir(folder_path):
            continue  # Skip stuff that is not a folder

        files = os.listdir(folder_path)

        for file in files:
            if not file.endswith(".parquet"):
                continue  # Skip non-parquet files

            file_path = os.path.join(folder_path, file)

            # The text before the first underscore of the file name is stored
            # as the sample label.
            earth_type = file.split("_")[0]

            df = pd.read_parquet(file_path)
            df["Earth_type"] = earth_type

            # Each cell of a "NOISE_" column is iterated element by element;
            # keep only the rows where no element is above the threshold.
            noise_columns = [col for col in df.columns if "NOISE_" in col]
            mask = ~df[noise_columns].map(
                lambda x: any(value > NOISE_THRESHOLD for value in x)
            ).any(axis=1)
            df = df[mask]

            df = dset.extract_abundances(df)

            # Select the wanted columns first and copy only those, so the
            # molecule columns below are assigned on an independent frame
            # rather than on a slice of df.
            filtered_df = df[columns_of_interest].copy()

            # Get all the molecules abundances.
            for molecule in molecules:
                if molecule in df.columns:
                    filtered_df[molecule] = df[molecule]
                else:
                    # Fill with zeros those who are not present.
                    filtered_df[molecule] = 0

            record_dict = filtered_df.to_dict(orient="records")

            # Writing to TFRecord file
            #   The files follow the following name structure:
            #   {earth_type}_{origin_folder}_{original_range_of_samples}_{number_of_actual_samples}
            tfrecord_file = f'{earth_type}_{folder}_{file.split("_")[1]}_{len(record_dict)}.tfrecord'
            save_path_file = os.path.join(save_root, tfrecord_file)

            with tf.io.TFRecordWriter(save_path_file) as writer:
                for sample in record_dict:
                    serialized_sample = serialize_sample(sample)
                    writer.write(serialized_sample)

            pbar.update(1)  # Update the progress bar for each file

🌍 Progress: |[36m                              [0m|   0% (2/972 files) ⏳ [00:02 elapsed]