# Imports

In [8]:
import h5py
from netCDF4 import Dataset
import pandas as pd
import numpy as np

# File Reader

## Set path

In [17]:
# file path to the nc file (here I used a relative path and the file and this notebook are in the same directory)
file_path = 'P5023_magnetic_gradient_reduced.nc'

## Look at top-level groups

In [18]:
# get list of groups
with h5py.File(file_path, 'r') as f:
    groups = list(f.keys())

groups

['line_index',
 'line',
 'FLIGHT',
 'DATE',
 'EASTING',
 'FIDUCIAL',
 'latitude',
 'longitude',
 'NORTHING',
 'diurnal',
 'gnss_height_geoid',
 'gradient_longitudinal_lev',
 'gradient_transverse_lev',
 'igrf',
 'line_type',
 'mag_1_diur_igrf',
 'mag_1_diur_igrf_tie',
 'mag_1_diur_igrf_tie_micro',
 'pseudo_fid',
 'pseudo_line',
 'radar_calib_edit',
 'radar_dem',
 'point',
 'crs']

## Explore Group Contents

In [19]:
# Inspect the contents of the groups in the HDF5 file
with h5py.File(file_path, 'r') as f:
    for group_name in f.keys():  # Iterate through the top-level groups
        print(f"Group: {group_name}")  # Print the name of the group
        group = f[group_name]  # Access the group
        if isinstance(group, h5py.Group):  # Check if it is a subgroup
            print("  Subgroups/Datasets:")  # Indicate that subgroups or datasets exist
            for item_name in group.keys():  # Iterate through items in the subgroup
                item = group[item_name]  # Access the item
                if isinstance(item, h5py.Group):  # Check if the item is a subgroup
                    print(f"    Subgroup: {item_name}")  # Print the name of the subgroup
                elif isinstance(item, h5py.Dataset):  # Check if the item is a dataset
                    # Print the name, shape, and data type of the dataset
                    print(f"    Dataset: {item_name} - Shape: {item.shape}, Dtype: {item.dtype}")
        elif isinstance(group, h5py.Dataset):  # Check if the top-level group is a dataset
            # Print the name, shape, and data type of the dataset
            print(f"  Dataset: {group_name} - Shape: {group.shape}, Dtype: {group.dtype}")

Group: line_index
  Dataset: line_index - Shape: (24029501,), Dtype: uint32
Group: line
  Dataset: line - Shape: (802,), Dtype: uint32
Group: FLIGHT
  Dataset: FLIGHT - Shape: (802,), Dtype: int32
Group: DATE
  Dataset: DATE - Shape: (24029501,), Dtype: int32
Group: EASTING
  Dataset: EASTING - Shape: (24029501,), Dtype: float64
Group: FIDUCIAL
  Dataset: FIDUCIAL - Shape: (24029501,), Dtype: int32
Group: latitude
  Dataset: latitude - Shape: (24029501,), Dtype: float64
Group: longitude
  Dataset: longitude - Shape: (24029501,), Dtype: float64
Group: NORTHING
  Dataset: NORTHING - Shape: (24029501,), Dtype: float64
Group: diurnal
  Dataset: diurnal - Shape: (24029501,), Dtype: float64
Group: gnss_height_geoid
  Dataset: gnss_height_geoid - Shape: (24029501,), Dtype: float64
Group: gradient_longitudinal_lev
  Dataset: gradient_longitudinal_lev - Shape: (24029501,), Dtype: float64
Group: gradient_transverse_lev
  Dataset: gradient_transverse_lev - Shape: (24029501,), Dtype: float64
Group

## Extract Data From Datasets and Store Them in a Dataframe

In [20]:
# Initialize an empty dictionary to store datasets
data_dict = {}

# Open the NetCDF file and extract datasets
with h5py.File(file_path, 'r') as f:
    for group_name in f.keys():
        group = f[group_name]
        if isinstance(group, h5py.Dataset):  # Check if it's a dataset
            print(f"Loading dataset: {group_name}")
            if group.shape == ():  # Check if the dataset is scalar
                data_dict[group_name] = [group[()]]  # Wrap scalar in a list
            else:
                # Convert big-endian data to little-endian if necessary
                data = group[:]
                if data.dtype.byteorder == '>':  # Big-endian
                    data = data.astype(data.dtype.newbyteorder('<'))  # Convert to little-endian
                data_dict[group_name] = data  # Store the converted data

# Convert the dictionary to a Pandas DataFrame, automatically aligning lengths
df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data_dict.items()]))

# Display the first few rows of the DataFrame
print(df.head())

Loading dataset: line_index
Loading dataset: line
Loading dataset: FLIGHT
Loading dataset: DATE
Loading dataset: EASTING
Loading dataset: FIDUCIAL
Loading dataset: latitude
Loading dataset: longitude
Loading dataset: NORTHING
Loading dataset: diurnal
Loading dataset: gnss_height_geoid
Loading dataset: gradient_longitudinal_lev
Loading dataset: gradient_transverse_lev
Loading dataset: igrf
Loading dataset: line_type
Loading dataset: mag_1_diur_igrf
Loading dataset: mag_1_diur_igrf_tie
Loading dataset: mag_1_diur_igrf_tie_micro
Loading dataset: pseudo_fid
Loading dataset: pseudo_line
Loading dataset: radar_calib_edit
Loading dataset: radar_dem
Loading dataset: point
Loading dataset: crs
   line_index      line  FLIGHT      DATE    EASTING  FIDUCIAL   latitude  \
0           0  190180.0     2.0  20230522  355950.44   2120000 -31.800268   
1           0  100010.0     2.0  20230522  355950.41   2120050 -31.800291   
2           0  100020.0     2.0  20230522  355950.38   2120100 -31.800313  

## Export to csv file

In [None]:
# Convert the DataFrame to a CSV file
df.to_csv(f'{file_path}.csv', index=False)