In [1]:
# Clone into the EMIT Resources Repository to access to helpful visualization tools, ONLY DO THIS ONCE
#!git clone https://github.com/nasa/EMIT-Data-Resources.git
#!cd EMIT-Data-Resources/python/modules
#!cp EMIT-Data-Resources/python/modules/emit_tools.py /home/jupyter/HIR_EMIT/hyperspectral_image_reconstruction_EMIT.ipynb

In [1]:
# Install necessary Python libraries
#!pip install torch --quiet
!pip install hvplot --quiet
!pip install netCDF4 --quiet
!pip install spectral --quiet
!pip install rasterio --quiet
!pip install rioxarray --quiet
!pip install cartopy geoviews --quiet
!pip install s3fs --quiet

In [2]:
import xarray as xr
import matplotlib.pyplot as plt
import torch as nn
import os
import numpy as np
import rioxarray
import xarray as xr
import holoviews as hv
import hvplot.xarray
import netCDF4 as nc
import math
import warnings
import sys
sys.path.append('/home/jupyter/HIR_EMIT/EMIT-Data-Resources/python/modules')

from emit_tools import emit_xarray

warnings.simplefilter('ignore')
hv.extension('bokeh')

In [3]:
class HyperspectralImageProcessor:
    def __init__(self, input_file, output_file):
        self.input_file = input_file
        self.output_file = output_file
        
    def load_dataset(self):
        """
        Load in the hyperspectral image and separate 'sensor_band_parameters' and 'location' in order to access any additional data dimeensions we need
        (i.e. wavelengths and geographic details).
        """
        
        # Initially opening file into xarray
        ds_nc = nc.Dataset(self.input_file)
        ds = xr.open_dataset(self.input_file)
        
        wvl = xr.open_dataset(file_path, group='sensor_band_parameters')
        loc = xr.open_dataset(file_path, group='location')
        
        # Set the group's data as coordinates to the main dataset (using the spatial variables of downtrack and crosstrack)
        ds = ds.assign_coords({'downtrack': ds.downtrack.data,
                       'crosstrack': ds.crosstrack.data,
                       **wvl.variables,
                       **loc.variables})
        
        # Since these datasets are large, we can go ahead and delete objects we won't be using to conserve memory.
        del wvl
        del loc
        
        # Switch dimensions of the dataset from bands to wavelength to provide easier access to our spectral data
        ds = ds.swap_dims({'bands': 'wavelengths'})
        return ds
    
    def orthorectify(self):
        """
        Orthorectification of the granule
        """
        ds_geo = emit_xarray(self.input_file, ortho=True)
        ds_geo.reflectance.data[ds_geo.reflectance.data == -9999] = np.nan  # Mask invalid values
        return ds_geo
    
    def save_chip_to_disk(self, chip, chip_index):
        """
        Save a single chip to a NetCDF file only if it has valid reflectance data (i.e., not all NaNs).

        Parameters:
            chip: The chip to be saved (xarray Dataset or DataArray)
            chip_index: The index of the chip (for naming the output file)
        """
        # Check if the reflectance data has any valid (non-NaN) values
        if chip.reflectance.isnull().all():
            print(f"Chip {chip_index} contains only NaN values and will not be saved.")
        else:
            output_file = f"{self.output_file}chip_{chip_index}.nc"
            chip.to_netcdf(output_file)
            print(f"Chip {chip_index} saved to {output_file}")
    
    def split_into_chips(self, ds_oc, chip_size=(256,256), overlap_percent=10):
        """
        Split the orthorectified hyperspectral image into smaller chips (subsets).
        
        Parameters:
            ds_geo: The orthorectified dataset
            chip_size: Tuple (height, width) for the chip size
            overlap: Whether to allow overlap between chips (default is False)
            
        Returns:
            List of chips (subsets of the image data)
        """
        lat_height,lon_width = chip_size
        chips_saved = 0
        
        # Calculate step size based on overlap_percent
        lat_step = int(lat_height * (1 - overlap_percent / 100))
        lon_step = int(lon_width * (1 - overlap_percent / 100))
        
        data_lat, data_lon, num_wavelength = ds_oc.reflectance.shape
        
        # Loop over the granule to extract chips
        for i in range(0, data_lat, lat_step):
            for j in range(0, data_lon, lon_step):
                # Ensure the chip fits within the bounds of the image
                chip = ds_oc.isel(latitude=slice(i, i+lat_height), longitude=slice(j, j+lon_width))
                
                # Save the chip to disk
                chips_saved += 1
                self.save_chip_to_disk(chip, chips_saved)
                
                # Delete the chip from memory to free up space
                del chip
        
        print(f"Saved {chips_saved} chips to disk.")
        
    def create_random_xy_pair(self, file_path, num_x_wavelengths=50):
        """
        Create X and Y pairs for model training with randomly selected wavelengths for X from a chip file.

        Parameters:
            file_path: The path to the NetCDF file containing the chip data.
            num_x_wavelengths: Number of wavelengths to choose randomly for X.

        Returns:
            X, Y: The input features (X) and target values (Y).
        """
        # Open the chip and get the list of available wavelengths in the dataset
        ds = xr.open_dataset(file_path)
        wavelengths = ds.wavelengths.values
        
        x_indicies = np.random.choice(len(wavelengths), size=num_x_wavelengths, replace=False)
        X_wavelengths = wavelengths[x_indicies]
        Y_wavelengths = np.setdiff1d(wavelengths, X_wavelengths)
        
        # Extract X and Y reflectance data based on the chosen wavelengths
        X = ds.sel(wavelengths=X_wavelengths)
        Y = ds.sel(wavelengths=Y_wavelengths)
        
        self.save_x_y_to_disk(X, Y, file_path)
        
        del X
        del Y
        
    def save_x_y_to_disk(self, X, Y, chip_index):
        """
        Save the X and Y pairs (input features and target values) to disk for each chip.
        
        Parameters:
            X: The input features (wavelengths selected for X).
            Y: The target values (wavelengths for Y).
            chip_index: The index of the chip to be used in the output filenames.
        """
        # Save X to disk
        X_output_file = f"{self.output_file}/chip_{chip_index}_X.nc"
        X.to_netcdf(X_output_file)
        print(f"Saved X to {X_output_file}")

        # Save Y to disk
        Y_output_file = f"{self.output_file}/chip_{chip_index}_Y.nc"
        Y.to_netcdf(Y_output_file)
        print(f"Saved Y to {Y_output_file}")
        
    def process_chip_directory(self, chip_dir, num_x_wavelengths=50):
        """
        Process all chips in the given directory and create random X-Y pairs for each chip.

        Parameters:
            chip_dir: The directory containing all the chip files.
            num_x_wavelengths: Number of wavelengths to choose randomly for X.
        """
        # List all NetCDF files in the chip directory
        chip_files = [f for f in os.listdir(chip_dir) if f.endswith('.nc')]

        # Process each chip file
        for chip_index, chip_file in enumerate(chip_files):
            file_path = os.path.join(chip_dir, chip_file)
            
            # Create random X-Y pair for this chip
            X, Y = self.create_random_xy_pair(file_path, num_x_wavelengths)
            
            # Print or save the created X and Y for debugging or training
            print(f"Processed chip {chip_index+1}: {chip_file}")
            print(f"X wavelengths: {X.wavelengths.values}")
            print(f"Y wavelengths: {Y.wavelengths.values}")
            

In [4]:
# State the location of the granule you are currently looking at
file_path = "EMIT_DATASET/EMIT_L2A_RFL_001_20241028T045642_2430203_015.nc"
output_path = "chips_XY/"
processor = HyperspectralImageProcessor(file_path, output_path)

In [5]:
ds = processor.load_dataset()
ds_oc = processor.orthorectify()

In [6]:
chip_directory = "chips/"
processor.process_chip_directory(chip_directory, num_x_wavelengths=50)

PermissionError: [Errno 13] Permission denied: '/home/jupyter/HIR_EMIT/chips_XY/chip_chips/chip_57.nc_X.nc'

In [None]:
ds_oc.wavelengths.values

In [12]:
chip_1 = xr.open_dataset('chips/chip_13.nc')
chip_1_wavelength = chip_1.sel(wavelengths=2600, method='nearest')
chip_1_wavelength.hvplot.image(cmap='viridis', aspect='equal', frame_width=720).opts(title=f"{chip_1_wavelength.wavelengths.values:.3f} {chip_1_wavelength.wavelengths.units}")
chip_1.sel(wavelengths=385, method='nearest').hvplot.image(cmap='Viridis', geo=True, tiles='ESRI', alpha=0.8, frame_height=600).opts(
    title=f"Reflectance at {chip_1_wavelength.wavelengths.values:.3f} {chip_1_wavelength.wavelengths.units} (Orthorectified)"
)

In [None]:
# We want to split granule wavelength into x and y, x being the prior (All the other wavelengths) and y being the wavelength we want to reconstruct
# Train, test, val in the spatial domain, we can use different granules, chips, different subsets of the image
# Upres, unet
# Model: super simple 2 CNN, no pooling, enough padding to reduce downsizing