# Organizing Data into AI-Ready Format
Task: Prepare the cleaned data for machine learning, ensuring it is properly annotated and structured.
- Convert your data into a format suitable for ML (e.g., pandas DataFrame, NumPy arrays, Xarray).

- Ensure the data is well-documented with attributes, labels, and metadata.

- Include a notebook (notebooks/Prepare_AI_Ready_Data.ipynb) that clearly describes:

- The final shape of the data (number of samples, features, and target labels).

- A description of each feature/attribute.

- Save the final AI-ready data in a dedicated folder (data/ai_ready/).

In [None]:
# When preparing our data to be 'AI-ready', our main focus is spatially aligning all of our datasets and making sure
# they're of the same shape (and handling missing values)

# below is the same writeup for data cleaning since they two sections are intertwined for us

# you'll see in every one of the downloaded data that we load here, we squeeze the data, rename columns to short yet explanatory names
# with no special characters, and ensure spatial overlap by projecting to the same CRS

# a big part of this preprocessing since we're working with data of differing spatial resolution is the rio.enums.Resampling.bilinear method.
# as you know, our target data is in 50x50m spatial resolution which is what we're re-fitting other data sources to. This method allows data
# of larger spatial resolution to "shrink" so the 50x50m output cells take on whatever value of the bigger cell they're within. This calue
# is averaged between multiple cells if the topology is not perfect. This same averaging principle is in place when mapping finer spatial resolutions to 50x50cm

# for the main loop, you'll see that we iterate over tiles. by using the "Create grid" tool in QGIS, we are able to create 32x32km grids (1024 km**2)
# over the ASO LiDAR flights. For each grid, we subdivide all data within into 128x128 rasters of 50x50m spatial resolution (a little larger than 40 km**2).
# All subvised tiles are kept as long as they contain some raster data. It should be noted that the ASO LiDAR data are clipped to a water basin
# which might alarm some users that a lot of NaN values will exhibited by subtiles around the edges, but this is actually not the case after some 
# manual inspection. All subtiles have a significant-enough amount of data

# for handling missing values, you'll see that we have "gap maps" or various features where there are gaps in the 'ground truth' snow depth,
# sentinel-1 backscatter, and sentinel-2 multispectral data. The FCF data and elevation data do not have missing values so they didn't need to be handled
# we can treet these maps as masks when processing the data and ml model

In [1]:
from glob import glob
import numpy as np
import xarray as xr
import rasterio as rio
import rioxarray as rxr
import geopandas as gpd
import random
import matplotlib.pyplot as plt
from rioxarray import merge
import gc
from pyproj import Proj, transform
import math
import xdem

import warnings
warnings.filterwarnings("ignore")

In [26]:
def open_all_data(aso_path, home_path):
    aso_fn = aso_path.split('\\')[-1][:-4]
    S1_snowon_path = glob(rf'{home_path}\S1_rtc\S1_snow-on_*_for_{aso_fn}.nc')[0]
    S1_snowoff_path = glob(rf'{home_path}\S1_rtc\S1_snow-off_*_for_{aso_fn}.nc')[0]
    S1_snowon_mean_path = glob(rf'{home_path}\S1_rtc_mean\S1_snow-on_*_for_{aso_fn}.nc')[0]
    S1_snowoff_mean_path = glob(rf'{home_path}\S1_rtc_mean\S1_snow-off_*_for_{aso_fn}.nc')[0]
    # i messed up the file naming for sentinel 2 when i preprocessed the data, so i have to do this
    S2_path = glob(rf'{home_path}\sentinel_2\{aso_path.split('\\')[-1].split('_')[-2]}_for_{aso_fn}.nc')[0]
    fcf_path = glob(rf'{home_path}\fcf\fcf_for_{aso_fn}.nc')[0]
    dem_path = glob(rf'{home_path}\cop30\cop30_for_{aso_fn}.nc')[0]
    
    aso_ds = xr.open_dataset(aso_path).squeeze()
    aso_ds = aso_ds.rename({'band_data': 'aso_sd'})
    aso_ds['aso_sd'] = aso_ds['aso_sd'].where(aso_ds['aso_sd'] >= 0)
    
    S1_snowon_ds = xr.open_dataset(S1_snowon_path).squeeze()
    S1_snowon_ds = S1_snowon_ds.rename({'vv': 'snowon_vv', 'vh':'snowon_vh'})
    S1_snowon_ds = S1_snowon_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    S1_snowoff_ds = xr.open_dataset(S1_snowoff_path).squeeze()
    S1_snowoff_ds = S1_snowoff_ds.rename({'vv': 'snowoff_vv', 'vh':'snowoff_vh'})
    S1_snowoff_ds = S1_snowoff_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    
    S1_snowon_mean_ds = xr.open_dataset(S1_snowon_mean_path).squeeze()
    S1_snowon_mean_ds = S1_snowon_mean_ds.rename({'vv': 'snowon_vv_mean', 'vh':'snowon_vh_mean'})
    S1_snowon_mean_ds = S1_snowon_mean_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    S1_snowoff_mean_ds = xr.open_dataset(S1_snowoff_mean_path).squeeze()
    S1_snowoff_mean_ds = S1_snowoff_mean_ds.rename({'vv': 'snowoff_vv_mean', 'vh':'snowoff_vh_mean'})
    S1_snowoff_mean_ds = S1_snowoff_mean_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    
    S2_ds = xr.open_dataset(S2_path).squeeze()
    S2_ds = S2_ds.rio.write_crs(aso_ds.rio.crs)
    S2_ds = S2_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs) 
    
    fcf_ds = xr.open_dataset(fcf_path).squeeze()
    fcf_ds = fcf_ds.rename({'__xarray_dataarray_variable__': 'fcf'})
    fcf_ds = fcf_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    
    dem_ds = xr.open_dataset(dem_path).squeeze()
    dem_ds = dem_ds.rio.write_crs(aso_ds.rio.crs)
    dem_ds = dem_ds.rename({'__xarray_dataarray_variable__': 'elevation'})
    dem_ds = dem_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    
    ds_list = [aso_ds, S1_snowon_ds, S1_snowoff_ds, S2_ds, fcf_ds, dem_ds]
    
    ds = xr.merge(ds_list, compat='override', join='override').squeeze()

    # add terrain variables
    transform = (50, 0.0, ds.isel(x=0, y=0).x.item(), 0.0, 50, ds.isel(x=0, y=0).y.item())
    dem = xdem.DEM.from_array(ds.elevation.values, transform, crs=ds.rio.crs)
    ds['aspect'] = (('y', 'x'), xdem.terrain.aspect(dem).data.data)
    ds['slope'] = (('y', 'x'), xdem.terrain.slope(dem).data.data)
    ds['curvature'] = (('y', 'x'), xdem.terrain.curvature(dem).data.data)
    ds['tpi'] = (('y', 'x'), xdem.terrain.topographic_position_index(dem).data.data)
    ds['tri'] = (('y', 'x'), xdem.terrain.terrain_ruggedness_index(dem).data.data)

    return aso_fn, ds

In [41]:
subset_size = 128

home_path = r"C:\Users\JackE\uw\courses\aut24\ml_geo\final_data"
utm_zones = ['utm10n', 'utm11n', 'utm12n', 'utm13n']
total_subsets = 0

# loop through utm zones
for i, utm_zone in enumerate(utm_zones):
    print(f'working on {utm_zone} file {i}')
    aso_paths = glob(rf'{home_path}\ASO_50m_SD_cleaned\{utm_zone}\*')
    tile_names = [f'train_{utm_zone}_32km.shp', f'test_{utm_zone}_32km.shp', f'val_{utm_zone}_32km.shp']

    # define projections for lat and lon data variables
    utm_proj = Proj(proj='utm', zone=utm_zone[3:-1], ellps='WGS84')
    wgs84_proj = Proj(proj='latlong', datum='WGS84')

    #loop through train, val, test tiles
    for tile_set in tile_names:
        # open tiles 
        print(f'working on {tile_set}')
        tiles = gpd.read_file(rf'{home_path}\tiles\{tile_set}')

        #loop through ASO rasters
        for aso_path in aso_paths:
            # open aso raster
            raster_subsets = 0
            try:
                print(aso_path, home_path)
                aso_fn, ds = open_all_data(aso_path, home_path)
            except Exception as e:
                print(f'encountered error opening dataset {aso_fn}... missing S2 data, skipping')
                aso_fn = aso_path.split('\\')[-1][:-4]
                print(rf'no {home_path}\sentinel_2\{aso_path.split('\\')[-1].split('_')[-2]}_for_{aso_fn}.nc')
                continue
            print(f'working on {aso_fn}')

            # loop through tiles
            for tile in tiles.iterrows():
                # clip to tile extent
                try:
                    tile_ds = ds.rio.clip([tile[1].geometry], crs=ds.rio.crs, drop=True)
                except: #except if tile does not overlap aso raster
                    continue

                # pad ds to tile extent
                tile_ds = tile_ds.rio.pad_box(miny=tile[1].geometry.bounds[1],
                                              minx=tile[1].geometry.bounds[0],
                                              maxy=tile[1].geometry.bounds[3],
                                              maxx=tile[1].geometry.bounds[2])

                subset_count = 0
                #loop through subset locations
                for i in range(round((len(tile_ds.x)/subset_size))):
                    for j in range(round((len(tile_ds.y)/subset_size))):
                        ymin = j*subset_size
                        ymax = ymin+subset_size
                        xmin = i*subset_size
                        xmax = xmin+subset_size
            
                        subset_ds = tile_ds.isel(x=slice(xmin, xmax), y=slice(ymin, ymax))
            
                        valid_aso = np.invert(np.isnan(subset_ds.aso_sd))
                        
                        # check if subset has valid ASO pixels
                        if valid_aso.sum() == 0:
                            continue

                        # make sure dimensions are correct
                        if len(subset_ds.x) != subset_size or len(subset_ds.y) != subset_size:
                            continue
                    
                        else: # save subset
                            # create map of gaps
                            subset_ds['aso_gap_map'] = np.multiply(np.isnan(subset_ds.aso_sd), 1)
                            
                            # radar gap maps
                            subset_ds['rtc_gap_map'] = np.multiply(((np.isnan(subset_ds.snowon_vv) +
                                                                     np.isnan(subset_ds.snowon_vh) +
                                                                     np.isnan(subset_ds.snowoff_vv) +
                                                                     np.isnan(subset_ds.snowoff_vh)) > 0), 1)
    
                            #subset_ds['rtc_mean_gap_map'] = np.multiply(((np.isnan(subset_ds.snowon_vv_mean) +
                            #                                         np.isnan(subset_ds.snowon_vh_mean) +
                            #                                         np.isnan(subset_ds.snowoff_vv_mean) +
                            #                                         np.isnan(subset_ds.snowoff_vh_mean)) > 0), 1)
    
                            # s2 gap maps
                            subset_ds['s2_gap_map'] = np.multiply(((np.isnan(subset_ds.B02) +
                                                                   np.isnan(subset_ds.B03) +
                                                                   np.isnan(subset_ds.B04) +
                                                                   np.isnan(subset_ds.B08) +
                                                                   np.isnan(subset_ds.B11)) > 0), 1)

                            # add gaps for high probablity cloud cover
                            subset_ds['s2_gap_map'] = xr.where(subset_ds['SCL'] == 9, 1, subset_ds['s2_gap_map'])
                            subset_ds['s2_gap_map'] = xr.where(subset_ds['SCL'] == 0, 1, subset_ds['s2_gap_map'])
                            
                            # fill nans with 0 
                            subset_ds = subset_ds.fillna(0)

                            # add lat and lon variables 
                            x, y = np.meshgrid(subset_ds['x'].values, subset_ds['y'].values)
                            lon, lat = transform(utm_proj, wgs84_proj, x, y)
                            subset_ds['latitude'] = (('y', 'x'), lat)
                            subset_ds['longitude'] = (('y', 'x'), lon)
                            
                            subset_count+=1
                            total_subsets+=1
                            raster_subsets+=1
                            
                            subset_ds.to_netcdf(rf'{home_path}\subsets_v4\{tile_set.split("_")[0]}\{aso_fn}_tile{int(tile[1].id)}_s{subset_count}.nc')
                            # reproject to wgs for heatmaps
                            subset_ds = subset_ds.rio.reproject("EPSG:4326")
                            subset_ds.aso_sd.rio.to_raster(rf'{home_path}\subsets_v4_tif\{tile_set.split("_")[0]}\{aso_fn}_tile{int(tile[1].id)}_s{subset_count}.tif')
                        
            print(f'total subsets from {aso_fn}: {raster_subsets}')
            gc.collect()

working on utm10n file 0
working on train_utm10n_32km.shp
C:\Users\JackE\uw\courses\aut24\ml_geo\final_data\ASO_50m_SD_cleaned\utm10n\ASO_50M_SD_American_20230131_clean.tif C:\Users\JackE\uw\courses\aut24\ml_geo\final_data
working on ASO_50M_SD_American_20230131_clean
total subsets from ASO_50M_SD_American_20230131_clean: 100
C:\Users\JackE\uw\courses\aut24\ml_geo\final_data\ASO_50m_SD_cleaned\utm10n\ASO_50M_SD_American_20230413_clean.tif C:\Users\JackE\uw\courses\aut24\ml_geo\final_data
encountered error opening dataset ASO_50M_SD_American_20230131_clean... missing S2 data, skipping
no C:\Users\JackE\uw\courses\aut24\ml_geo\final_data\sentinel_2\20230413_for_ASO_50M_SD_American_20230413_clean.nc
C:\Users\JackE\uw\courses\aut24\ml_geo\final_data\ASO_50m_SD_cleaned\utm10n\ASO_50M_SD_American_20230428_clean.tif C:\Users\JackE\uw\courses\aut24\ml_geo\final_data
encountered error opening dataset ASO_50M_SD_American_20230413_clean... missing S2 data, skipping
no C:\Users\JackE\uw\courses\au

In [3]:
# final sizes
# each file is a 128x128 pixel raster with 50m resolution
# targets are not labels but continuous values representing snow depth in meters
print(f'Training data file count: {len(glob(r"C:\Users\JackE\uw\courses\aut24\ml_geo\final_data\subsets_v4\train\*.nc"))}')
print(f'Testing data file count: {len(glob(r"C:\Users\JackE\uw\courses\aut24\ml_geo\final_data\subsets_v4\test\*.nc"))}')
print(f'Validation data file count: {len(glob(r"C:\Users\JackE\uw\courses\aut24\ml_geo\final_data\subsets_v4\val\*.nc"))}')

Training data file count: 3366
Testing data file count: 451
Validation data file count: 589
