# subset training data
This notebook contains functions to generate subsets from multiple raster sources

In [3]:
from glob import glob
import numpy as np
import xarray as xr
import rasterio as rio
import rioxarray as rxr
import geopandas as gpd
import random
import matplotlib.pyplot as plt
from rioxarray import merge
import gc
from pyproj import Proj, transform
import math
import xdem

import warnings
warnings.filterwarnings("ignore")

In [21]:
home_path = r"/mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data"
utm_zones = ['utm10n', 'utm11n', 'utm12n', 'utm13n']
utm_zone = 'utm10n'
aso_paths = glob(rf'{home_path}/ASO_50m_SD_cleaned/{utm_zone}/*')
aso_path = aso_paths[0]
aso_fn = aso_path.split('/')[-1][:-4]
S1_snowon_mean_path = glob(rf'{home_path}/S1_rtc_mean/S1_snow-on_*_for_{aso_fn}.nc')[0]
S1_snowoff_mean_path = glob(rf'{home_path}/S1_rtc_mean/S1_snow-off_*_for_{aso_fn}.nc')[0]

In [7]:
def open_all_data(aso_path, home_path):
    aso_fn = aso_path.split('/')[-1][:-4]
    S1_snowon_path = glob(rf'{home_path}/S1_rtc/S1_snow-on_*_for_{aso_fn}.nc')[0]
    S1_snowoff_path = glob(rf'{home_path}/S1_rtc/S1_snow-off_*_for_{aso_fn}.nc')[0]
    S1_snowon_mean_path = glob(rf'{home_path}/S1_rtc_mean/S1_snow-on_*_for_{aso_fn}.nc')[0]
    S1_snowoff_mean_path = glob(rf'{home_path}/S1_rtc_mean/S1_snow-off_*_for_{aso_fn}.nc')[0]
    # i messed up the file naming for sentinel 2 when i preprocessed the data, so i have to do this
    S2_path = glob(rf'{home_path}/sentinel_2/{aso_path.split('/')[-1].split('_')[-2]}_for_{aso_fn}.nc')[0]
    fcf_path = glob(rf'{home_path}/fcf/fcf_for_{aso_fn}.nc')[0]
    dem_path = glob(rf'{home_path}/cop30/cop30_for_{aso_fn}.nc')[0]
    
    aso_ds = xr.open_dataset(aso_path).squeeze()
    aso_ds = aso_ds.rename({'band_data': 'aso_sd'})
    aso_ds['aso_sd'] = aso_ds['aso_sd'].where(aso_ds['aso_sd'] >= 0)
    
    S1_snowon_ds = xr.open_dataset(S1_snowon_path).squeeze()
    S1_snowon_ds = S1_snowon_ds.rename({'vv': 'snowon_vv', 'vh':'snowon_vh'})
    S1_snowon_ds = S1_snowon_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    S1_snowoff_ds = xr.open_dataset(S1_snowoff_path).squeeze()
    S1_snowoff_ds = S1_snowoff_ds.rename({'vv': 'snowoff_vv', 'vh':'snowoff_vh'})
    S1_snowoff_ds = S1_snowoff_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    
    S1_snowon_mean_ds = xr.open_dataset(S1_snowon_mean_path).squeeze()
    S1_snowon_mean_ds = S1_snowon_mean_ds.rename({'vv': 'snowon_vv_mean', 'vh':'snowon_vh_mean'})
    S1_snowon_mean_ds = S1_snowon_mean_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    S1_snowoff_mean_ds = xr.open_dataset(S1_snowoff_mean_path).squeeze()
    S1_snowoff_mean_ds = S1_snowoff_mean_ds.rename({'vv': 'snowoff_vv_mean', 'vh':'snowoff_vh_mean'})
    S1_snowoff_mean_ds = S1_snowoff_mean_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    
    S2_ds = xr.open_dataset(S2_path).squeeze()
    S2_ds = S2_ds.rio.write_crs(aso_ds.rio.crs)
    S2_ds = S2_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs) 
    
    fcf_ds = xr.open_dataset(fcf_path).squeeze()
    fcf_ds = fcf_ds.rename({'__xarray_dataarray_variable__': 'fcf'})
    fcf_ds = fcf_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    
    dem_ds = xr.open_dataset(dem_path).squeeze()
    dem_ds = dem_ds.rio.write_crs(aso_ds.rio.crs)
    dem_ds = dem_ds.rename({'__xarray_dataarray_variable__': 'elevation'})
    dem_ds = dem_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)
    
    ds_list = [aso_ds, S1_snowon_ds, S1_snowoff_ds, S1_snowon_mean_ds, S1_snowoff_mean_ds, S2_ds, fcf_ds, dem_ds]
    
    ds = xr.merge(ds_list, compat='override', join='override').squeeze()

    # add terrain variables
    transform = (50, 0.0, ds.isel(x=0, y=0).x.item(), 0.0, 50, ds.isel(x=0, y=0).y.item())
    dem = xdem.DEM.from_array(ds.elevation.values, transform, crs=ds.rio.crs)
    ds['aspect'] = (('y', 'x'), xdem.terrain.aspect(dem).data.data)
    ds['slope'] = (('y', 'x'), xdem.terrain.slope(dem).data.data)
    ds['curvature'] = (('y', 'x'), xdem.terrain.curvature(dem).data.data)
    ds['tpi'] = (('y', 'x'), xdem.terrain.topographic_position_index(dem).data.data)
    ds['tri'] = (('y', 'x'), xdem.terrain.terrain_ruggedness_index(dem).data.data)

    return aso_fn, ds

In [45]:
aso_ds = xr.open_dataset(aso_path).squeeze()
aso_ds = aso_ds.rename({'band_data': 'aso_sd'})
aso_ds['aso_sd'] = aso_ds['aso_sd'].where(aso_ds['aso_sd'] >= 0)

In [46]:
S1_snowon_mean_ds = xr.open_dataset(S1_snowon_mean_path).squeeze()
S1_snowon_mean_ds = S1_snowon_mean_ds.rename({'vv': 'snowon_vv_mean', 'vh':'snowon_vh_mean'})
S1_snowon_mean_ds = S1_snowon_mean_ds.rio.reproject_match(aso_ds, resampling=rio.enums.Resampling.bilinear, crs=aso_ds.rio.crs)

In [47]:
S1_snowon_mean_ds

In [8]:
subset_size = 128

home_path = "/mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data"
utm_zones = ['utm10n', 'utm11n', 'utm12n', 'utm13n']
total_subsets = 0

# loop through utm zones
for i, utm_zone in enumerate(utm_zones):
    print(f'working on {utm_zone} file {i}')
    aso_paths = glob(rf'{home_path}/ASO_50m_SD_cleaned/{utm_zone}/*')
    tile_names = [f'train_{utm_zone}_32km.shp', f'test_{utm_zone}_32km.shp', f'val_{utm_zone}_32km.shp']

    # define projections for lat and lon data variables
    utm_proj = Proj(proj='utm', zone=utm_zone[3:-1], ellps='WGS84')
    wgs84_proj = Proj(proj='latlong', datum='WGS84')

    #loop through train, val, test tiles
    for tile_set in tile_names:
        # open tiles 
        print(f'working on {tile_set}')
        tiles = gpd.read_file(rf'{home_path}/tiles/{tile_set}')

        #loop through ASO rasters
        for aso_path in aso_paths:
            # open aso raster
            raster_subsets = 0

            try:
                print(aso_path, home_path)
                aso_fn, ds = open_all_data(aso_path, home_path)
            except Exception as e:
                print(f'encountered error opening dataset {aso_fn}... missing S2 data, skipping')
                aso_fn = aso_path.split('/')[-1][:-4]
                print(e)
                # print(rf'no {home_path}/sentinel_2/{aso_path.split('/')[-1].split('_')[-2]}_for_{aso_fn}.nc')
                continue
            print(f'working on {aso_fn}')

            # loop through tiles
            for tile in tiles.iterrows():
                # clip to tile extent
                try:
                    tile_ds = ds.rio.clip([tile[1].geometry], crs=ds.rio.crs, drop=True)
                except: #except if tile does not overlap aso raster
                    continue

                # pad ds to tile extent
                tile_ds = tile_ds.rio.pad_box(miny=tile[1].geometry.bounds[1],
                                              minx=tile[1].geometry.bounds[0],
                                              maxy=tile[1].geometry.bounds[3],
                                              maxx=tile[1].geometry.bounds[2])

                subset_count = 0
                #loop through subset locations
                for i in range(round((len(tile_ds.x)/subset_size))):
                    for j in range(round((len(tile_ds.y)/subset_size))):
                        ymin = j*subset_size
                        ymax = ymin+subset_size
                        xmin = i*subset_size
                        xmax = xmin+subset_size
            
                        subset_ds = tile_ds.isel(x=slice(xmin, xmax), y=slice(ymin, ymax))
            
                        valid_aso = np.invert(np.isnan(subset_ds.aso_sd))
                        
                        # check if subset has valid ASO pixels
                        if valid_aso.sum() == 0:
                            continue

                        # make sure dimensions are correct
                        if len(subset_ds.x) != subset_size or len(subset_ds.y) != subset_size:
                            continue
                    
                        else: # save subset
                            # create map of gaps
                            subset_ds['aso_gap_map'] = np.multiply(np.isnan(subset_ds.aso_sd), 1)
                            
                            # radar gap maps
                            subset_ds['rtc_gap_map'] = np.multiply(((np.isnan(subset_ds.snowon_vv) +
                                                                     np.isnan(subset_ds.snowon_vh) +
                                                                     np.isnan(subset_ds.snowoff_vv) +
                                                                     np.isnan(subset_ds.snowoff_vh)) > 0), 1)
    
                            subset_ds['rtc_mean_gap_map'] = np.multiply(((np.isnan(subset_ds.snowon_vv_mean) +
                                                                     np.isnan(subset_ds.snowon_vh_mean) +
                                                                     np.isnan(subset_ds.snowoff_vv_mean) +
                                                                     np.isnan(subset_ds.snowoff_vh_mean)) > 0), 1)
    
                            # s2 gap maps
                            subset_ds['s2_gap_map'] = np.multiply(((np.isnan(subset_ds.B02) +
                                                                   np.isnan(subset_ds.B03) +
                                                                   np.isnan(subset_ds.B04) +
                                                                   np.isnan(subset_ds.B08) +
                                                                   np.isnan(subset_ds.B11)) > 0), 1)

                            # add gaps for high probablity cloud cover
                            subset_ds['s2_gap_map'] = xr.where(subset_ds['SCL'] == 9, 1, subset_ds['s2_gap_map'])
                            subset_ds['s2_gap_map'] = xr.where(subset_ds['SCL'] == 0, 1, subset_ds['s2_gap_map'])
                            
                            # fill nans with 0 
                            subset_ds = subset_ds.fillna(0)

                            # add lat and lon variables 
                            x, y = np.meshgrid(subset_ds['x'].values, subset_ds['y'].values)
                            lon, lat = transform(utm_proj, wgs84_proj, x, y)
                            subset_ds['latitude'] = (('y', 'x'), lat)
                            subset_ds['longitude'] = (('y', 'x'), lon)
                            
                            subset_count+=1
                            total_subsets+=1
                            raster_subsets+=1
                            
                            subset_ds.to_netcdf(rf'{home_path}/subsets_v5/{tile_set.split("_")[0]}/{aso_fn}_tile{int(tile[1].id)}_s{subset_count}.nc')
                            # reproject to wgs for heatmaps
                            subset_ds = subset_ds.rio.reproject("EPSG:4326")
                            subset_ds.aso_sd.rio.to_raster(rf'{home_path}/subsets_v5_tif/{tile_set.split("_")[0]}/{aso_fn}_tile{int(tile[1].id)}_s{subset_count}.tif')
                
                        
            print(f'total subsets from {aso_fn}: {raster_subsets}')
            gc.collect()

working on utm10n file 0
working on train_utm10n_32km.shp
/mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data/ASO_50m_SD_cleaned/utm10n/ASO_50M_SD_American_20230131_clean.tif /mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data
working on ASO_50M_SD_American_20230131_clean
total subsets from ASO_50M_SD_American_20230131_clean: 100
/mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data/ASO_50m_SD_cleaned/utm10n/ASO_50M_SD_American_20230413_clean.tif /mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data
encountered error opening dataset ASO_50M_SD_American_20230131_clean... missing S2 data, skipping
list index out of range
/mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data/ASO_50m_SD_cleaned/utm10n/ASO_50M_SD_American_20230428_clean.tif /mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data
encountered error opening dataset ASO_50M_SD_American_20230413_clean... missing S2 data, skipping
list index out of range
/mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data/ASO_50m_SD_cleaned/u

: 

In [60]:
tile_set.split("_")[0]

'train'

In [66]:
test = xr.open_dataset(path_list[0])

In [67]:
test

In [4]:
# grab data paths
data_dir = '/mnt/c/Users/JackE/uw/courses/aut24/ml_geo/final_data/subsets_v5/train'
path_list = glob(f'{data_dir}/ASO_50M_SD*.nc')

In [2]:
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import torch
from glob import glob
import seaborn as sns

import deep_snow.dataset

In [5]:
# define data to be returned by dataloader
selected_channels = [
    # ASO products
    'aso_sd', # ASO lidar snow depth (target dataset)
    'aso_gap_map', # gaps in ASO data
    
    # Sentinel-1 products
    'snowon_vv', # snow on Sentinel-1 VV polarization backscatter in dB, closest acquisition to ASO acquisition
    'snowon_vh', # snow on Sentinel-1 VH polarization backscatter in dB, closest acquisition to ASO acquisition
    'snowoff_vv', # snow off Sentinel-1 VV polarization backscatter in dB, closest acquisition to ASO acquisition
    'snowoff_vh', # snow off Sentinel-1 VH polarization backscatter in dB, closest acquisition to ASO acquisition
    'snowon_vv_mean', # snow on Sentinel-1 VV polarization backscatter in dB, mean of acquisition in 4 week period around ASO acquisition
    'snowon_vh_mean', # snow on Sentinel-1 VH polarization backscatter in dB, mean of acquisition in 4 week period around ASO acquisition
    'snowoff_vv_mean', # snow off Sentinel-1 VV polarization backscatter in dB, mean of acquisition in 4 week period around ASO acquisition
    'snowoff_vh_mean', # snow off Sentinel-1 VH polarization backscatter in dB, mean of acquisition in 4 week period around ASO acquisition
    'snowon_cr', # cross ratio, snowon_vh - snowon_vv
    'snowoff_cr', # cross ratio, snowoff_vh - snowoff_vv
    'delta_cr', # change in cross ratio, snowon_cr - snowoff_cr
    'rtc_gap_map', # gaps in Sentinel-1 data
    'rtc_mean_gap_map', # gaps in Sentinel-1 mean data
    
    # Sentinel-2 products 
    'aerosol_optical_thickness', # snow on Sentinel-2 aerosol optical thickness band 
    'coastal_aerosol', # snow on Sentinel-2 coastal aerosol band
    'blue', # snow on Sentinel-2 blue band
    'green', # snow on Sentinel-2 green band
    'red', # snow on Sentinel-2 red band
    'red_edge1', # snow on Sentinel-2 red edge 1 band
    'red_edge2', # snow on Sentinel-2 red edge 2 band
    'red_edge3', # snow on Sentinel-2 red edge 3 band
    'nir', # snow on Sentinel-2 near infrared band
    'water_vapor', # snow on Sentinel-2 water vapor
    'swir1', # snow on Sentinel-2 shortwave infrared band 1
    'swir2', # snow on Sentinel-2 shortwave infrared band 2
    'scene_class_map', # snow on Sentinel-2 scene classification product
    'water_vapor_product', # snow on Sentinel-2 water vapor product
    'ndvi', # Normalized Difference Vegetation Index from Sentinel-2
    'ndsi', # Normalized Difference Snow Index from Sentinel-2
    'ndwi', # Normalized Difference Water Index from Sentinel-2
    's2_gap_map', # gaps in Sentinel-2 data

    # PROBA-V global land cover dataset (Buchhorn et al., 2020)
    'fcf', # fractional forest cover
    
    # COP30 digital elevation model      
    'elevation',
    'slope',
    'aspect',
    'curvature',
    'tpi',
    'tri',

    # latitude and longitude
    'latitude',
    'longitude',

    # day of water year
    'dowy'
                    ]

# prepare training and validation dataloaders
train_data = deep_snow.dataset.Dataset(path_list, selected_channels, norm=True)
train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=1, shuffle=True)

In [6]:
for i, data_tuple in enumerate(train_loader):
    break

In [4]:
# def sample_ds(ds, subset_size):
#     minx = 0
#     miny = 0
#     maxx = len(ds.x)-subset_size
#     maxy = len(ds.y)-subset_size

#     sub_minx = random.randint(minx, maxx)
#     sub_miny = random.randint(miny, maxy)
#     subset = ds.isel(x=slice(sub_minx, sub_minx+subset_size), y=slice(sub_miny, sub_miny+subset_size))
    
#     return subset

In [5]:
# subset_size=128
# # avg_subs_per_pixel=3

# home_path = '../..'
# utm_zones = ['utm10n', 'utm11n', 'utm12n', 'utm13n']
# total_subsets = 0

# # loop through utm zones
# for utm_zone in utm_zones:
#     print(f'working on {utm_zone}')
#     aso_paths = glob(f'{home_path}/data/ASO/ASO_50m_SD_withS1overpass/{utm_zone}/*')
#     #tile_names = [f'train_aea_25km.shp', f'test_aea_25km.shp', f'val_aea_25km.shp']
#     tile_names = [f'train_{utm_zone}_25km.shp', f'test_{utm_zone}_25km.shp', f'val_{utm_zone}_25km.shp']

#     # define projections for lat and lon data variables
#     utm_proj = Proj(proj='utm', zone=utm_zone[3:-1], ellps='WGS84')
#     wgs84_proj = Proj(proj='latlong', datum='WGS84')

#     #loop through train, val, test tiles
#     for tile_set in tile_names:
#         # open tiles 
#         print(f'working on {tile_set}')
#         tiles = gpd.read_file(f'{home_path}/data/polygons/{tile_set}')

#         #loop through ASO rasters
#         for aso_path in aso_paths:
#             # open aso raster
#             raster_subsets = 0
#             try:
#                 aso_fn, ds = open_all_data(aso_path, home_path)
#             except:
#                 print('encountered error opening dataset, skipping')
#                 continue
#             print(f'working on {aso_fn}')
            
#             # reproject tile to utm zone (only needed for aea tiles)
#             #tiles = tiles.to_crs(ds.rio.crs)

#             # loop through tiles
#             for tile in tiles.iterrows():
#                 # clip to tile extent
#                 try:
#                     tile_ds = ds.rio.clip([tile[1].geometry], crs=ds.rio.crs, drop=True)
#                 except: #except if tile does not overlap aso raster
#                     continue
#                 # set number of subsets to grab based on valid pixel count in tile 
#                 tile_pixel_count = np.invert(np.isnan(tile_ds.aso_sd.values)).sum()
#                 tile_coverage_target = 0.95
#                 tile_pixel_coverage_target = round(tile_coverage_target*tile_pixel_count)
#                 # subset_goal = round(tile_pixel_count/(subset_size**2)*avg_subs_per_pixel)

#                 # pad ds to tile extent
#                 tile_ds = tile_ds.rio.pad_box(miny=tile[1].geometry.bounds[1],
#                                               minx=tile[1].geometry.bounds[0],
#                                               maxy=tile[1].geometry.bounds[3],
#                                               maxx=tile[1].geometry.bounds[2])

#                 # initialize coverage array
#                 tile_ds['selected_data'] = (('y', 'x'), np.full((len(tile_ds.y), len(tile_ds.x)), False))
#                 tile_ds['count_map'] = (('y', 'x'), np.full((len(tile_ds.y), len(tile_ds.x)), 0))

#                 subset_count = 0
#                 tile_pixel_coverage = 0
#                 while tile_pixel_coverage < tile_pixel_coverage_target:
#                     subset_ds = sample_ds(tile_ds, subset_size)
#                     valid_aso = np.invert(np.isnan(subset_ds.aso_sd))
                    
#                     # check if subset has valid ASO pixels
#                     if valid_aso.sum() == 0:
#                         continue

#                     # check if subset contains ASO data not previously sampled
#                     new_data = valid_aso & ~tile_ds.selected_data.sel(x=slice(subset_ds.x.min(), subset_ds.x.max()),
#                                                                         y=slice(subset_ds.y.max(), subset_ds.y.min()))
#                     if new_data.sum() == 0:
#                         continue
                    
#                     else: # save subset
#                         # create map of gaps
#                         subset_ds['aso_gap_map'] = np.multiply(np.isnan(subset_ds.aso_sd), 1)
                        
#                         # radar gap maps
#                         subset_ds['rtc_gap_map'] = np.multiply(((np.isnan(subset_ds.snowon_vv) +
#                                                                  np.isnan(subset_ds.snowon_vh) +
#                                                                  np.isnan(subset_ds.snowoff_vv) +
#                                                                  np.isnan(subset_ds.snowoff_vh)) > 0), 1)

#                         subset_ds['rtc_mean_gap_map'] = np.multiply(((np.isnan(subset_ds.snowon_vv_mean) +
#                                                                  np.isnan(subset_ds.snowon_vh_mean) +
#                                                                  np.isnan(subset_ds.snowoff_vv_mean) +
#                                                                  np.isnan(subset_ds.snowoff_vh_mean)) > 0), 1)

#                         # s2 gap maps
#                         subset_ds['s2_gap_map'] = np.multiply(np.isnan(subset_ds.B02), 1)

#                         # Update the selected array
#                         tile_ds['selected_data'].loc[dict(x=slice(subset_ds.x.min(), subset_ds.x.max()),
#                                                           y=slice(subset_ds.y.max(), subset_ds.y.min()))] = valid_aso
#                         tile_ds['count_map'].loc[dict(x=slice(subset_ds.x.min(), subset_ds.x.max()),
#                                                           y=slice(subset_ds.y.max(), subset_ds.y.min()))] += 1
#                         tile_pixel_coverage = tile_ds['selected_data'].sum()
                        
#                         # fill nans with 0 
#                         subset_ds = subset_ds.fillna(0)

#                         # add lat and lon variables 
#                         x, y = np.meshgrid(subset_ds['x'].values, subset_ds['y'].values)
#                         lon, lat = transform(utm_proj, wgs84_proj, x, y)
#                         subset_ds['latitude'] = (('y', 'x'), lat)
#                         subset_ds['longitude'] = (('y', 'x'), lon)
                        
#                         subset_count+=1
#                         total_subsets+=1
#                         raster_subsets+=1
                        
#                         subset_ds.to_netcdf(f'{home_path}/data/subsets_v2/{tile_set.split("_")[0]}/{aso_fn}_tile{int(tile[1].id)}_s{subset_count}.nc')
#                         # reproject to wgs for heatmaps
#                         subset_ds = subset_ds.rio.reproject("EPSG:4326")
#                         subset_ds.aso_sd.rio.to_raster(f'{home_path}/data/subsets_v2_tif/{tile_set.split("_")[0]}/{aso_fn}_tile{int(tile[1].id)}_s{subset_count}.tif')

#                 # remove redundant subsets
#                 subset_fns = glob(f'{home_path}/data/subsets_v2/{tile_set.split("_")[0]}/{aso_fn}_tile{int(tile[1].id)}_*.nc')
#                 removed_count = 0
#                 for fn in subset_fns:
#                     subset_ds = xr.open_dataset(fn)
#                     count_map_min = tile_ds['count_map'].sel(x=slice(subset_ds.x.min(), subset_ds.x.max()), y=slice(subset_ds.y.max(), subset_ds.y.min())).min()
#                     if count_map_min >= 2:
#                         !rm $fn
#                         tif_fn = f'{home_path}/data/subsets_v2_tif/{tile_set.split("_")[0]}/{fn.split("/")[-1][:-3]}.tif'
#                         !rm $tif_fn
#                         tile_ds['count_map'].loc[dict(x=slice(subset_ds.x.min(), subset_ds.x.max()),
#                                                       y=slice(subset_ds.y.max(), subset_ds.y.min()))] -= 1
#                         removed_count += 1
#                         raster_subsets -=1
                        
#             print(f'total subsets from {aso_fn}: {raster_subsets}')
#             gc.collect()