With the ALOS PALSAR scenes and coincident USGS DSWE data downloaded in the previous notebook, we now need to generate training data for our convolutional neural network.

For this, we additionally require [Global Forest Change (GFC)](https://storage.googleapis.com/earthenginepartners-hansen/GFC-2020-v1.8/download.html) data and Copernicus GLO-30 DEM derived Height Above Nearest Drainage (HAND) data, which we download for each ALOS scene. 

We then split each scene into non-overlapping 512x512 chips that can be stacked and provided as inputs to our model. The input stack contains the following data channels: [HH, HV, RED, NIR, SWIR1, SWIR2, DEM, HAND], and each stack is mapped to the corresponding DSWE labels. In the DSWE data, we retain the following three classes: "not water", "high confidence open surface water" and "moderate confidence open surface water". The two water classes are combined into a single open surface water class. All remaining pixels, including the partial surface water classes, are assigned a "don't care" value of 255.

Any input stack which maps to a label array containing less than 10% valid data is discarded.

In [None]:
 # GIS imports
import rasterio
from rasterio.warp import transform_bounds

# scikit / numpy / pandas
import numpy as np
import pandas as pd

# Misc imports
import random
import pathlib
from pathlib import Path
from typing import List
from collections import defaultdict

# cutlery imports
from tools import retrieve_hansen_mosaic, return_windowed_merge, denoise, return_slice_list, get_cropped_profile, return_nodata_mask, retrieve_hand_data

# Seed Python's built-in `random` module with a fixed value so that anything
# drawing from it gives the same result on every run.
# NOTE(review): this seeds only the stdlib generator, not NumPy's; `random` is
# not used in the cells visible here — confirm whether the seed is still needed.
random.seed(42)

In [None]:
# Root folder holding one sub-folder per downloaded ALOS PALSAR scene.
data_path = Path('../data/scenes')

# Scene folders are the directories whose names start with 'AP'. glob() yields
# entries in file-system order, which can differ between runs and machines, so
# the names are sorted to keep the processing order (and therefore the chip
# numbering and the row order of the chip listing) repeatable.
alos_folders = sorted(x.name for x in data_path.glob('AP*') if x.is_dir())
print(f"Number of ALOS scenes: {len(alos_folders)}")

In [None]:
# Root folder that receives every chip written by this notebook.
output_path = Path('../data/training_data/chips')
output_path.mkdir(parents=True, exist_ok=True)

# One sub-folder per input channel, plus one for the label chips.
chip_types = ['hh', 'hv', 'red', 'nir', 'swir1', 'swir2', 'dem', 'hand', 'labels']
chip_paths = [output_path / chip_type for chip_type in chip_types]
for chip_dir in chip_paths:
    chip_dir.mkdir(exist_ok=True)

# Look-up from chip type to the folder its chips are written to.
chip_path_dict = {
    chip_type: chip_dir for chip_type, chip_dir in zip(chip_types, chip_paths)
}

In [None]:
def _first_match(folder: Path, pattern: str) -> Path:
    """Return the first path in *folder* matching the glob *pattern*.

    Raises:
        FileNotFoundError: if nothing in *folder* matches *pattern*.
    """
    match = next(folder.glob(pattern), None)
    if match is None:
        raise FileNotFoundError(f"No file matching '{pattern}' found in {folder}")
    return match


# Edge length (rows, cols) of the non-overlapping chips cut from each scene.
CHIP_SIZE = (512, 512)
# Chips whose label array has a smaller fraction of valid pixels are skipped.
MIN_VALID_FRACTION = 0.1
# Label value for pixels that should be ignored during training.
IGNORE_LABEL = 255

# Maps each chip type to the list of chip files written for it. Every written
# chip adds exactly one entry per type, so the lists stay aligned.
chips_list = defaultdict(list)

for scene in alos_folders:
    print(scene)
    scene_path = data_path / scene

    hh_file = _first_match(scene_path, '*HH*')
    hv_file = _first_match(scene_path, '*HV*')
    dem_file = _first_match(scene_path, '*dem*')

    # we dealt with downloading the dswe data in a separate notebook
    usgs_dswe_files = list((scene_path / 'usgs_dswe').glob('*.TIF'))
    if not usgs_dswe_files:
        raise FileNotFoundError(f"No DSWE '*.TIF' files found in {scene_path / 'usgs_dswe'}")

    with rasterio.open(hh_file) as ds:
        hh_img = ds.read(1)
        sar_bounds = ds.bounds
        sar_profile = ds.profile
        sar_crs = ds.crs

    with rasterio.open(hv_file) as ds:
        hv_img = ds.read(1)

    with rasterio.open(dem_file) as ds:
        dem_img = ds.read(1)
        dem_profile = ds.profile

    # Only the CRS and dtype of the DSWE data are needed here; the pixels are
    # read later through return_windowed_merge.
    with rasterio.open(usgs_dswe_files[0]) as ds:
        dswe_crs = ds.crs
        dswe_profile = ds.profile

    # Let's retrieve the Hansen tiles overlapping the SAR scene.
    # The CRS object is passed directly (as for the DSWE bounds below) so that
    # a CRS without a matching EPSG code does not break the transform.
    sar_bounds_4326 = transform_bounds(sar_crs, 4326, *sar_bounds)
    hansen_files = retrieve_hansen_mosaic(sar_bounds_4326, data_product = 'first', download_path=Path('../data/hansen_mosaics/'))

    hansen_img, hansen_profile = return_windowed_merge(hansen_files, sar_bounds_4326, sar_profile)

    # Accumulate nodata flags: any pixel with a non-zero value in `mask` is
    # treated as invalid further down.
    mask = return_nodata_mask([hh_img, hv_img], nodata=0)
    mask += return_nodata_mask([hansen_img[0]], hansen_profile['nodata'])

    # Obtain HAND data
    hand_files = retrieve_hand_data(sar_bounds_4326, download_path=Path('../data/hand_data/'))
    hand_img, hand_profile = return_windowed_merge(hand_files, sar_bounds_4326, sar_profile)
    hand_img = np.squeeze(hand_img)

    # treat the dswe img for use as labels
    # Refer to the document 'Landsat Collection 2 Level 3 Dynamic Surface Water Extent Data Format Control Book' for more information
    sar_bounds_dswe_crs = transform_bounds(sar_crs, dswe_crs, *sar_bounds)
    labels, labels_profile = return_windowed_merge(usgs_dswe_files, sar_bounds_dswe_crs, sar_profile)
    labels = np.squeeze(labels)

    # Mask out no data regions: flip the accumulated flags into a 0/1
    # multiplier (1 = valid pixel, 0 = nodata) and zero out invalid pixels.
    mask = np.where(mask>0, 0, 1).astype('uint8')
    hh_img *= mask
    hv_img *= mask
    hansen_img *= mask
    dem_img *= mask
    hand_img *= mask

    # Locate the DSWE classes before `labels` is overwritten below.
    not_water_idx = np.where(labels == 0)  # not water
    open_water_idx = np.where((labels == 1) | (labels == 2)) # High and moderate confidence open water classes
    nodata_idx = np.where(mask == 0)
    partial_water_idx = np.where((labels == 3) | (labels == 4)) # Conservative and aggressive partial surface water classes

    # The labeling used here is simply because it aligns well with backscatter signatures
    # i.e., low back scatter = water surfaces (0), high back scatter = not water (1)
    # All other pixels are labeled 255, which will correspond to a 'don't care' value during training
    labels = np.full(np.squeeze(hh_img).shape, IGNORE_LABEL, dtype=dswe_profile['dtype'])
    labels[open_water_idx] = 0
    labels[not_water_idx] = 1
    labels[nodata_idx] = IGNORE_LABEL
    labels[partial_water_idx] = IGNORE_LABEL # we do not want to score the model over classifications of PSW pixels

    chip_prefix = f"AP_{scene[3:8]}{scene[14:18]}"

    # chip type -> (full-scene array, nodata value, dtype) used when writing chips
    image_dict = {
        'hh': (hh_img, sar_profile['nodata'], sar_profile['dtype']),
        'hv': (hv_img, sar_profile['nodata'], sar_profile['dtype']),
        'red': (hansen_img[0, ...], 0, 'int16'),
        'nir': (hansen_img[1, ...], 0, 'int16'),
        'swir1': (hansen_img[2, ...], 0, 'int16'),
        'swir2': (hansen_img[3, ...], 0, 'int16'),
        'dem': (dem_img, dem_profile['nodata'], dem_img.dtype),
        'hand': (hand_img, hand_profile['nodata'], hand_img.dtype),
        'labels': (labels, IGNORE_LABEL, 'uint8')
    }

    # Write out labels for future reference. The dtype and nodata are set
    # explicitly so the scene-level file agrees with the label chips
    # (uint8, 255 = don't care) regardless of what labels_profile carries.
    scene_labels_profile = {**labels_profile, 'dtype': 'uint8', 'nodata': IGNORE_LABEL}
    with rasterio.open(hh_file.parent/'labels.tif', 'w', **scene_labels_profile) as ds:
        ds.write(labels.reshape(1, *labels.shape).astype('uint8'))

    # slice up the image into chips and iterate over slices
    slice_list = return_slice_list(hh_img.shape, CHIP_SIZE)

    # Running index of the chips actually written for this scene.
    count = 0

    for (y_slice, x_slice) in slice_list:

        label_chip = labels[y_slice, x_slice]

        # Skip any chip whose label array has less than 10% valid
        # (i.e. not 'don't care') pixels
        if np.sum(label_chip != IGNORE_LABEL)/(label_chip.size) < MIN_VALID_FRACTION:
            continue

        current_filename = f"{chip_prefix}_{count:05d}.tif"
        chip_profile = get_cropped_profile(labels_profile, x_slice, y_slice)

        for _chip_type, _chip_output_path in chip_path_dict.items():
            chip_img, chip_nodata, chip_dtype = image_dict[_chip_type]
            chip_file = _chip_output_path / current_filename

            # The cropped profile is shared by all chip types; only nodata
            # and dtype differ between them.
            chip_profile['nodata'] = chip_nodata
            chip_profile['dtype'] = chip_dtype
            temp_chip = chip_img[y_slice, x_slice]
            with rasterio.open(chip_file, 'w', **chip_profile) as ds:
                ds.write(temp_chip.reshape(1, *temp_chip.shape))

            chips_list[_chip_type].append(chip_file)

        count += 1

In [None]:
# Tabulate the written chip files (one column per chip type, one row per chip)
# and save the table next to the chip folders.
csv_path = output_path / 'training_data.csv'
df = pd.DataFrame(chips_list)
df.to_csv(csv_path)
print(f"Number of training samples: {len(df)}")