While training random forests (RFs) to expand our validation dataset, we came across two situations that the per-site approach used so far does not handle:
1. The spectral characteristics of water within the validation chip are not representative of the water surfaces present in the broader Planet image.
2. A validation chip contains no water classification, while the broader Planet image contains some water. Since the RF trained on this data is not given any open surface water (OSW) examples, it will classify the entire Planet image as "not-water".

We can mitigate this by training a single model on validation data from multiple sites, rather than on a per-site basis.

This notebook demonstrates how to train a random forest model using all of the available validation data, and produce water/not-water inferences.

After all notebook cells have executed successfully, the `data` folder will contain a trained random forest model within the `trained_models/rf_model` folder, and each of the Planet scene folders will contain a file with model inferences.

For example, for the Planet scene `20210903_150800_60_2458` the updated folder structure will be:
```
    .
    ├── data
    │   ├─ 20210903_150800_60_2458
    │   │  ├─ 20210903_150800_60_2458_3B_AnalyticMS_8b_metadata.xml
    │   │  ├─ 20210903_150800_60_2458_3B_AnalyticMS_SR_8b.tif
    │   │  ├─ HLS.L30.T18UXG.2021245T154154.v2.0.Fmask.tif
    │   │  ├─ OPERA_L3_DSWx-HLS_T18UXG_20210902T154154Z_20230906T035356Z_L8_30_v1.1_B01_WTR.tif
    │   │  ├─ OPERA_L3_DSWx-HLS_T18UXG_20210902T154154Z_20230906T035356Z_L8_30_v1.1_B03_CONF.tif
    │   │  ├─ rf_classification.tif
    │   │  └─ new_rf_classification.tif # output inferences
    │   ├─ ...    
    │   └─ trained_models
    │      └─ rf_model
    │         └─ rf_model_alldata.joblib # trained model
    ├── notebooks
    │   └─ ...
    ├── environment.yml
    └── README.md       
```

In [None]:
# GIS/data imports
import rasterio
import pandas as pd
import numpy as np

# ML imports
from sklearn.ensemble import RandomForestClassifier
from skimage.segmentation import felzenszwalb
from tools import get_superpixel_stds_as_features, get_superpixel_means_as_features, get_array_from_features, reproject_arr_to_match_profile
from sklearn.model_selection import train_test_split
import joblib
from joblib import dump

# misc imports
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict
import random
from typing import Union

# local imports
from rf_funcs import calc_ndwi, calc_ndvi, return_grn_indices, return_img_bands, return_reflectance_coeffs

# for repeatability: seed NumPy's global random number generator.
# The random forest created in the training cell is given its own fixed `random_state`.
np.random.seed(42)

In [None]:
# Notebook run configuration.

# Set to false if only performing inference with a previously trained model
RETRAIN_MODEL= True

# Set to false to skip inference step
MAKE_INFERENCES = True 

# NOTE(review): EVALUATE_MODEL and TEST_SET_SPLIT are not referenced by any cell
# visible in this part of the notebook - confirm whether the evaluation step
# lives elsewhere or still needs to be implemented.
EVALUATE_MODEL = True # Split available data and print model performance on test set
TEST_SET_SPLIT = 0.15 # If evaluating model, specify data split for testing. train split will be 1-TEST_SET_SPLIT

In [None]:
# Felzenszwalb segmentation parameters
# The same values are passed to `felzenszwalb` when segmenting the training chips
# and when segmenting the full Planet scenes at inference time, and they are
# written to a text file next to the saved model when it is retrained.
F_SCALE = 20 # float value. Larger value returns larger segments
F_SIGMA = 0 # float value. Denoising parameter - larger value denoises more and returns smoother segments
F_MINSIZE = 20 # int value. Minimum segment size

In [None]:
# Load the validation database (site names and their associated Planet IDs)
data_path = Path('../data/')
val_chips_db = data_path / 'validation_table.csv'
val_df = pd.read_csv(val_chips_db)

site_names = list(val_df['site_name'])
planet_ids = list(val_df['planet_id'])

# Group the Planet IDs by stratum; the stratum key is the first two
# characters of the site name
site_names_stratified = defaultdict(list)
for sn, planet_id in zip(site_names, planet_ids):
    stratum = sn[:2]
    site_names_stratified[stratum].append(planet_id)

# Strata used for training. Remove entries from strata_list to exclude the
# corresponding scenes from the training data
strata_list = ['1_', '2_', '3_', '4_']

# Collect the Planet IDs of every selected stratum, keeping the order in which
# the strata first appear in the validation table
training_sites = [
    stratum_planet_id
    for stratum_key, stratum_planet_ids in site_names_stratified.items()
    if stratum_key in strata_list
    for stratum_planet_id in stratum_planet_ids
]

print("# of Training sites: ", len(training_sites))

In [None]:
# Load the validation database again and list the strata it contains.
# This cell rebuilds the same tables as the training-site selection cell, so it
# does not rely on variables created there.
data_path = Path('../data/')
val_chips_db = data_path / 'validation_table.csv'
val_df = pd.read_csv(val_chips_db)

site_names = list(val_df['site_name'])
planet_ids = list(val_df['planet_id'])

# Group the Planet IDs by stratum (first two characters of the site name)
site_names_stratified = defaultdict(list)
for sn, planet_id in zip(site_names, planet_ids):
    stratum_ids = site_names_stratified[sn[:2]]
    stratum_ids.append(planet_id)

# Show which strata are present in the validation table
print(site_names_stratified.keys())

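We now have the Planet IDs of the training sites. For each site, we do the following:
1. Read the cropped Planet image and the corresponding validation labels
2. Generate superpixels and calculate the mean and standard deviation of each superpixel
3. Append the resulting features and labels to the training set

Once all sites have been processed, we:

4. Train and save the model
5. Apply the model to the broader Planet images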

In [None]:
# Location of the trained model; the folder is created if it does not exist yet
rf_model_folder = data_path / 'trained_models' / 'rf_model'
rf_model_folder.mkdir(exist_ok=True, parents=True)
model_path = rf_model_folder/"rf_model_alldata.joblib"

if RETRAIN_MODEL:
    # Fail early with a clear message instead of passing an empty dataset to the classifier
    if not training_sites:
        raise ValueError("No training sites selected; check strata_list and the validation table")

    # Record the segmentation parameters used for this training run next to the model file
    with open(f"{model_path}.txt", 'w') as f:
        f.write(f"FELZENSZWALB SCALE:{F_SCALE}\n")
        f.write(f"FELZENSZWALB MIN SIZE:{F_MINSIZE}\n")
        f.write(f"FELZENSZWALB SIGMA:{F_SIGMA}\n")

    # Per-site features and labels are collected in lists and concatenated once
    # after the loop, which avoids re-copying the growing arrays on every iteration
    site_features, site_labels = [], []
    for idx, site in enumerate(training_sites):
        print(f"Currently processing site # {idx}")

        current_img_path = data_path / site
        cropped_img_path = data_path / 'planet_images_cropped' / site

        # Scene metadata, cropped Planet chip, and the validation labels for the chip
        xml_file = list(current_img_path.glob('*.xml'))[0]
        chip = list(cropped_img_path.glob(f'cropped_{site}*.tif'))[0]
        classification = list(cropped_img_path.glob(f'site_name*{site}*.tif'))[0]

        band_idxs = return_grn_indices(xml_file)
        coeffs = return_reflectance_coeffs(xml_file, band_idxs)
        chip_img = return_img_bands(chip, band_idxs, denoising_weight=None)

        with rasterio.open(chip) as src_ds:
            ref_profile = src_ds.profile

        # Scale each band by its coefficient from the scene metadata
        green = chip_img[0]*coeffs[band_idxs[0]]
        red = chip_img[1]*coeffs[band_idxs[1]]
        nir = chip_img[2]*coeffs[band_idxs[2]]

        with rasterio.open(classification) as src_ds:
            cl = src_ds.read(1)
            cl_profile = src_ds.profile

        # some classification extents are not the same as the corresponding planet chip extent
        # if they are not the same, reproject the validation data to match the profile of the planet data
        if (ref_profile['transform'] != cl_profile['transform']
                or ref_profile['width'] != cl_profile['width']
                or ref_profile['height'] != cl_profile['height']):

            cl, _ = reproject_arr_to_match_profile(cl, cl_profile, ref_profile)
            cl = np.squeeze(cl)

        ndwi = calc_ndwi(green, nir)
        ndvi = calc_ndvi(red, nir)

        # segment image using green, nir, and NDWI channels
        img_stack = np.stack([green, nir, ndwi], axis=-1)
        segments = felzenszwalb(img_stack, scale=F_SCALE, sigma=F_SIGMA, min_size=F_MINSIZE)

        # create training data that includes other channels as well
        img_stack = np.stack([red, nir, green, ndwi, ndvi], axis=-1)
        std_features = get_superpixel_stds_as_features(segments, img_stack)
        mean_features = get_superpixel_means_as_features(segments, img_stack)

        # One row per superpixel: the five channel means followed by the five channel stds
        site_features.append(np.concatenate([mean_features, std_features], axis=1))

        # We have superpixels, we now need to map each of the segments to the associated label.
        # A value of 255 indicates no label for the segment. Class 1 is assigned after
        # class 0, so a superpixel that contains pixels of both classes ends up labelled 1.
        # NOTE(review): superpixels left at 255 are kept in the training set and are
        # therefore learned as a third class - confirm that this is intended.
        class_features_temp = np.full((mean_features.shape[0], 1), 255.0)
        for class_id in [0, 1]:
            # Get all superpixel labels with particular id
            superpixel_labels_for_class = np.unique(segments[cl == class_id])
            # Label those superpixels with appropriate class
            class_features_temp[superpixel_labels_for_class] = class_id

        site_labels.append(class_features_temp)

    # Stack the per-site arrays into the final training matrix and label vector
    X = np.concatenate(site_features, axis=0)
    class_features = np.concatenate(site_labels, axis=0)

    print("Beginning model training")
    # Define an RF to be trained. setting n_jobs = -1 uses all available processors
    rf = RandomForestClassifier(n_estimators=300, class_weight='balanced', oob_score=True, random_state=0, n_jobs=-1)

    # train model on all of the available data
    rf.fit(X, class_features.ravel())

    # save for later use
    dump(rf, model_path)

else: # If a trained model already exists, load weights
    rf = joblib.load(model_path)

Let's make inferences on the broader planet images

In [None]:
def generate_inference_helper(rf, img:str|Path, xml_file:str|Path):
    """
    Segment a Planet image into superpixels and classify every superpixel with `rf`.

    The features are built the same way as in the training cell: the image is
    segmented on the green, NIR and NDWI channels, and each superpixel is described
    by the mean and standard deviation of the red, NIR, green, NDWI and NDVI channels.

    Parameters
    ----------
    rf : fitted classifier exposing `predict`
    img : path to the Planet image to classify
    xml_file : path to the metadata XML file of the Planet scene

    Returns
    -------
    The output of `get_array_from_features` for the predicted superpixel labels
    (presumably a per-pixel label array with the image's height and width - confirm
    against `tools.get_array_from_features`).
    """
    band_idxs = return_grn_indices(xml_file)
    coeffs = return_reflectance_coeffs(xml_file, band_idxs)
    bands = return_img_bands(img, band_idxs, denoising_weight=None)

    # Scale the green, red and NIR bands by their coefficients from the metadata
    green, red, nir = (bands[i] * coeffs[band_idxs[i]] for i in range(3))

    ndwi = calc_ndwi(green, nir)
    ndvi = calc_ndvi(red, nir)

    # Superpixels are computed from the green, NIR and NDWI channels only
    segmentation_input = np.stack([green, nir, ndwi], axis=-1)
    segments = felzenszwalb(
        segmentation_input,
        scale=F_SCALE,
        sigma=F_SIGMA,
        min_size=F_MINSIZE,
    )

    # for inference we include other channels as well
    feature_stack = np.stack([red, nir, green, ndwi, ndvi], axis=-1)
    std_features = get_superpixel_stds_as_features(segments, feature_stack)
    mean_features = get_superpixel_means_as_features(segments, feature_stack)

    # One row per superpixel: channel means followed by channel stds
    superpixel_features = np.concatenate([mean_features, std_features], axis=1)
    predicted_labels = rf.predict(superpixel_features)

    # Map the per-superpixel predictions back onto the segment layout
    return get_array_from_features(segments, np.expand_dims(predicted_labels, axis=1))

def _first_match(folder, pattern):
    """Return the first path in `folder` matching the glob `pattern`, or raise FileNotFoundError."""
    match = next(folder.glob(pattern), None)
    if match is None:
        raise FileNotFoundError(f"No file matching '{pattern}' found in {folder}")
    return match


def generate_inference(planet_id):
    """
    Generate and save water/not-water inferences for the full Planet scene `planet_id`.

    The scene image and its metadata XML are read from `../data/<planet_id>/`. The
    module-level model `rf` is applied through `generate_inference_helper`, pixels
    that are nodata in band 1 of the Planet image are set to 255, and the result is
    written as a single-band uint8 GeoTIFF named `new_rf_classification.tif` in
    `../data/planet_images_cropped/<planet_id>/`.

    NOTE(review): the notebook introduction shows `new_rf_classification.tif` inside
    the Planet scene folder, whereas the file is written next to the cropped chip -
    confirm which location is intended. The existing location is kept here.

    Parameters
    ----------
    planet_id : str
        Planet scene ID; also the name of the scene folder inside `../data`.

    Raises
    ------
    FileNotFoundError
        If the scene folder contains no metadata XML or no matching image.
    """
    data_path = Path('../data')

    current_img_path = data_path / planet_id
    cropped_img_path = data_path / 'planet_images_cropped' / planet_id

    xml_file = _first_match(current_img_path, '*.xml')
    img = _first_match(current_img_path, f'{planet_id}*.tif')

    inference = generate_inference_helper(rf, img, xml_file)

    # use planet image to mask out regions of no data in the model inference
    with rasterio.open(img) as src_ds:
        out_profile = src_ds.profile
        nodata_mask = np.where(src_ds.read(1) == out_profile['nodata'], 1, 0)

    inference[nodata_mask == 1] = 255
    out_profile.update({'count':1, 'dtype':np.uint8, 'nodata':255})

    # The output folder is derived directly from the planet ID rather than from an
    # existing classification chip, so inference does not depend on how the
    # validation chip file happens to be named
    cropped_img_path.mkdir(parents=True, exist_ok=True)
    out_path = cropped_img_path / 'new_rf_classification.tif'

    # write out model inference
    with rasterio.open(str(out_path), 'w', **out_profile) as dst_ds:
        dst_ds.write(inference.astype(np.uint8).reshape(1, *inference.shape))

    print(f"Completed inference for planet id {planet_id}")

In [None]:
# Feature names follow the column order of the training matrix: the superpixel
# means of each channel first, then the superpixel standard deviations
channel_names = ["red", "nir", "green", "ndwi", "ndvi"]
model_features = [
    f"{channel}_{statistic}"
    for statistic in ("mean", "std")
    for channel in channel_names
]

# Pair every feature name with the importance reported by the trained model
feature_importances = list(zip(model_features, rf.feature_importances_))
print(f"Feature importances: {feature_importances}")

In [None]:
# Run inference on the first scene only, as a quick check before processing the rest.
# `planet_ids` (the full list) is sliced here; slicing the leftover loop variable
# `planet_id` would pass a single character of one ID to generate_inference.
if MAKE_INFERENCES:
    for first_planet_id in planet_ids[:1]:
        generate_inference(first_planet_id)

In [None]:
# After successfully completing one inference, do the rest of the scenes.
# Skipped entirely when MAKE_INFERENCES is set to False.
if MAKE_INFERENCES:
    for remaining_planet_id in tqdm(planet_ids[1:]):
        generate_inference(remaining_planet_id)