In this notebook, we train a random forest model to generate water/not-water inferences on a Planet image. We use the expert-drawn binary classification labels to train our models. The training procedure is as follows:

1. For every hand-labeled dataset available, we apply a graph-based image segmentation algorithm (Felzenszwalb) to a 3-band image containing the Green, NIR and NDWI channels derived from the source Planet image
2. For each segment, we compute pixel value statistics (mean and standard deviation) in the Red, NIR, Green, NDWI and NDVI channels
3. These segment statistics are mapped to the corresponding segment labels obtained from the hand-labeled imagery
4. We train a random forest model per cropped Planet scene - and then perform inference on the entire available Planet scene

At the end of successfully executing the notebook cells, the Planet scene folders will contain a trained random forest model, as well as a file containing model inferences.

For example, for the Planet scene `20210903_150800_60_2458` the updated folder structure will be:
```
    .
    ├── data
    │   ├─ 20210903_150800_60_2458
    │   │  ├─ 20210903_150800_60_2458_3B_AnalyticMS_8b_metadata.xml
    │   │  ├─ 20210903_150800_60_2458_3B_AnalyticMS_SR_8b.tif
    │   │  ├─ HLS.L30.T18UXG.2021245T154154.v2.0.Fmask.tif
    │   │  ├─ OPERA_L3_DSWx-HLS_T18UXG_20210902T154154Z_20230906T035356Z_L8_30_v1.1_B01_WTR.tif
    │   │  ├─ OPERA_L3_DSWx-HLS_T18UXG_20210902T154154Z_20230906T035356Z_L8_30_v1.1_B03_CONF.tif
    │   │  └─ rf_classification.tif # output inferences
    │   ├─ ...    
    │   └─ planet_images_cropped
    │       ├─ 20210903_150800_60_2458
    │       │  ├─ rf_model
    │       │  │  └─ rf_model.joblib # trained model
    │       │  └─ ...      
    │       └─ ...
    ├── notebooks
    │   └─ ...
    ├── environment.yml
    └── README.md       
```
Note: Each model is trained on data available for that scene, and may not perform well when applied to other scenes. We will train a single model capable of performing inferences on all the scenes in the next notebook.

In [None]:
# GIS imports 
import rasterio

# data ETL imports
import pandas as pd
import numpy as np

# misc imports
from pathlib import Path
from xml.dom import minidom
import xml.etree.ElementTree as ET
from multiprocessing import Pool

# ML imports
from sklearn.ensemble import RandomForestClassifier
from skimage.segmentation import felzenszwalb
from tools import get_superpixel_stds_as_features, get_superpixel_means_as_features, get_array_from_features, reproject_arr_to_match_profile
from sklearn.model_selection import train_test_split
from joblib import dump

# local imports
from rf_funcs import calc_ndwi, calc_ndvi, return_grn_indices, return_img_bands, return_reflectance_coeffs

In [None]:
# Root folder holding the validation table and the per-scene sub-folders
data_path = Path('../data')

# Only these columns of the validation table are kept
validation_columns = ['site_name', 'planet_id', 'water_stratum']
df = pd.read_csv(data_path/'validation_table.csv')[validation_columns]

# One entry per table row; each id is later used as a scene folder name
planet_ids = list(df['planet_id'])

In [None]:
# Felzenszwalb segmentation parameters.
# These are passed to skimage.segmentation.felzenszwalb in both the training and the
# inference functions, so that both stages segment imagery with the same settings.
F_SCALE = 20 # float value. Larger value returns larger segments
F_SIGMA = 0 # float value. Denoising parameter - larger value denoises more and returns smoother segments
F_MINSIZE = 20 # int value. Minimum segment size (presumably in pixels - confirm against the skimage docs)

In [None]:
def return_trained_rf(chip:str|Path, classification:str|Path, xml_file:str|Path):
    """
    Train a random forest on the segments of a Planet image chip and save it to disk.

    The chip is segmented with the Felzenszwalb algorithm, per-segment mean and
    standard deviation features are computed, and each segment is assigned a
    label taken from the water/not-water classification mask.

    Parameters
    ----------
    chip : path to the cropped Planet image
    classification : path to the hand-labeled classification raster for the chip
    xml_file : path to the Planet metadata XML, used to look up band indices and
        reflectance coefficients

    Returns
    -------
    The fitted RandomForestClassifier. The same model is also written to
    `<chip folder>/rf_model/rf_model.joblib`.
    """
    band_idxs = return_grn_indices(xml_file)
    coeffs = return_reflectance_coeffs(xml_file, band_idxs)
    chip_img = return_img_bands(chip, band_idxs, denoising_weight=None)

    # the chip's raster profile is the reference grid for the labels
    with rasterio.open(chip) as chip_ds:
        ref_profile = chip_ds.profile

    # scale each band by its reflectance coefficient; bands are used as green, red, NIR
    green, red, nir = (chip_img[i] * coeffs[band_idxs[i]] for i in range(3))

    with rasterio.open(classification) as label_ds:
        cl = label_ds.read(1)
        cl_profile = label_ds.profile

    # some classification extents are not the same as the corresponding planet chip extent
    # if they differ, reproject the validation data to match the profile of the planet data
    grid_differs = any(
        ref_profile[key] != cl_profile[key] for key in ('transform', 'width', 'height')
    )
    if grid_differs:
        cl, _ = reproject_arr_to_match_profile(cl, cl_profile, ref_profile)
        cl = np.squeeze(cl)

    ndwi = calc_ndwi(green, nir)
    ndvi = calc_ndvi(red, nir)

    # segmentation is driven by the green, NIR and NDWI channels only
    segments = felzenszwalb(
        np.stack([green, nir, ndwi], axis=-1),
        scale=F_SCALE,
        sigma=F_SIGMA,
        min_size=F_MINSIZE,
    )

    # the training features use a richer set of channels than the segmentation
    feature_stack = np.stack([red, nir, green, ndwi, ndvi], axis=-1)
    mean_features = get_superpixel_means_as_features(segments, feature_stack)
    std_features = get_superpixel_stds_as_features(segments, feature_stack)

    # feature matrix: one row per segment, means first and then standard deviations
    X = np.hstack([mean_features, std_features])

    # Map every segment to a label. All segments start as class 0; a segment ends up
    # as class 1 when at least one of its pixels is labeled 1 in the mask, because
    # class 1 is assigned last and therefore overrides class 0.
    segment_labels = np.zeros((mean_features.shape[0], 1))
    for class_id in (0, 1):
        # ids of all segments that contain a pixel carrying this class
        segment_ids = np.unique(segments[cl == class_id])
        segment_labels[segment_ids] = class_id

    # n_jobs=-1 uses all available processors
    rf = RandomForestClassifier(
        n_estimators=250,
        class_weight='balanced',
        oob_score=True,
        random_state=0,
        n_jobs=-1,
    )

    # train on all of the available data (no hold-out split)
    rf.fit(X, segment_labels.ravel())

    # save next to the chip for later use
    rf_model_folder = Path(chip).parent / 'rf_model'
    rf_model_folder.mkdir(exist_ok=True)
    dump(rf, rf_model_folder/"rf_model.joblib")

    return rf

def generate_inference(rf, img:str|Path, xml_file:str|Path):
    """
    Run a trained random forest over a Planet image and return its predictions.

    The image is segmented and featurized in the same way as during training
    (same segmentation parameters, same channels, means followed by standard
    deviations), and one prediction is made per segment.

    Parameters
    ----------
    rf : fitted model exposing `predict`
    img : path to the Planet image to classify
    xml_file : path to the Planet metadata XML, used to look up band indices and
        reflectance coefficients

    Returns
    -------
    The output of `get_array_from_features` for the per-segment predictions
    (presumably the predictions mapped back onto the pixel grid - confirm
    against the helper's implementation).
    """
    band_idxs = return_grn_indices(xml_file)
    coeffs = return_reflectance_coeffs(xml_file, band_idxs)

    bands = return_img_bands(img, band_idxs, denoising_weight=None)

    # scale each band by its reflectance coefficient; bands are used as green, red, NIR
    green, red, nir = (bands[i] * coeffs[band_idxs[i]] for i in range(3))

    ndwi = calc_ndwi(green, nir)
    ndvi = calc_ndvi(red, nir)

    # segment on green, NIR and NDWI
    segments = felzenszwalb(
        np.stack([green, nir, ndwi], axis=-1),
        scale=F_SCALE,
        sigma=F_SIGMA,
        min_size=F_MINSIZE,
    )

    # for inference we include the other channels as well
    feature_stack = np.stack([red, nir, green, ndwi, ndvi], axis=-1)
    mean_features = get_superpixel_means_as_features(segments, feature_stack)
    std_features = get_superpixel_stds_as_features(segments, feature_stack)

    # one row per segment: means first, then standard deviations
    X = np.hstack([mean_features, std_features])
    predictions = rf.predict(X)

    # the helper is given the predictions as a single-column array
    return get_array_from_features(segments, predictions[:, np.newaxis])

In [None]:
def train_and_infer(planet_id):
    """
    Train a random forest for one Planet scene and classify the full scene with it.

    The cropped Planet image and its hand-labeled classification are looked up
    under `../data/planet_images_cropped/<planet_id>`, and the full image and
    its metadata XML under `../data/<planet_id>`. The model is trained on the
    cropped data and then applied to the full image.

    The inference is written to `rf_classification.tif` next to the full image
    as a single-band uint8 raster, with 255 marking no-data pixels.

    Returns the trained model.
    """
    data_path = Path('../data')
    scene_dir = data_path / planet_id
    cropped_dir = data_path / 'planet_images_cropped' / planet_id

    def first_match(folder, pattern):
        # first file in `folder` matching `pattern`; IndexError if there is none
        return list(folder.glob(pattern))[0]

    xml_file = first_match(scene_dir, '*.xml')
    img = first_match(scene_dir, f'{planet_id}*.tif')
    chip = first_match(cropped_dir, f'cropped_{planet_id}*.tif')
    classification = first_match(cropped_dir, 'classification_*.tif')

    rf = return_trained_rf(chip=chip, classification=classification, xml_file=xml_file)
    inference = generate_inference(rf, img, xml_file)

    # read what is needed from the full planet image, then close it
    with rasterio.open(img) as src_ds:
        out_profile = src_ds.profile
        first_band = src_ds.read(1)

    # use the planet image to mask out regions of no data in the model inference
    nodata_mask = np.where(first_band == out_profile['nodata'], 1, 0)
    inference[nodata_mask == 1] = 255

    # the output is a single uint8 band that uses 255 as its no-data value
    out_profile.update(count=1, dtype=np.uint8, nodata=255)

    # write out model inference with a leading band axis
    with rasterio.open(f"{img.parent}/rf_classification.tif", 'w', **out_profile) as dst_ds:
        dst_ds.write(inference.astype(np.uint8)[np.newaxis, ...])

    print(f"Completed inference for planet id {planet_id}")

    return rf

In [None]:
# Train a model for the first site only and run inference with it.
# The names below follow the order in which the feature columns are assembled:
# all per-segment means first, then all per-segment standard deviations, each in
# red, nir, green, ndwi, ndvi channel order.
model_features = [
    f"{channel}_{stat}"
    for stat in ("mean", "std")
    for channel in ("red", "nir", "green", "ndwi", "ndvi")
]
model = train_and_infer(planet_ids[0])

# Pair every feature name with the importance the trained model assigns to it.
# The feature importances sum to 1, and provide an idea of how the model weights
# various features in making water/not-water inferences
feature_importances = list(zip(model_features, model.feature_importances_))
print(f"Feature importances: {feature_importances}")

In [None]:
# Perform inferences on remaining sites (the first site is processed in its own cell)
# Set PARALLEL_EXECUTION = True to use multiprocessing to speed things up -
# change the value of N_PROC below to use more/fewer parallel processes

N_PROC = 4
PARALLEL_EXECUTION = False

# both execution modes work on the same list of sites
remaining_planet_ids = planet_ids[1:]

if PARALLEL_EXECUTION:
    with Pool(N_PROC) as p:
        # keep the async handles so that failures in the workers can be reported
        pending = [
            (planet_id, p.apply_async(train_and_infer, (planet_id,)))
            for planet_id in remaining_planet_ids
        ]

        # no more tasks will be submitted; wait for all workers to finish
        p.close()
        p.join()

    # apply_async stores exceptions instead of raising them - surface them here
    # so that a failed scene is not silently skipped
    for planet_id, result in pending:
        try:
            result.get()
        except Exception as err:
            print(f"Training/inference failed for planet id {planet_id}: {err!r}")
else:
    # train_and_infer takes a planet id; generate_inference needs a trained
    # model, an image path and an xml path, so it cannot be mapped over ids
    for planet_id in remaining_planet_ids:
        train_and_infer(planet_id)