In [1]:
# GIS imports 
import rasterio

# data ETL imports
import pandas as pd
import numpy as np

# misc imports
from pathlib import Path
from xml.dom import minidom
import xml.etree.ElementTree as ET
from xml.dom import minidom
from multiprocessing import Pool

# ML imports
from skimage.restoration import denoise_tv_bregman
from sklearn.ensemble import RandomForestClassifier
from skimage.segmentation import felzenszwalb
from tools import get_superpixel_stds_as_features, get_superpixel_means_as_features, get_array_from_features, reproject_arr_to_match_profile
from sklearn.model_selection import train_test_split
from joblib import dump

In [2]:
data_path = Path('../data')

In [4]:
df = pd.read_csv(data_path/'validation_table.csv')
df = df[['site_name', 'planet_id', 'water_stratum']]

# notebook disconnected and stopped working. delete ONLY this line after all inferences are generated
df = pd.DataFrame({'planet_id':[f.name for f in (data_path/'planet_images_cropped').iterdir() if len(list(f.glob('full_img_rf*'))) == 0]})

df.head()

Unnamed: 0,planet_id
0,20210916_010848_94_2407


#### Helper functions to read Planet imagery

In [16]:
def return_grn_indices(xml_file:str|Path)->list:
    """ 
    Return the indices of the (green, red, and NIR) channels of a 4 or 8 band Planet image
    """
    tree = ET.parse(xml_file)
    numbands = None
    for elem in tree.iter():
        if 'numBands' in elem.tag:
            numBands = int(elem.text)
        
    # print(f"Chip id: {row.planet_id}, bands: {numBands}")

    # we always want (green, red, nir) indices in the image
    if numBands == 4:
        band_idxs = [2, 3, 4] # BGRN image
    else:
        band_idxs = [4, 6, 8] # 8 band MS image
    
    return band_idxs

def return_reflectance_coeffs(xml_file:str|Path, band_idx:int|list):
    """
    Read XML file associated with a Planet image and return the TOA reflectance coefficients
    for specified band indices
    """
    assert isinstance(band_idx, (list, int)), "band_idx must be of type int or list"
    
    if isinstance(band_idx, int):
        band_idx = [band_idx]
    
    # parse XML metadata to obtain TOA reflectance coefficients
    xmldoc = minidom.parse(str(xml_file))
    nodes = xmldoc.getElementsByTagName("ps:bandSpecificMetadata")
    coeffs = {}
    for node in nodes:
        bn = node.getElementsByTagName("ps:bandNumber")[0].firstChild.data
        if bn in [str(x) for x in band_idx]:
            i = int(bn)
            value = node.getElementsByTagName("ps:reflectanceCoefficient")[0].firstChild.data
            coeffs[i] = float(value)
    
    return coeffs

def return_img_bands(img:str|Path, band_idx:int|list, denoising_weight=None)->np.ndarray:
    """ 
    Read a Planet file and return an numpy array containing data from specified bands. The image 
    will be band-wise denoised (using TV denoising) if a denoising weight is specified
    """
    if isinstance(band_idx, int):
        band_idx = list(band_idx)
    
    img_stack = []
    with rasterio.open(img) as ds:
        for idx in band_idx:
            if denoising_weight is not None:
                img_stack.append(denoise_tv_bregman(ds.read(idx), denoising_weight))
            else:
                img_stack.append(ds.read(idx))
    
    return np.stack(img_stack, axis = 0)


#### Helper functions to train a random forest and produce inferences

In [17]:
def calc_ndwi(green, red, min_value=1e-5):
    with np.errstate(divide='ignore', invalid='ignore'):
        ndwi = (green - red)/(green + red)
    ndwi = np.where(np.isnan(ndwi), min_value, ndwi)
    return ndwi

def calc_ndvi(red, nir, min_value=1e-5):
    with np.errstate(divide='ignore', invalid='ignore'):
        ndvi = (red - nir)/(red + nir)
    ndvi = np.where(np.isnan(ndvi), min_value, ndvi)
    return ndvi

def return_trained_rf(chip:str|Path, classification:str|Path, xml_file:str|Path):
        
    band_idxs = return_grn_indices(xml_file)
    coeffs = return_reflectance_coeffs(xml_file, band_idxs)
    chip_img = return_img_bands(chip, band_idxs, denoising_weight=None)
    
    with rasterio.open(chip) as src_ds:
        ref_profile = src_ds.profile

    green = chip_img[0]*coeffs[band_idxs[0]]
    red = chip_img[1]*coeffs[band_idxs[1]]
    nir = chip_img[2]*coeffs[band_idxs[2]]

    with rasterio.open(classification) as src_ds:
        cl = src_ds.read(1)
        cl_profile = src_ds.profile

    # some classification extents are not the same as the corresponding chip extent
    # this will be reflected in the transforms of the two rasters
    # if this is the case, reproject the classification such that it matches the chip extent
    if ref_profile['transform'] != cl_profile['transform']:
        ref_transform = ref_profile['transform']
        width = ref_profile['width']
        height = ref_profile['height']
        ref_profile = cl_profile.copy()
        ref_profile.update({'transform':ref_transform, 'width':width, 'height': height})
        cl, _ = reproject_arr_to_match_profile(cl, cl_profile, ref_profile)
        cl = np.squeeze(cl)

    ndwi_1 = calc_ndwi(green, red)
    ndwi_2 = calc_ndwi(green, nir)
    ndvi = calc_ndvi(red, nir)

    # segment image using red and NDWI channels
    img_stack = np.stack([red, ndwi_1, ndwi_2], axis=-1)
    segments = felzenszwalb(img_stack, sigma=0, min_size=3)

    # create training data that includes other channels as well
    img_stack = np.stack([red, nir, green, ndwi_1, ndwi_2, ndvi], axis=-1)    
    std_features = get_superpixel_stds_as_features(segments, img_stack)
    mean_features = get_superpixel_means_as_features(segments, img_stack)

    X = np.concatenate([mean_features, std_features], axis = 1)
    
    # We have superpixels, we now need to map each of the segments to the associated label
    # A 0 value indicates no label for the segment
    class_features = np.zeros((mean_features.shape[0], 1))
    for class_id in [0, 1]:
        # Get all superpixel labels with particular id
        superpixel_labels_for_class = np.unique(segments[class_id == cl])
        # Label those superpixels with approrpriate class
        class_features[superpixel_labels_for_class] = class_id

    class_arr_superpixels = get_array_from_features(segments, class_features)

    # Split data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, class_features, test_size=0.15, random_state=0)

    # Define an RF to be trained. setting n_jobs = -1 uses all available processors
    rf = RandomForestClassifier(n_estimators=250, class_weight='balanced', oob_score=True, random_state=0, n_jobs=-1)

    # # train model and print performance
    rf.fit(X_train, y_train.ravel())
    # print("Model OOB score: ", rf.oob_score_)
    # print("Model test score: ", rf.score(X_test, y_test.ravel()))

    rf_model_folder = Path(chip).parent / 'rf_model'
    rf_model_folder.mkdir(exist_ok=True)
    model_path = rf_model_folder/"rf_model.joblib"
    
    # save for later use
    dump(rf, model_path)

    return rf

def generate_inference(rf, img:str|Path, xml_file:str|Path):
    band_idxs = return_grn_indices(xml_file)
    coeffs = return_reflectance_coeffs(xml_file, band_idxs)
    
    full_img = return_img_bands(img, band_idxs, denoising_weight=None)

    green = full_img[0]*coeffs[band_idxs[0]]
    red = full_img[1]*coeffs[band_idxs[1]]
    nir = full_img[2]*coeffs[band_idxs[2]]

    ndwi_1 = calc_ndwi(green, red)
    ndwi_2 = calc_ndwi(green, nir)
    ndvi = calc_ndvi(red, nir)

    img_stack = np.stack([red, ndwi_1, ndwi_2], axis=-1)
    segments = felzenszwalb(img_stack, sigma=0, min_size=3)

    # for inference we include other channels as well
    img_stack = np.stack([red, nir, green, ndwi_1, ndwi_2, ndvi], axis=-1) 
    std_features = get_superpixel_stds_as_features(segments, img_stack)
    mean_features = get_superpixel_means_as_features(segments, img_stack)

    X = np.concatenate([mean_features, std_features], axis = 1)
    y = rf.predict(X)

    return get_array_from_features(segments, np.expand_dims(y, axis=1))

In [18]:
def train_and_infer(row):
    data_path = Path('../data')
    
    current_img_path = data_path / row.planet_id
    cropped_img_path = data_path / 'planet_images_cropped' / row.planet_id
    xml_file = list(current_img_path.glob('*.xml'))[0]

    img = list(current_img_path.glob(f'{row.planet_id}*.tif'))[0]
    chip = list(cropped_img_path.glob(f'cropped_{row.planet_id}*.tif'))[0]
    classification = list(cropped_img_path.glob(f'*{row.planet_id}*.tif'))[0]

    rf = return_trained_rf(chip=chip, classification=classification, xml_file=xml_file)
    inference = generate_inference(rf, img, xml_file)

    with rasterio.open(img) as src_ds:
        profile_copy = src_ds.profile
        profile_copy.update({'count':1, 'dtype':np.uint8, 'nodata':255})
        with rasterio.open(f"{classification.parent}/full_img_rf_classification_{classification.name}", 'w', **profile_copy) as dst_ds:
            dst_ds.write(inference.astype(np.uint8).reshape(1, *inference.shape))

In [19]:
with Pool(4) as pool:
    _ = pool.map(train_and_infer, [row for _, row in df.iterrows()])

  means = sums / counts
  means = sums / counts
  means = sums / counts
  means = sums / counts
  means = sums / counts
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  means = sums / counts
  means = sums / counts
  means = sums / counts
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  means = sums / counts
  means = sums / counts
  means = sums / counts
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


ValueError: With n_samples=1, test_size=0.15 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.