# Backscatter Distributions by Slope (Foreslope, Backslope, Flat)

**Alex Lewandowski; Alaska Satellite Facility, University of Alaska Fairbanks**

## Plots the RTC backscatter distributions of each slope category of every MGRS tile and polarization

**Notebook Requires**
- MGRS tiles of prepared OPERA RTC CalVal data created with Prep_OPERA_RTC_CalVal_data_stage1_part3.ipynb

In [None]:
import copy
from ipyfilechooser import FileChooser
import numpy.ma as ma
import numpy as np
import pandas as pd
from pathlib import Path
import rioxarray as rxr
from scipy import stats

from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
import matplotlib.lines as lines
from matplotlib.offsetbox import AnchoredText

import opensarlab_lib as osl

In [None]:
# Ask where the MGRS tile data will come from; the selection widget is kept in
# `data_source` so that its `.value` can be compared against `sources` afterwards.
print("Select whether you will be downloading data or accessing data already stored on your volume.")
sources = ['Download Data from S3 Bucket', 'Access Locally Stored Data']
data_source = osl.select_parameter(sources)
display(data_source)

In [None]:
# Boolean flags for the chosen data source: sources[0] is the S3 download option,
# sources[1] is the locally stored option.
s3 = data_source.value == sources[0]
local = data_source.value == sources[1]

if local:
    # Local data: let the user browse to the directory holding the tile sub-directories
    print("Select the directory holding your MGRS tile sub-directories")
    fc = FileChooser(Path.cwd())
    display(fc)
elif s3:
    # Display label -> zip file name suffix of each prepared dataset
    sites = {
        "Site 1 (Amazon Rainforest), Descending Orbit, Summer": "mgrs_site1_desc.zip",
        "Site 2 (Southern California), Ascending Orbit, Summer": "mgrs_site2_asc.zip",
        "Site 2 (Southern California), Descending Orbit, Summer": "mgrs_site2_desc.zip",
        "Site 3 (Interior Alaska), Ascending Orbit, Summer": "mgrs_site3_asc_summer.zip",
        "Site 3 (Interior Alaska), Ascending Orbit, Winter": "mgrs_site3_asc_winter.zip",
        "Site 3 (Interior Alaska), Descending Orbit, Summer": "mgrs_site3_desc_summer.zip",
        "Site 3 (Interior Alaska), Descending Orbit, Winter": "mgrs_site3_desc_winter.zip"
    }
    
    # Display label -> land cover identifier
    landcovers = {
        "Forest": "forest",
        "Shrub": "shrub",
        "Herbaceous Vegetation": "herbs",
        "Agriculture": "agriculture"
    }
    
    # Both selection widgets are kept so their `.value` can be read once the user has chosen
    print("Select a dataset:")
    site = osl.select_parameter(sites)
    display(site)
    
    print("\nSelect a land cover classification:")
    landcover = osl.select_parameter(landcovers)
    display(landcover)
    

In [None]:
if local: 
    data_dir = Path(fc.selected_path)
elif s3:
    s3_pth = "s3://asf-jupyter-data-west/OPERA_CalVal/Prepared_MGRS_Tiles"
    s3_pth = f"{s3_pth}/{landcover.value}/{landcover.value}_{site.value}"
    ds_zip = Path(s3_pth).name
    !aws --region=us-west-2 --no-sign-request s3 cp {s3_pth} {ds_zip}
    osl.asf_unzip(str(Path.cwd()), ds_zip)
    Path(ds_zip).unlink()
    data_dir = Path.cwd()/Path(ds_zip).stem
    
mgrs = list()
for p in Path(data_dir).iterdir():
    if p.is_dir():
        mgrs.append(p)
mgrs

## Plot Backscatter Distributions

In [None]:
def plot_backscatter_distributions_by_slope(fore, back, flat, means, stds, polarization, tile, dataset_name, backscatter_minmax=None, output=None):
    """Plot overlaid foreslope, backslope, and flat backscatter histograms.

    Each histogram is drawn as a step outline, shaded within one standard
    deviation of its mean, and marked with a dashed vertical line at its mean.
    A text box to the right of the axes lists the pixel counts, means, and
    standard deviations.

    Parameters
    ----------
    fore, back, flat : array-like
        Backscatter values of the foreslope, backslope, and flat pixels.
        NaN entries are excluded from the reported pixel counts.
    means, stds : sequence of 3 numbers
        Mean and standard deviation of each dataset, ordered
        foreslope, backslope, flat. These values are drawn and annotated as given.
    polarization : str
        Polarization label used in the plot title.
    tile : object
        MGRS tile identifier(s), formatted into the plot title.
    dataset_name : str
        Dataset label used in the plot title.
    backscatter_minmax : sequence of 2 numbers, optional
        Passed as ``range`` to ``Axes.hist`` to limit the x range of the histograms.
    output : path-like, optional
        When given, the figure is saved to this path before it is shown.
    """
    # create histograms
    f, ax = plt.subplots(figsize=(18, 8))
    n_bins = 200
    colors = ['blue', 'green', 'darkorange']
    n, bins, patches = ax.hist([fore, back, flat], n_bins, color=colors,
                               range=backscatter_minmax, histtype='step')

    # fill 1st standard deviation for each histogram and add line at mean
    std_colors = ['skyblue', 'lightgreen', 'orange']
    for j, hist in enumerate(patches):
        y_max = hist[0].get_path().get_extents().y1
        hist_path = hist[0].get_path().vertices
        std_hist = plt.Polygon(hist_path, color=std_colors[j], fill=True, alpha=0.2)
        ax.add_patch(std_hist)
        # invisible rectangle spanning mean +/- 1 std in x and 0..y_max in y;
        # it only serves as the clip region of the filled histogram polygon
        std_clip = Rectangle((means[j]-stds[j], 0), stds[j]*2, y_max,
                             fill=True, visible=False)
        ax.add_patch(std_clip)
        std_hist.set_clip_path(std_clip)
        # dashed mean line, clipped to the histogram outline
        mean_line = lines.Line2D([means[j], means[j]], [0, y_max], color=colors[j], ls='--')
        ax.add_artist(mean_line)
        mean_line.set_clip_path(hist[0])

    # the statistics shown are the ones passed in through `means` and `stds`
    annotation = AnchoredText(
        (f"PIXEL COUNTS:\n"
         f"foreslope:  {np.count_nonzero(~np.isnan(fore))}\n"
         f"backslope: {np.count_nonzero(~np.isnan(back))}\n"
         f"flat:           {np.count_nonzero(~np.isnan(flat))}\n\n"
         f"MEAN:\n"
         f"foreslope:  {means[0]}\n"
         f"backslope: {means[1]}\n"
         f"flat:           {means[2]}\n\n"
         f"STANDARD DEVIATION:\n"
         f"foreslope:  {stds[0]}\n"
         f"backslope: {stds[1]}\n"
         f"flat:           {stds[2]}"
        ),
        loc='upper left', prop=dict(size=12), frameon=True, bbox_to_anchor=(1.0, 1.0), bbox_transform=ax.transAxes)
    annotation.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
    ax.add_artist(annotation)

    # Each ax.legend() call replaces the axes' current legend, so every legend
    # is re-added as a plain artist to keep all three visible.

    # add histogram legend
    hist_handles = [lines.Line2D([0, 1], [0, 0], lw=1, color=c) for c in colors]
    hist_legend = ax.legend(handles=hist_handles, labels=['foreslope', 'backslope', 'flat'], loc='upper right')
    ax.add_artist(hist_legend)

    # add standard deviation legend
    std_handles = [Rectangle((0, 0), 1, 1, color=c, ec="k", alpha=0.2) for c in std_colors]
    std_legend = ax.legend(handles=std_handles, labels=['foreslope 1 std', 'backslope 1 std', 'flat 1 std'], loc='center right', bbox_to_anchor=(1, 0.75))
    ax.add_artist(std_legend)

    # add mean legend
    mean_handles = [lines.Line2D([0, 0], [0, 1], color=c, ls='--') for c in colors]
    mean_legend = ax.legend(handles=mean_handles, labels=['foreslope mean', 'backslope mean', 'flat mean'], loc='center right', bbox_to_anchor=(1, 0.55))
    ax.add_artist(mean_legend)

    ax.set(title=f"Distribution of {polarization} Foreslope, Backslope, and Flat Backscatter Values\n{dataset_name}\nMGRS: {tile}",
           xlabel='Backscatter',
           ylabel='Frequency')
    if output:
        plt.savefig(output, dpi=300, transparent=True)
    plt.show()

In [None]:
# Choose the scale of the backscatter values used for the histograms and statistics
print("Select the scale in which to work:")
scale_choice = osl.select_parameter(['log scale', 'power scale'])
display(scale_choice)

# Choose whether the generated plots are also written to disk
print("Would you like to save output plots?")
save_choice = osl.select_parameter(["Save Plots", "Do not save plots"])
display(save_choice)

### Set thresholds for removing low and high value outliers

- Identify thresholds for outliers
- Change these values to limit the x range of the backscatter distribution histograms
- Changing these values will NOT change the already calculated means and standard deviations
- This will only affect the range of values plotted in the histogram

In [None]:
# Fraction of values cut from each tail when choosing the histogram x range:
# the upper limit is the (1 - high_outlier_thresh) quantile and the lower limit
# is the low_outlier_thresh quantile.
high_outlier_thresh = 0.01
low_outlier_thresh = 0.01

### Generate Histograms

In [None]:
save = save_choice.value == "Save Plots"

log = scale_choice.value == 'log scale'
pols = ['VH', 'VV']
slopes = ['foreslope', 'backslope', 'flat']

if save:
    plot_dir = data_dir.parent/f"{data_dir.name}_PLOTS"
    plot_dir.mkdir(exist_ok=True)


def _load_slope_layer(tile_dir, polarization, slope):
    """Return the flattened raster of one slope class for a tile and polarization.

    Raises FileNotFoundError when the tile directory holds no matching .tif file.
    """
    pth = next(tile_dir.glob(f"*{polarization}_clip_*_{slope}.tif"), None)
    if pth is None:
        raise FileNotFoundError(f"No {polarization} {slope} raster found in {tile_dir}")
    return rxr.open_rasterio(str(pth), masked=True).to_numpy().flatten()


def _power_to_db(power):
    """Convert power-scale backscatter to dB; values <= 0 become NaN rather than -inf."""
    return 10 * np.log10(np.where(power > 0, power, np.nan))


def _summarize(layers):
    """Return (means, stds, x_min, x_max) for the foreslope, backslope, and flat arrays.

    x_min and x_max are the histogram limits given by the outlier thresholds.
    """
    layer_means = [np.nanmean(layer) for layer in layers]
    layer_stds = [np.nanstd(layer) for layer in layers]
    x_max = max([np.nanquantile(layer, 1-high_outlier_thresh) for layer in layers])
    x_min = min([np.nanquantile(layer, low_outlier_thresh) for layer in layers])
    return layer_means, layer_stds, x_min, x_max


# Per polarization: one list of per-tile arrays for each slope class (fore, back, flat).
# The arrays are concatenated once after the loop instead of re-copying the
# growing totals on every tile.
tile_layers = {p: [[], [], []] for p in pols}
# Tiles that actually contributed data, per polarization
included_tiles = {p: [] for p in pols}

for i, m in enumerate(mgrs):
    tile = m.stem

    for p in pols:
        fore, back, flat = [_load_slope_layer(m, p, slope) for slope in slopes]

        if any(np.count_nonzero(~np.isnan(layer)) < 1000 for layer in (fore, back, flat)):
            print(f"Skipping Tile: {tile}")
            print(f"It contains a backscatter layer with less than 1000 data points")
            # NOTE: this skips the remaining polarizations of the tile; a polarization
            # processed before this point stays in the totals
            break

        if log:
            fore, back, flat = _power_to_db(fore), _power_to_db(back), _power_to_db(flat)

        for collected, layer in zip(tile_layers[p], (fore, back, flat)):
            collected.append(layer)
        included_tiles[p].append(tile)

        # calculate means and standard deviations before removing outliers
        # (also kept as individual module-level names for backward compatibility)
        fore_mean = np.nanmean(fore)
        fore_std = np.nanstd(fore)
        back_mean = np.nanmean(back)
        back_std = np.nanstd(back)
        flat_mean = np.nanmean(flat)
        flat_std = np.nanstd(flat)
        means = [fore_mean, back_mean, flat_mean]
        stds = [fore_std, back_std, flat_std]

        # Use the outlier thresholds defined in the previous cell to find the min and max x range for the histograms
        backscatter_max = max([np.nanquantile(layer, 1-high_outlier_thresh) for layer in (fore, back, flat)])
        backscatter_min = min([np.nanquantile(layer, low_outlier_thresh) for layer in (fore, back, flat)])

        output = plot_dir/f"{tile}_{p}" if save else None
        plot_backscatter_distributions_by_slope(fore, back, flat, means, stds, p, tile, data_dir.stem, [backscatter_min, backscatter_max], output)

    # uncomment following 2 lines for development if stopping after 1st plot
    # if i == 0:
    #     break

# Full scene arrays per polarization, ordered [foreslope, backslope, flat].
# The leading empty array keeps the result float64 and covers the case of no tiles.
vh_total = [np.concatenate([np.array([])] + collected) for collected in tile_layers['VH']]
vv_total = [np.concatenate([np.array([])] + collected) for collected in tile_layers['VV']]

# Full (not outlier-filtered) foreslope and backslope values, gathered for the T-tests
vh_fore, vh_back = vh_total[0], vh_total[1]
vv_fore, vv_back = vv_total[0], vv_total[1]

if save:
    vh_output = f"{plot_dir}/full_scene_VH_PLOT"
    vv_output = f"{plot_dir}/full_scene_VV_PLOT"
else:
    vh_output = None
    vv_output = None

# Full scene statistics and summary histograms; the title lists only the tiles that were included
if included_tiles['VH']:
    vh_means, vh_stds, vh_min, vh_max = _summarize(vh_total)
    plot_backscatter_distributions_by_slope(vh_total[0], vh_total[1], vh_total[2], vh_means, vh_stds, 'FULL SCENE VH', included_tiles['VH'],
                                            data_dir.stem, [vh_min, vh_max], vh_output)
else:
    print("No tile contributed VH data; skipping the full scene VH plot")

if included_tiles['VV']:
    vv_means, vv_stds, vv_min, vv_max = _summarize(vv_total)
    plot_backscatter_distributions_by_slope(vv_total[0], vv_total[1], vv_total[2], vv_means, vv_stds, 'FULL SCENE VV', included_tiles['VV'],
                                            data_dir.stem, [vv_min, vv_max], vv_output)
else:
    print("No tile contributed VV data; skipping the full scene VV plot")

## Determine whether or not to remove outliers for T-Testing

- If removing outliers:
    - define the outlier thresholds
    - create copies of the arrays in which the outliers are replaced with NaN

In [None]:
# Choose whether the T-test samples are drawn from the full data or from outlier-filtered data
print("When performing T-Tests, do you wish to remove or keep outliers?")
outlier_choice = osl.select_parameter(["Keep Outliers", "Remove Outliers"])
display(outlier_choice)

In [None]:
remove_outliers = outlier_choice.value == "Remove Outliers"

if remove_outliers:
    print("Outlier thresholds are given as a decimal percentage")
    print("A low value threshold of 0.01 would remove values in the 1% quantile\n")
    print("A high value threshold of 0.01 would remove values in the 99% quantile\n")
    low_input = float(input("Enter a low value outlier threshold"))
    high_input = float(input("Enter a high value outlier threshold"))

    # The thresholds are used as quantiles, so each must lie in [0, 1]; together they
    # must leave part of the distribution, otherwise every value would be removed.
    # Validate before overwriting the thresholds so a bad entry leaves them untouched.
    if not (0 <= low_input <= 1 and 0 <= high_input <= 1):
        raise ValueError("Outlier thresholds must be between 0 and 1")
    if low_input + high_input >= 1:
        raise ValueError("The low and high outlier thresholds must sum to less than 1")

    low_outlier_thresh = low_input
    high_outlier_thresh = high_input

In [None]:
def _drop_outliers(fore, back, low_thresh, high_thresh):
    """Return copies of `fore` and `back` with outliers replaced by NaN.

    Both arrays share one pair of limits: the lower limit is the smaller of their
    `low_thresh` quantiles and the upper limit is the larger of their
    `1 - high_thresh` quantiles. Only values strictly between the limits are kept.
    """
    upper = max([np.nanquantile(fore, 1-high_thresh), np.nanquantile(back, 1-high_thresh)])
    lower = min([np.nanquantile(fore, low_thresh), np.nanquantile(back, low_thresh)])

    def _keep_inside(values):
        # NaN compares False on both sides, so missing values stay NaN
        return np.where((values > lower) & (values < upper), values, np.nan)

    return _keep_inside(fore), _keep_inside(back)


if remove_outliers:
    vh_f, vh_b = _drop_outliers(vh_fore, vh_back, low_outlier_thresh, high_outlier_thresh)
    vv_f, vv_b = _drop_outliers(vv_fore, vv_back, low_outlier_thresh, high_outlier_thresh)
else:
    # keep outliers: use the full arrays unchanged
    vh_f = vh_fore
    vh_b = vh_back
    vv_f = vv_fore
    vv_b = vv_back

---
## Subset datasets for T-tests

For each polarization:
- avoid using adjoining foreslope and backslope pixels to ensure independent sampling
    - keep every 30th foreslope pixel value, starting at index 0
    - keep every 30th backslope pixel value, starting at index 15
- remove nan values from subsets
- randomly shuffle subsets to avoid limiting samples to a geographic region
- keep first 5000 pixels from each subset

In [None]:
def _draw_subset(values, offset, stride=30, size=5000):
    """Return up to `size` shuffled, non-NaN values taken from every `stride`-th element.

    Sampling starts at index `offset`. Indexing with the boolean mask creates a
    copy, so the in-place shuffle does not touch `values`.
    """
    picked = values[offset::stride]
    picked = picked[~np.isnan(picked)]
    np.random.shuffle(picked)
    return picked[:size]


# Foreslope samples start at index 0 and backslope samples at index 15, so the
# two subsets never use the same array positions.
vh_fore_subset = _draw_subset(vh_f, 0)
vh_back_subset = _draw_subset(vh_b, 15)
vv_fore_subset = _draw_subset(vv_f, 0)
vv_back_subset = _draw_subset(vv_b, 15)

---
## Perform Shapiro-Wilk tests to check whether the subset backscatter data are normally distributed for each polarization and slope

### VH Foreslope Normality

In [None]:
# Histogram of the VH foreslope subset, for a visual check of its distribution
_, ax = plt.subplots(figsize=(8, 6))
ax.hist(vh_fore_subset, bins=200)
ax.set(title='VH Foreslope Subset Pixel Distribution', xlabel='backscatter', ylabel='Frequency')
plt.show()

In [None]:
# Shapiro-Wilk test on the VH foreslope subset
vh_fore_shapiro = stats.shapiro(vh_fore_subset)
print(f"{vh_fore_shapiro}\n")
# p >= 0.05 means normality is not rejected at the 5% significance level
vh_fore_normal = vh_fore_shapiro.pvalue >= 0.05
if vh_fore_normal:
    print(f"The VH foreslope subset backscatter values are normally distributed")
else:
    print(f"The VH foreslope subset backscatter values are NOT normally distributed")

### VH Backslope Normality

In [None]:
# Histogram of the VH backslope subset, for a visual check of its distribution
_, ax = plt.subplots(figsize=(8, 6))
ax.hist(vh_back_subset, bins=200)
ax.set(title='VH Backslope Subset Pixel Distribution', xlabel='backscatter', ylabel='Frequency')
plt.show()

In [None]:
# Shapiro-Wilk test on the VH backslope subset
vh_back_shapiro = stats.shapiro(vh_back_subset)
print(f"{vh_back_shapiro}\n")
# p >= 0.05 means normality is not rejected at the 5% significance level
vh_back_normal = vh_back_shapiro.pvalue >= 0.05
if vh_back_normal:
    print(f"The VH backslope subset backscatter values are normally distributed")
else:
    print(f"The VH backslope subset backscatter values are NOT normally distributed")

### VV Foreslope Normality

In [None]:
# Histogram of the VV foreslope subset, for a visual check of its distribution
_, ax = plt.subplots(figsize=(8, 6))
ax.hist(vv_fore_subset, bins=200)
ax.set(title='VV Foreslope Subset Pixel Distribution', xlabel='backscatter', ylabel='Frequency')
plt.show()

In [None]:
# Shapiro-Wilk test on the VV foreslope subset
vv_fore_shapiro = stats.shapiro(vv_fore_subset)
print(f"{vv_fore_shapiro}\n")
# p >= 0.05 means normality is not rejected at the 5% significance level
vv_fore_normal = vv_fore_shapiro.pvalue >= 0.05
if vv_fore_normal:
    print(f"The VV foreslope subset backscatter values are normally distributed")
else:
    print(f"The VV foreslope subset backscatter values are NOT normally distributed")

### VV Backslope Normality

In [None]:
# Histogram of the VV backslope subset, for a visual check of its distribution
_, ax = plt.subplots(figsize=(8, 6))
ax.hist(vv_back_subset, bins=200)
ax.set(title='VV Backslope Subset Pixel Distribution', xlabel='backscatter', ylabel='Frequency')
plt.show()

In [None]:
# Shapiro-Wilk test on the VV backslope subset
vv_back_shapiro = stats.shapiro(vv_back_subset)
print(f"{vv_back_shapiro}\n")
# p >= 0.05 means normality is not rejected at the 5% significance level
vv_back_normal = vv_back_shapiro.pvalue >= 0.05
if vv_back_normal:
    print(f"The VV backslope subset backscatter values are normally distributed")
else:
    print(f"The VV backslope subset backscatter values are NOT normally distributed")

---
## VH T-Tests

### Print some general sample stats

In [None]:
# Descriptive statistics of the two VH samples that are compared below
print(f"vh_fore_subset:\n{stats.describe(vh_fore_subset)}")
print(f"\nvh_back_subset:\n{stats.describe(vh_back_subset)}")

### VH T-test for means of two independent samples from descriptive statistics.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html#scipy.stats.ttest_ind_from_stats

- This is a test for the null hypothesis that two independent samples have identical average (expected) values.

In [None]:
# ttest_ind_from_stats expects the corrected sample standard deviation (ddof=1);
# np.std defaults to the population standard deviation (ddof=0).
# equal_var=False selects Welch's T-test, which does not assume equal variances.
stats.ttest_ind_from_stats(np.mean(vh_fore_subset), np.std(vh_fore_subset, ddof=1), len(vh_fore_subset),
                           np.mean(vh_back_subset), np.std(vh_back_subset, ddof=1), len(vh_back_subset),
                           equal_var=False, alternative='two-sided')

### VH T-test for the means of two independent samples.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html

- This is a test for the null hypothesis that 2 independent samples have identical average (expected) values

In [None]:
# equal_var=False selects Welch's T-test, which does not assume equal variances
stats.ttest_ind(vh_fore_subset, vh_back_subset, equal_var=False)

### VH T-test on TWO RELATED samples of scores, a and b.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html

- This is a test for the null hypothesis that two related or repeated samples have identical average (expected) values.

In [None]:
# ttest_rel pairs values by position and raises on arrays of unequal length.
# The subsets differ in length when fewer than 5000 valid values remain in one
# of them, so both are trimmed to the shorter length.
# NOTE(review): the subsets are shuffled independently, so the pairing is arbitrary.
vh_n_pairs = min(len(vh_fore_subset), len(vh_back_subset))
stats.ttest_rel(vh_fore_subset[:vh_n_pairs], vh_back_subset[:vh_n_pairs])

---
## VV T-Tests

### Print some general sample stats

In [None]:
# Descriptive statistics of the two VV samples that are compared below
print(f"vv_fore_subset:\n{stats.describe(vv_fore_subset)}")
print(f"\nvv_back_subset:\n{stats.describe(vv_back_subset)}")

### VV T-test for means of two independent samples from descriptive statistics.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html#scipy.stats.ttest_ind_from_stats

- This is a test for the null hypothesis that two independent samples have identical average (expected) values.

In [None]:
# ttest_ind_from_stats expects the corrected sample standard deviation (ddof=1);
# np.std defaults to the population standard deviation (ddof=0).
# equal_var=False selects Welch's T-test, which does not assume equal variances.
stats.ttest_ind_from_stats(np.mean(vv_fore_subset), np.std(vv_fore_subset, ddof=1), len(vv_fore_subset),
                           np.mean(vv_back_subset), np.std(vv_back_subset, ddof=1), len(vv_back_subset),
                           equal_var=False, alternative='two-sided')

### VV T-test for the means of two independent samples.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html

- This is a test for the null hypothesis that 2 independent samples have identical average (expected) values

In [None]:
# equal_var=False selects Welch's T-test, which does not assume equal variances
stats.ttest_ind(vv_fore_subset, vv_back_subset, equal_var=False)

### VV T-test on TWO RELATED samples of scores, a and b.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html

- This is a test for the null hypothesis that two related or repeated samples have identical average (expected) values.

In [None]:
# ttest_rel pairs values by position and raises on arrays of unequal length.
# The subsets differ in length when fewer than 5000 valid values remain in one
# of them, so both are trimmed to the shorter length.
# NOTE(review): the subsets are shuffled independently, so the pairing is arbitrary.
vv_n_pairs = min(len(vv_fore_subset), len(vv_back_subset))
stats.ttest_rel(vv_fore_subset[:vv_n_pairs], vv_back_subset[:vv_n_pairs])

*Backscatter_Distributions_by_Slope - Version 0.1.0 - April 2022*