# Backscatter Distributions by Slope (Foreslope, Backslope, Flat)

**Alex Lewandowski; Alaska Satellite Facility, University of Alaska Fairbanks**

## Plots the RTC backscatter distributions of each slope category of every MGRS tile and polarization

**Notebook Requires**
- MGRS tiles of prepared OPERA RTC CalVal data created with Prep_OPERA_RTC_CalVal_data_stage1_part3.ipynb

In [None]:
import copy
from ipyfilechooser import FileChooser
import numpy.ma as ma
import numpy as np
import pandas as pd
from pathlib import Path
from pprint import pprint
import rioxarray as rxr
from scipy import stats

from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
import matplotlib.lines as lines
from matplotlib.offsetbox import AnchoredText

import opensarlab_lib as osl

In [None]:
# ******* IMPORTANT NOTE *******
# Updated data has not yet been stored to S3 (as of May 27 2022). 
# Update March 5 2023, Chip 1 Tree Cover data now accessible with more soon to follow

# Ask where the data should come from. The selection is read back from
# `data_source.value` and compared against the entries of `sources`.
print("Select whether you will be downloading data or accessing data already stored on your volume.")
sources = ['Download Data from S3 Bucket', 'Access Locally Stored Data']
data_source = osl.select_parameter(sources)
display(data_source)

In [None]:
# Translate the widget selection into two flags: `sources[0]` is the S3 download
# option and `sources[1]` is the locally stored data option.
s3 = data_source.value == sources[0]
local = data_source.value == sources[1]
# For development, the widget can be bypassed by setting the flags directly:
# s3 = False
# local = True

if local:
    # Local data: let the user browse to the dataset directory, starting at the
    # current working directory.
    print("Select the directory holding your MGRS tile sub-directories")
    fc = FileChooser(Path.cwd())
    display(fc)
elif s3:
    # Maps a human-readable dataset description to the archive name that is later
    # appended (after the land cover prefix) to the S3 path.
    chips = {
        "Chip 1 (Amazon Rainforest), Descending Orbit, Summer": "S1A_IW_20150430T092132_DVP_RTC30_G_gpuned_35F5.zip",
        "Chip 2 (Southern California), Ascending Orbit, Summer": "S1A_IW_20201107T015034_DVP_RTC30_G_gpuned_B760.zip",
        "Chip 2 (Southern California), Descending Orbit, Summer": "S1A_IW_20201107T135214_DVP_RTC30_G_gpuned_A5CF.zip",
        "Chip 3 (Interior Alaska), Ascending Orbit, Summer": "S1B_IW_20200730T032005_DVP_RTC30_G_gpuned_5B61.zip",
        "Chip 3 (Interior Alaska), Ascending Orbit, Winter": "S1B_IW_20200120T031959_DVP_RTC30_G_gpuned_97CF.zip",
        "Chip 3 (Interior Alaska), Descending Orbit, Summer": "S1A_IW_20200716T161219_DVP_RTC30_G_gpuned_1B48.zip",
        "Chip 3 (Interior Alaska), Descending Orbit, Winter": "S1A_IW_20200118T161214_DVP_RTC30_G_gpuned_3D73.zip",
        "Chip 4 (Vermont and New Hampshire), Ascending Orbit, Summer": "S1A_IW_20200831T224357_DVP_RTC30_G_gpuned_6B97.zip",
        # NOTE(review): this archive name does not follow the S1*_IW_* pattern of the
        # other entries and names chip 3; it looks like a placeholder because the
        # chip 4 descending dataset is not in the S3 bucket -- TODO confirm/replace.
        "Chip 4 (Vermont and New Hampshire), Descending Orbit, Summer": "mgrs_chip3_desc_winter.zip", # chip_4_desc not in s3 bucket
        "Chip 5 (Eastern Siberia), Ascending Orbit, Summer": "S1A_IW_20200622T092025_DVP_RTC30_G_gpuned_2A64.zip",
        "Chip 5 (Eastern Siberia), Descending Orbit, Summer": "S1B_IW_20200617T215129_DVP_RTC30_G_gpuned_210E.zip"
    }
    
    # Maps the land cover label shown to the user to the prefix used in the
    # archive name on S3.
    landcovers = {
        "Tree Cover": "tree_cover",
        "Shrub": "shrub",
        "Herbaceous Vegetation": "herbs",
        "Agriculture": "agriculture"
    }
    
    print("Select a dataset:")
    chip = osl.select_parameter(chips)
    display(chip)
    
    print("\nSelect a land cover classification:")
    landcover = osl.select_parameter(landcovers)
    display(landcover)
    

In [None]:
if local: 
    data_dir = Path(fc.selected_path)
elif s3:
    s3_pth = "s3://asf-opera-rtc-calval/compare_gamma0_on_foreslope_flat_backslope/sentinel_1/prepped_for_slope_comparisons"
    s3_pth = f"{s3_pth}/{landcover.value.lower()}_{chip.value}"
    ds_zip = Path(s3_pth).name
    # !aws --region=us-west-2 --no-sign-request s3 cp {s3_pth} {ds_zip} # if public bucket access
    !aws s3 cp {s3_pth} {ds_zip}
    osl.asf_unzip(str(Path.cwd()), ds_zip)
    Path(ds_zip).unlink()
    data_dir = Path.cwd()/Path(ds_zip).stem
    
mgrs = list()
full_scene_tiffs = list()
for p in Path(data_dir).iterdir():
    if p.is_dir() and len(p.name) == 5:
        mgrs.append(p)
    else:
        full_scene_tiffs.append(p)
        
pprint(mgrs)
print()
pprint(full_scene_tiffs)

## Plot Backscatter Distributions

In [None]:
def plot_backscatter_distributions_by_slope(fore, back, flat, central_moments, polarization, dataset_name, tile=None, backscatter_minmax=None, output=None):
    """Plot overlaid foreslope, backslope, and flat backscatter histograms.

    Each histogram is drawn as a 200-bin step outline. The region within one
    standard deviation of its mean is shaded and a dashed line marks the mean.
    Pixel counts and the supplied statistics are listed in a text box to the
    right of the axes.

    Parameters
    ----------
    fore, back, flat : array-like of float
        Backscatter values for each slope category. NaN values are excluded
        from the reported pixel counts.
    central_moments : sequence
        ``[means, medians, modes, stds]``, each of which is ordered
        ``[foreslope, backslope, flat]``.
    polarization : str
        Label inserted into the plot title.
    dataset_name : str
        Second line of the plot title.
    tile : str, optional
        MGRS tile name; when given it is added as a third title line.
    backscatter_minmax : sequence of two floats, optional
        Passed to ``Axes.hist`` as ``range``.
    output : path-like, optional
        When given, the figure is saved there (300 dpi, transparent
        background) before it is shown.
    """
    # Use the statistics that were passed in; relying on notebook globals named
    # `means`/`stds` breaks as soon as the caller names them differently.
    means = central_moments[0]
    stds = central_moments[3]

    # create histograms
    _, ax = plt.subplots(figsize=(18, 8))
    n_bins = 200
    colors = ['blue', 'green', 'darkorange']
    _, _, patches = ax.hist([fore, back, flat], n_bins, color=colors,
                            range=backscatter_minmax, histtype='step')

    # fill 1st standard deviation for each histogram and add line at mean
    std_colors = ['skyblue', 'lightgreen', 'orange']
    for j, hist in enumerate(patches):
        outline = hist[0]
        outline_path = outline.get_path()
        y_max = outline_path.get_extents().y1
        std_hist = plt.Polygon(outline_path.vertices, color=std_colors[j], fill=True, alpha=0.2)
        ax.add_patch(std_hist)
        # Invisible rectangle used only as a clip region. Rectangle takes its
        # lower-left corner, so it is anchored at (mean - std, 0) to span the
        # full histogram height between mean - std and mean + std.
        std_clip = Rectangle((means[j] - stds[j], 0), stds[j] * 2, y_max,
                             fill=True, visible=False)
        ax.add_patch(std_clip)
        std_hist.set_clip_path(std_clip)
        mean_line = lines.Line2D([means[j], means[j]], [0, y_max], color=colors[j], ls='--')
        ax.add_artist(mean_line)
        # Clip the mean line to the histogram outline so it stops at the curve.
        mean_line.set_clip_path(outline)

    # statistics text box: one section per statistic, one row per slope category
    row_labels = ("foreslope:  ", "backslope: ", "flat:           ")
    pixel_counts = [np.count_nonzero(~np.isnan(values)) for values in (fore, back, flat)]
    sections = (
        ("PIXEL COUNTS:", pixel_counts),
        ("MEAN:", central_moments[0]),
        ("MEDIAN:", central_moments[1]),
        ("MODE:", central_moments[2]),
        ("STANDARD DEVIATION:", central_moments[3]),
    )
    annotation_text = "\n\n".join(
        "\n".join([header] + [f"{label}{value}" for label, value in zip(row_labels, values)])
        for header, values in sections
    )
    annotation = AnchoredText(
        annotation_text,
        loc='upper left', prop=dict(size=12), frameon=True, bbox_to_anchor=(1.0, 1.0), bbox_transform=ax.transAxes)
    annotation.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
    ax.add_artist(annotation)

    # Each ax.legend() call replaces the axes' current legend, so every legend is
    # re-added as a plain artist to keep all three visible.

    # add histogram legend
    hist_handles = [lines.Line2D([0, 1], [0, 0], lw=1, color=c) for c in colors]
    hist_legend = ax.legend(handles=hist_handles, labels=['foreslope', 'backslope', 'flat'], loc='upper right')
    ax.add_artist(hist_legend)

    # add standard deviation legend
    std_handles = [Rectangle((0, 0), 1, 1, color=c, ec="k", alpha=0.2) for c in std_colors]
    std_legend = ax.legend(handles=std_handles, labels=['foreslope 1 std', 'backslope 1 std', 'flat 1 std'], loc='center right', bbox_to_anchor=(1, 0.75))
    ax.add_artist(std_legend)

    # add mean legend
    mean_handles = [lines.Line2D([0, 0], [0, 1], color=c, ls='--') for c in colors]
    mean_legend = ax.legend(handles=mean_handles, labels=['foreslope mean', 'backslope mean', 'flat mean'], loc='center right', bbox_to_anchor=(1, 0.55))
    ax.add_artist(mean_legend)

    title = f"Distribution of {polarization} Foreslope, Backslope, and Flat Backscatter Values\n{dataset_name}"
    if tile:
        title = f"{title}\nMGRS: {tile}"

    ax.set(title=title,
           xlabel='Backscatter',
           ylabel='Frequency')
    if output:
        plt.savefig(output, dpi=300, transparent=True)
    plt.show()

In [None]:
# Let the user pick the backscatter scale and whether plots are written to disk.
scale_options = ['log scale', 'power scale']
print("Select the scale in which to work:")
scale_choice = osl.select_parameter(scale_options)
display(scale_choice)

save_options = ["Save Plots", "Do not save plots"]
print("Would you like to save output plots?")
save_choice = osl.select_parameter(save_options)
display(save_choice)

### Generate Histograms for MGRS Tiles

In [None]:
save = save_choice.value == "Save Plots"

log = scale_choice.value == 'log scale'
pols = ['VH', 'VV']

vh_total = [np.array([]), np.array([]), np.array([])]
vv_total = [np.array([]), np.array([]), np.array([])]

if save:
    plot_dir = data_dir.parent/f"{data_dir.name}_PLOTS"
    plot_dir.mkdir(exist_ok=True)

# Fixed x-axis ranges per polarization, expressed in dB (log scale).
db_ranges = {'VH': [-19, -7], 'VV': [-14, -1]}
slope_names = ('foreslope', 'backslope', 'flat')
min_valid_pixels = 1000


def _slope_stats(values):
    """Return (mean, median, mode, std) of *values*, ignoring NaNs."""
    valid = values[~np.isnan(values)]
    # stats.mode returns arrays in older SciPy releases and scalars in newer ones;
    # np.atleast_1d(...)[0] yields the modal value in both cases.
    mode = np.atleast_1d(stats.mode(valid).mode)[0]
    return np.nanmean(values), np.nanmedian(values), mode, np.nanstd(values)


for i, m in enumerate(mgrs):
    tile = m.stem
    for p in pols:
        # Load the three slope rasters of this tile/polarization as flat arrays;
        # masked=True turns nodata pixels into NaN.
        fore, back, flat = (
            rxr.open_rasterio(str(list(m.glob(f"*{p}_clip_{slope}_*.tif"))[0]), masked=True).to_numpy().flatten()
            for slope in slope_names
        )

        if any(np.count_nonzero(~np.isnan(layer)) < min_valid_pixels for layer in (fore, back, flat)):
            print(f"Skipping Tile: {tile}")
            print("It contains a backscatter layer with less than 1000 data points")
            # Leave the polarization loop so the remaining plots of this tile are skipped.
            break

        if log:
            fore = 10 * np.log10(fore)
            back = 10 * np.log10(back)
            flat = 10 * np.log10(flat)

        # calculate central moments; `means` and `stds` stay available as
        # notebook-level names alongside `central_moments`
        means, medians, modes, stds = (
            list(column) for column in zip(*(_slope_stats(layer) for layer in (fore, back, flat)))
        )
        central_moments = [means, medians, modes, stds]

        # Tiles are direct children of data_dir, so their plots go to plot_dir.
        output = plot_dir/f"{m.stem}_{p}" if save else None

        # The fixed ranges are in dB; convert them when working in power scale,
        # otherwise every (positive) power value falls outside the range.
        minmax = db_ranges[p]
        if not log:
            minmax = [10 ** (bound / 10) for bound in minmax]

        plot_backscatter_distributions_by_slope(fore, back, flat, central_moments, p, data_dir.stem, tile=tile, backscatter_minmax=minmax, output=output)

    ## uncomment following 2 lines for development if stopping after 2nd plot
    # if i == 1:
    #     break

### Generate Summary Histograms for Full Scene Data

In [None]:
def _slope_stats(values):
    """Return (mean, median, mode, std) of *values*, ignoring NaNs."""
    valid = values[~np.isnan(values)]
    # stats.mode returns arrays in older SciPy releases and scalars in newer ones;
    # np.atleast_1d(...)[0] yields the modal value in both cases.
    mode = np.atleast_1d(stats.mode(valid).mode)[0]
    return np.nanmean(values), np.nanmedian(values), mode, np.nanstd(values)


for p in pols:
    fore_pth = list(data_dir.glob(f"*{p}_clip_foreslope.tif"))[0]
    back_pth = list(data_dir.glob(f"*{p}_clip_backslope.tif"))[0]
    flat_pth = list(data_dir.glob(f"*{p}_clip_flat.tif"))[0]

    # masked=True turns nodata pixels into NaN
    fore = rxr.open_rasterio(str(fore_pth), masked=True).to_numpy().flatten()
    back = rxr.open_rasterio(str(back_pth), masked=True).to_numpy().flatten()
    flat = rxr.open_rasterio(str(flat_pth), masked=True).to_numpy().flatten()

    if log:
        fore = 10 * np.log10(fore)
        back = 10 * np.log10(back)
        flat = 10 * np.log10(flat)

    # calculate central moments for full scene; `means` and `stds` stay available
    # as notebook-level names alongside `central_moments`
    means, medians, modes, stds = (
        list(column) for column in zip(*(_slope_stats(layer) for layer in (fore, back, flat)))
    )
    central_moments = [means, medians, modes, stds]

    if save:
        output = f"{plot_dir}/full_scene_{p}_PLOT"
    else:
        output = None

    # Keep the full-scene foreslope/backslope arrays per polarization for the
    # t-test cells further down; the fixed x-axis ranges are expressed in dB.
    if p == 'VH':
        minmax = [-19, -7]
        vh_fore = fore
        vh_back = back
    else:
        minmax = [-14, -1]
        vv_fore = fore
        vv_back = back

    # Convert the dB range when working in power scale, otherwise every
    # (positive) power value falls outside the histogram range.
    if not log:
        minmax = [10 ** (bound / 10) for bound in minmax]

    plot_backscatter_distributions_by_slope(fore, back, flat, central_moments, f'FULL SCENE {p}', data_dir.stem, backscatter_minmax=minmax, output=output)

---
## Subset datasets for T-tests

For each polarization:
- avoid using adjoining foreslope and backslope pixels to ensure independent sampling
    - keep every 30th foreslope pixel value, starting at index 0
    - keep every 30th backslope pixel value, starting at index 15
- remove nan values from subsets
- randomly select n pixels, without replacement, from each subset (n is defined by `sample_size` below)

In [None]:
sample_size = 500


def _draw_subset(pixels, offset):
    """Thin *pixels* to every 30th value from *offset*, drop NaNs, and sample."""
    thinned = pixels[offset::30]
    finite = thinned[~np.isnan(thinned)]
    return np.random.choice(finite, size=sample_size, replace=False)


# Foreslope subsets start at index 0 and backslope subsets at index 15, so the two
# never draw from the same flattened pixel positions.
vh_fore_subset = _draw_subset(vh_fore, 0)
vh_back_subset = _draw_subset(vh_back, 15)
vv_fore_subset = _draw_subset(vv_fore, 0)
vv_back_subset = _draw_subset(vv_back, 15)

---
## Perform Anderson-Darling and Shapiro-Wilk tests to check whether the subset backscatter data are normally distributed for each polarization and slope

### VH Foreslope Normality

In [None]:
import math
# Descriptive statistics of the VH foreslope subset
vh_fore_stats = stats.describe(vh_fore_subset)
print(f"vh_fore_subset:\n{vh_fore_stats}")
mean = vh_fore_stats.mean
# The describe result exposes the variance; its square root is the standard deviation
std = math.sqrt(vh_fore_stats.variance)

In [None]:
# Histogram of the sampled VH foreslope pixels (50 bins)
_, ax = plt.subplots(figsize=(8, 6))
ax.hist(vh_fore_subset, bins=50)
ax.set_title('VH Foreslope Subset Pixel Distribution')
ax.set_xlabel('backscatter')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Anderson-Darling test of the VH foreslope subset against a normal distribution;
# as the last expression of the cell, the result is shown as the cell output.
stats.anderson(vh_fore_subset, dist='norm')

In [None]:
# Shapiro-Wilk test: the subset is reported as normal when p >= 0.05
vh_fore_shapiro = stats.shapiro(vh_fore_subset)
print(f"{vh_fore_shapiro}\n")
vh_fore_normal = vh_fore_shapiro.pvalue >= 0.05
print(
    "The VH foreslope subset backscatter values are normally distributed"
    if vh_fore_normal
    else "The VH foreslope subset backscatter values are NOT normally distributed"
)

### VH Backslope Normality

In [None]:
# Histogram of the sampled VH backslope pixels. Uses 50 bins, matching the
# VH foreslope subset plot; 200 bins spread the `sample_size` pixels too thinly
# to show the shape of the distribution.
_, ax = plt.subplots(figsize=(8, 6))
ax.hist(vh_back_subset, bins=50)
ax.set(title='VH Backslope Subset Pixel Distribution', xlabel='backscatter', ylabel='Frequency')
plt.show()

In [None]:
# Anderson-Darling test of the VH backslope subset against a normal distribution;
# as the last expression of the cell, the result is shown as the cell output.
stats.anderson(vh_back_subset, dist='norm')

In [None]:
# Shapiro-Wilk test: the subset is reported as normal when p >= 0.05
vh_back_shapiro = stats.shapiro(vh_back_subset)
print(f"{vh_back_shapiro}\n")
vh_back_normal = vh_back_shapiro.pvalue >= 0.05
print(
    "The VH backslope subset backscatter values are normally distributed"
    if vh_back_normal
    else "The VH backslope subset backscatter values are NOT normally distributed"
)

### VV Foreslope Normality

In [None]:
# Histogram of the sampled VV foreslope pixels. Uses 50 bins, matching the
# VH foreslope subset plot; 200 bins spread the `sample_size` pixels too thinly
# to show the shape of the distribution.
_, ax = plt.subplots(figsize=(8, 6))
ax.hist(vv_fore_subset, bins=50)
ax.set(title='VV Foreslope Subset Pixel Distribution', xlabel='backscatter', ylabel='Frequency')
plt.show()

In [None]:
# Anderson-Darling test of the VV foreslope subset against a normal distribution;
# as the last expression of the cell, the result is shown as the cell output.
stats.anderson(vv_fore_subset, dist='norm')

In [None]:
# Shapiro-Wilk test: the subset is reported as normal when p >= 0.05
vv_fore_shapiro = stats.shapiro(vv_fore_subset)
print(f"{vv_fore_shapiro}\n")
vv_fore_normal = vv_fore_shapiro.pvalue >= 0.05
print(
    "The VV foreslope subset backscatter values are normally distributed"
    if vv_fore_normal
    else "The VV foreslope subset backscatter values are NOT normally distributed"
)

### VV Backslope Normality

In [None]:
# Histogram of the sampled VV backslope pixels. Uses 50 bins, matching the
# VH foreslope subset plot; 200 bins spread the `sample_size` pixels too thinly
# to show the shape of the distribution.
_, ax = plt.subplots(figsize=(8, 6))
ax.hist(vv_back_subset, bins=50)
ax.set(title='VV Backslope Subset Pixel Distribution', xlabel='backscatter', ylabel='Frequency')
plt.show()

In [None]:
# Anderson-Darling test of the VV backslope subset against a normal distribution;
# as the last expression of the cell, the result is shown as the cell output.
stats.anderson(vv_back_subset, dist='norm')

In [None]:
# Shapiro-Wilk test: the subset is reported as normal when p >= 0.05
vv_back_shapiro = stats.shapiro(vv_back_subset)
print(f"{vv_back_shapiro}\n")
vv_back_normal = vv_back_shapiro.pvalue >= 0.05
print(
    "The VV backslope subset backscatter values are normally distributed"
    if vv_back_normal
    else "The VV backslope subset backscatter values are NOT normally distributed"
)

---
## VH T-Tests

### Print some general sample stats

In [None]:
# Print descriptive statistics for both VH subsets
for heading, sample in (("vh_fore_subset", vh_fore_subset), ("\nvh_back_subset", vh_back_subset)):
    print(f"{heading}:\n{stats.describe(sample)}")

### VH T-test for means of two independent samples from descriptive statistics.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html#scipy.stats.ttest_ind_from_stats

- This is a test for the null hypothesis that two independent samples have identical average (expected) values.

In [None]:
# Welch's t-test (equal_var=False) computed from summary statistics.
# ttest_ind_from_stats expects the corrected sample standard deviation, so np.std
# must be called with ddof=1 (its default, ddof=0, is the population value).
stats.ttest_ind_from_stats(np.mean(vh_fore_subset), np.std(vh_fore_subset, ddof=1), len(vh_fore_subset),
                           np.mean(vh_back_subset), np.std(vh_back_subset, ddof=1), len(vh_back_subset),
                           equal_var=False, alternative='two-sided')

### VH T-test for the means of two independent samples.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html

- This is a test for the null hypothesis that 2 independent samples have identical average (expected) values

In [None]:
# Two-sample t-test on the VH subsets without assuming equal variances
# (equal_var=False); the result is shown as the cell output.
stats.ttest_ind(vh_fore_subset, vh_back_subset, equal_var=False)

### VH T-test on TWO RELATED samples of scores, a and b.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html

- This is a test for the null hypothesis that two related or repeated samples have identical average (expected) values.

In [None]:
# Paired t-test on the VH subsets; the result is shown as the cell output.
# NOTE(review): both subsets are drawn independently with np.random.choice, so
# element i of one subset is not paired with element i of the other -- confirm
# that a related-samples test is intended here.
stats.ttest_rel(vh_fore_subset, vh_back_subset)

---
## VV T-Tests

### Print some general sample stats

In [None]:
# Print descriptive statistics for both VV subsets
for heading, sample in (("vv_fore_subset", vv_fore_subset), ("\nvv_back_subset", vv_back_subset)):
    print(f"{heading}:\n{stats.describe(sample)}")

### VV T-test for means of two independent samples from descriptive statistics.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html#scipy.stats.ttest_ind_from_stats

- This is a test for the null hypothesis that two independent samples have identical average (expected) values.

In [None]:
# Welch's t-test (equal_var=False) computed from summary statistics.
# ttest_ind_from_stats expects the corrected sample standard deviation, so np.std
# must be called with ddof=1 (its default, ddof=0, is the population value).
stats.ttest_ind_from_stats(np.mean(vv_fore_subset), np.std(vv_fore_subset, ddof=1), len(vv_fore_subset),
                           np.mean(vv_back_subset), np.std(vv_back_subset, ddof=1), len(vv_back_subset),
                           equal_var=False, alternative='two-sided')

### VV T-test for the means of two independent samples.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html

- This is a test for the null hypothesis that 2 independent samples have identical average (expected) values

In [None]:
# Two-sample t-test on the VV subsets without assuming equal variances
# (equal_var=False); the result is shown as the cell output.
stats.ttest_ind(vv_fore_subset, vv_back_subset, equal_var=False)

### VV T-test on TWO RELATED samples of scores, a and b.
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html

- This is a test for the null hypothesis that two related or repeated samples have identical average (expected) values.

In [None]:
# Paired t-test on the VV subsets; the result is shown as the cell output.
# NOTE(review): both subsets are drawn independently with np.random.choice, so
# element i of one subset is not paired with element i of the other -- confirm
# that a related-samples test is intended here.
stats.ttest_rel(vv_fore_subset, vv_back_subset)

*Backscatter_Distributions_by_Slope - Version 2.0.0 - May 2022*

*Change log*

- *Do not remove outliers*
- *Fixed ranges for x-axis scaling*
  - *different ranges for VH and VV polarizations*
- *Add median and mode to central moments*