# Deforestation detection

## Imports

In [1]:
import geopandas as gpd
import hvplot.pandas
import hvplot.xarray  # noqa
import numpy as np
import pandas as pd
import xarray as xr

## Read the data

In [2]:
# Read the cube
cube_uri = "zip::https://git.geo.tuwien.ac.at/api/v4/projects/1266/repository/files/HLS_clip4plots_both_b30_v20.zarr.zip/raw?ref=dev&lfs=true"
full_cube = xr.open_dataset(
    cube_uri,
    engine="zarr",
    consolidated=False,
    ).compute()

# Read the points, reproject tro match the datacube, set the index
points_uri = "https://git.geo.tuwien.ac.at/public_projects/environmental-remote-sensing/-/raw/dev/timeline_points.geojson?ref_type=heads"
points = (
    gpd.read_file(points_uri)
    .to_crs(full_cube["spatial_ref"].attrs["crs_wkt"])
    .set_index("intact")
    )

## Specify the variables indicating cloud/shadow presence

In [3]:
cirrus_col = "cirrus cloud"
cloud_col = "cloud"
adjacent_col = "adjacent to cloud"
shadow_col = "cloud shadow"
tainted_cols = [adjacent_col, cloud_col, cirrus_col, shadow_col]

## Select the clearest observations

In [4]:
tainted_frame = (
    full_cube[tainted_cols].to_dataarray(dim="mask")
    .any(dim="mask").sum(dim=["x", "y"])
    .to_dataframe(name="count_tainted")
    .sort_values(by="count_tainted")
    )

monthly_clearest = (
    tainted_frame
    .groupby(pd.Grouper(freq='ME')).head(1)
    .index
    .values
    )
monthly_cube = full_cube.sel(time=monthly_clearest)

fully_clear = tainted_frame[tainted_frame["count_tainted"] == 0].index.values
clear_cube = full_cube.sel(time=fully_clear)

## How an image time series looks like

### Images need to be converted to 8 bits to set the color stretch

In [5]:
def to_rgb8(cube, r, g, b, vmax, vmin=0):
    selected = cube[[r, g, b]].to_dataarray("band")
    stretched = (selected - vmin)/(vmax - vmin)

    is_positive = ((stretched >= 0) & np.isfinite(stretched))
    positive = stretched.where(is_positive.all(dim="band"), 0)
    clipped = (
        np.clip(255 * positive, 0, 255)
        .astype(np.uint8)
        .expand_dims({"composite": [", ".join((r, g, b))]})
        )
    clipped["band"] = np.array(["r", "g", "b"], dtype="unicode")

    return clipped

def plot_rgb(*args, dimname="composite"):
    return xr.concat(args, dim=dimname).hvplot.rgb(
        x='x',
        y='y',
        bands='band',
        by=dimname,
        groupby="time",
        subplots=True,
        rasterize=True,
        data_aspect=1,
        xaxis=False,
        yaxis=None,
        widget_location="bottom"
        )

## Plot a time series of the images

In [6]:
plot_rgb(
    to_rgb8(monthly_cube, r="Red", g="Green", b="Blue", vmax=.15),
    to_rgb8(monthly_cube, r="NIRnarrow", g="SWIR1", b="Red", vmax=.40)
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'e7efa49a-d1a1-4702-bb74-4e45595d490b': {'version…

## Plot just the "fully" clear images

In [7]:
plot_rgb(
    to_rgb8(clear_cube, r="Red", g="Green", b="Blue", vmax=.15),
    to_rgb8(clear_cube, r="NIRnarrow", g="SWIR1", b="Red", vmax=.40)
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'e082a33d-9bd9-45a7-a341-1385827e37f8': {'version…

## Satellite pixel time series as tables

### Pick two points one that has been deforested, and one that is intact

In [8]:
# https://tutorial.xarray.dev/intermediate/indexing/advanced-indexing.html#orthogonal-indexing-in-xarray
sel_cube = full_cube.sel(
    x=xr.DataArray(points.geometry.x, dims="intact"),
    y=xr.DataArray(points.geometry.y, dims="intact"),
    method="nearest"
    )

### Flatten (xarray dataset/"cube" to pandas dataframe/table)

In [9]:
sel_frame = (
    sel_cube.drop_vars("spatial_ref").to_dataframe()
    .drop(columns=["x", "y"])
    .reset_index()
    )

### Prepare auxiliary variables for plotting

In [10]:
def normalized_difference(frame, positive, negative):
    return (frame[positive] - frame[negative]) / (frame[positive] + frame[negative])

sel_frame["NDVI"] = normalized_difference(sel_frame, "NIRnarrow", "Red")
sel_frame["NDMI"] = normalized_difference(sel_frame, "NIRnarrow", "SWIR1")
sel_frame["DOY"] = sel_frame["time"].dt.dayofyear

### Split the table into intact and deforested

In [11]:
deforested_frame = sel_frame[~sel_frame["intact"]]# to_table(xr.open_dataset(PARENT_DIR / "HLS_deforested_b30_v2.0.nc"))
intact_frame = sel_frame[sel_frame["intact"]]# to_table(xr.open_dataset(PARENT_DIR / "HLS_intact_b30_v2.0.nc"))

## Clouds and shadows taint our time series

In [12]:
intact_frame.loc[:, "flag"] = "clear"
intact_frame.loc[intact_frame[tainted_cols[1:]].any(axis=1), "flag"] = "cloud/shadow"
intact_frame.loc[intact_frame[tainted_cols[0]], "flag"] = "adjacent"


intact_frame.hvplot.scatter(x="time", y="Green", by="flag", color=["green", "black", "orange"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intact_frame.loc[:, "flag"] = "clear"


## Filter out flagged observations

In [13]:
intact_masked = intact_frame[~intact_frame[tainted_cols].any(axis=1)].copy()
deforested_masked = deforested_frame[~deforested_frame[tainted_cols].any(axis=1)].copy()

## Removed the cloud, the timeline becomes clearer

In [14]:
tetracolor_kwargs = {
    "x": "time",
    "y": ["Blue", "Green", "Red", "NIRnarrow"],
    "color": ["blue", "green", "red", "darkgray"],
}

intact_masked.hvplot(**tetracolor_kwargs)#.legend(loc="upper left", ncols=4);

## Spikes could be unmasked clouds/shadows

A massive value difference respective to its neighbors indicates the presence of possible outliers

In [15]:
def despike(frame, columns, min_spike, max_spike):

    summed = frame[columns].sum(axis=1)

    # Perform the selections
    central = summed.iloc[1:-1]
    prior = summed.shift(-1).iloc[1:-1]
    posterior = summed.shift(1).iloc[1:-1]

    #
    spikyness = central - (prior + posterior) / 2
    floor, ceiling = spikyness.quantile((min_spike, max_spike))
    selected = central[spikyness.between(floor, ceiling)]

    return frame.loc[selected.index]


cutoff = 0.05
band_names = ["Blue", "Green", "Red", "NIRnarrow", "SWIR1", "SWIR2"]

intact_despiked = despike(intact_masked, band_names, cutoff, 1 - cutoff)
deforested_despiked = despike(deforested_masked, band_names, cutoff, 1 - cutoff)

In [16]:
(
    intact_masked.hvplot(x="time", y="NIRnarrow", color="k") *
    intact_despiked.hvplot(x="time", y="NIRnarrow", color="darkgray")
)

## The spikes as anomalies

These spikes are anolalies on the context where they appear, but they may or
may not be global outliers when compared with the full population.

In [17]:
spike_frame = intact_masked.iloc[1:-1].drop(index=intact_despiked.index)

In [18]:
(
    intact_masked.hvplot.scatter(
        x="Red", y="NIRnarrow", c="DOY", colormap="twilight"
        )
    *
    spike_frame.hvplot.scatter(
        x="Red", y="NIRnarrow", marker="x", s=90, color="red")
)

The isolated points far above and below the general population would be
global outliers, whereas the rest were outliers on their specific context.

## The signature of deforestation

In [19]:
deforested_despiked.hvplot(**tetracolor_kwargs)

In [20]:
intact_despiked.hvplot(**tetracolor_kwargs)

## With Indices

In [21]:
compare_frame = (
    pd.concat(
        {"deforested": deforested_despiked, "intact": intact_despiked},
        axis=0
    )
    .reset_index(names=["history", "index"]).drop(columns="index")
    )

In [23]:
compare_frame.hvplot.line(x="time", y="NDVI", by="history", color=["black", "limegreen"])