# Final Site Analysis of the Minicubes

This notebook analyses the minicubes by site. The idea is to measure trends and quantiles at a specific site. The analysis uses the minicubes that were created in the previous notebook.


In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from sklearn.base import clone
from sklearn.linear_model import LinearRegression, HuberRegressor, RANSACRegressor, TheilSenRegressor

In [None]:
# Parameters
testsite = 2                      # site number shown in the figure titles
threshhold = 0.15                 # deadwood share below which a pixel counts as healthy (0.15 = 15 %)
dead = 0.2                        # deadwood share above which a pixel counts as dead (0.2 = 20 %)
growing_season_start = "05-20"    # MM-DD, combined with each year to build the season window
growing_season_end = "08-31"      # MM-DD, combined with each year to build the season window
years = list(range(2018, 2022))   # 2018, 2019, 2020, 2021

In [None]:
# Scale the fractional thresholds by 10000 so they can be compared with the
# deadwood layers (presumably stored as fraction * 10000 -- TODO confirm).
threshvalue = threshhold * 10000
deadvalue = dead * 10000

# Find all entries in data/FinalSites. os.listdir returns them in arbitrary
# order, so sort to make the processing order reproducible between runs.
dirs = sorted(os.listdir('data/FinalSites'))

In [None]:
def _growing_seasons(ndvi):
    """Return `ndvi` restricted to the growing-season window of every year in `years`."""
    return xr.concat(
        [
            ndvi.sel(time=slice(f'{year}-{growing_season_start}', f'{year}-{growing_season_end}'))
            for year in years
        ],
        dim='time',
    )


def _stack(frames):
    """Concatenate the per-site frames once and turn the index levels into columns."""
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0).reset_index()


# Collect the per-site frames in lists and concatenate once after the loop;
# growing a DataFrame with pd.concat inside the loop copies all previous rows
# on every iteration.
frames_healthy, frames_healthy_season = [], []
frames_dead, frames_dead_season = [], []

for site_dir in dirs:
    # The context manager closes the zarr store once the site has been processed.
    with xr.open_dataset(f'data/FinalSites/{site_dir}', engine='zarr') as ds:
        deadwood_layers = [ds[f'deadwood_{year}'] for year in years]

        # A pixel is healthy (dead) only if it is below (above) the threshold
        # in every year. The masks are plain numpy arrays, as before.
        mask_healthy = np.logical_and.reduce([(layer < threshvalue).values for layer in deadwood_layers])
        mask_dead = np.logical_and.reduce([(layer > deadvalue).values for layer in deadwood_layers])

        ndvi_healthy = ds['ndvi'].where(mask_healthy)
        ndvi_dead = ds['ndvi'].where(mask_dead)

        # to_dataframe() materialises the values, so it must run while the
        # dataset is still open.
        frames_healthy.append(ndvi_healthy.to_dataframe().dropna())
        frames_dead.append(ndvi_dead.to_dataframe().dropna())
        frames_healthy_season.append(_growing_seasons(ndvi_healthy).to_dataframe().dropna())
        frames_dead_season.append(_growing_seasons(ndvi_dead).to_dataframe().dropna())

# finish the dataframes:
df_healthy = _stack(frames_healthy)
df_healthy_season = _stack(frames_healthy_season)
df_dead = _stack(frames_dead)
df_dead_season = _stack(frames_dead_season)

In [None]:
# NDVI per acquisition date, aggregated over all selected pixels.
ndvi_healthy = df_healthy.groupby('time')['ndvi']
ndvi_dead = df_dead.groupby('time')['ndvi']

# ':g' avoids float noise such as '15.000000000000002%' in the legend.
healthy_label = f'deadwood < {threshhold * 100:g}%'
dead_label = f'deadwood > {dead * 100:g}%'

fig, ax = plt.subplots(figsize=(20, 6))
# The line style has to be passed as `style=`; a positional argument to
# Series.plot.line is taken as `x`, so the style would not be applied.
ndvi_healthy.mean().plot(ax=ax, style='b--', label=f'{healthy_label} (mean)')
ndvi_healthy.quantile(0.9).plot(ax=ax, style='b', label=f'{healthy_label} (0.9 quantile)')
ndvi_dead.mean().plot(ax=ax, style='m--', label=f'{dead_label} (mean)')
ndvi_dead.quantile(0.9).plot(ax=ax, style='m', label=f'{dead_label} (0.9 quantile)')
ax.set_title(f'NDVI values of Site {testsite:02} accumulated over x/y coordinates')
ax.set_xlabel('Time')
ax.set_ylabel('NDVI')
ax.legend(loc='lower right', title='state of the trees', ncol=2)
ax.set_ylim(0, 1)

# Shade and annotate the growing season of every year.
for year in years:
    season_start = pd.to_datetime(f'{year}-{growing_season_start}')
    season_end = pd.to_datetime(f'{year}-{growing_season_end}')
    ax.axvspan(season_start, season_end, color='green', alpha=0.1)
    ax.text(season_start, 0.25, f'growing season\n{year}', rotation=30)

Next, the regression models need to be selected and the growing-season data needs to be prepared for them.

In [None]:
# Growing-season observations ordered by time. The frames get their own names
# instead of `healthy` / `dead`: `dead` is the deadwood threshold parameter
# used by the threshold and plotting cells, and overwriting it with a
# DataFrame breaks those cells when they are re-run.
healthy_season = df_healthy_season.sort_values('time').reset_index(drop=True)
dead_season = df_dead_season.sort_values('time').reset_index(drop=True)

# Time as an (n, 1) column and NDVI as a flat vector of length n.
vec_time_healthy = healthy_season['time'].values.reshape(-1, 1)
vec_healthy = healthy_season['ndvi'].values
vec_time_dead = dead_season['time'].values.reshape(-1, 1)
vec_dead = dead_season['ndvi'].values

Outlier detection using the interquartile range (IQR):

In [None]:
def outlier_IQR_remover(vec_time, vec, k=1.5):
    """Drop samples whose value lies outside the IQR fences.

    A sample is kept when ``Q1 - k * IQR <= value <= Q3 + k * IQR``. The
    fences are inclusive, so a constant series (IQR == 0) is kept completely
    instead of being removed entirely.

    Parameters
    ----------
    vec_time : array-like
        Time stamps; the first axis must have the same length as `vec`.
    vec : array-like
        One-dimensional values the fences are computed from.
    k : float, default 1.5
        Multiplier of the interquartile range that sets the fence distance.

    Returns
    -------
    tuple of numpy.ndarray
        ``(vec_time, vec)`` restricted to the samples inside the fences.
    """
    vec_time = np.asarray(vec_time)
    vec = np.asarray(vec)
    if vec.size == 0:
        # Percentiles of an empty array are undefined; nothing to filter.
        return vec_time, vec

    q1, q3 = np.percentile(vec, [25, 75])
    iqr = q3 - q1
    mask = (vec >= q1 - k * iqr) & (vec <= q3 + k * iqr)
    return vec_time[mask], vec[mask]

In [None]:
# vec_time_healthy, vec_healthy = outlier_IQR_remover(vec_time_healthy, vec_healthy)
# vec_time_dead, vec_dead = outlier_IQR_remover(vec_time_dead, vec_dead)

The regression models need a numeric time axis, so convert the timestamps to whole days since the first observation:

In [None]:
# Whole days elapsed since the earliest observation of each group, kept as an
# (n, 1) integer column.
healthy_elapsed = vec_time_healthy - vec_time_healthy.min()
vec_time_healthy_numeric = healthy_elapsed.astype('timedelta64[D]').astype(int).reshape(-1, 1)

dead_elapsed = vec_time_dead - vec_time_dead.min()
vec_time_dead_numeric = dead_elapsed.astype('timedelta64[D]').astype(int).reshape(-1, 1)

In [None]:
# Candidate regressors. RANSAC and Theil-Sen draw random subsets, so they get
# a fixed seed to make the reported scores reproducible.
methodlist = [
    LinearRegression(),
    HuberRegressor(),
    RANSACRegressor(random_state=42),
    TheilSenRegressor(random_state=42),
]
methodnames = ['Linear Regression', 'Huber Regressor', 'RANSAC Regressor', 'Theil-Sen Regressor']

for method, methodname in zip(methodlist, methodnames):
    # clone() yields an independent, unfitted estimator per data set, so the
    # healthy fit is not overwritten by the fit on the dead pixels.
    model_healthy = clone(method).fit(vec_time_healthy_numeric, vec_healthy)
    model_dead = clone(method).fit(vec_time_dead_numeric, vec_dead)
    R_healthy = model_healthy.score(vec_time_healthy_numeric, vec_healthy)
    R_dead = model_dead.score(vec_time_dead_numeric, vec_dead)
    print(f'{methodname}:\nHealthy: R^2 = {R_healthy}\nDead: R^2 = {R_dead}\n')

In [None]:
# Pick the regression method used for the final trend plot.
# method = LinearRegression() # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
# method = TheilSenRegressor() # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TheilSenRegressor.html
# random_state makes the random sampling of RANSAC reproducible.
method = RANSACRegressor(random_state=42) # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RANSACRegressor.html
# method = HuberRegressor() # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.HuberRegressor.html

# Fit an independent copy per group. Assigning `method` to both names would
# make them the same object, and the fit on the dead pixels would replace the
# fit on the healthy ones.
model_healthy = clone(method).fit(vec_time_healthy_numeric, vec_healthy)
R_healthy = model_healthy.score(vec_time_healthy_numeric, vec_healthy)
model_dead = clone(method).fit(vec_time_dead_numeric, vec_dead)
R_dead = model_dead.score(vec_time_dead_numeric, vec_dead)

print(f'R^2 healthy: {R_healthy}')
print(f'R^2 dead: {R_dead}')

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20, 5))

# One entry per panel: axes, title, marker style, time stamps, numeric time,
# NDVI values, fitted model and its R^2 score.
panels = [
    (ax[0], 'Healthy trees', 'bo', vec_time_healthy, vec_time_healthy_numeric, vec_healthy, model_healthy, R_healthy),
    (ax[1], 'Deadwood trees', 'mo', vec_time_dead, vec_time_dead_numeric, vec_dead, model_dead, R_dead),
]

for panel_ax, title, marker, times, times_numeric, values, model, r_squared in panels:
    panel_ax.plot(times, values, marker, alpha=0.5)
    panel_ax.plot(times, model.predict(times_numeric), 'r--')
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Time')
    panel_ax.set_ylabel('NDVI')
    panel_ax.set_ylim(0, 1)
    # Name the trend line after the estimator that was actually fitted
    # instead of a fixed 'Linear Regression' label.
    panel_ax.legend(['NDVI', type(model).__name__], loc='lower right')
    panel_ax.text(0.7, 0.3, f'R^2: {r_squared:.3f}',
                  transform=panel_ax.transAxes, fontsize=12,
                  verticalalignment='top',
                  bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))

fig.suptitle(f'NDVI trend of site {testsite:02} using the data only from the growing season');