# ELUC Predictor
Train an XGBoost model that can predict ELUC

In [None]:
# %pip install zarr
# %pip install regionmask
# %pip install xgboost
# %pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
import regionmask
import xarray as xr
import matplotlib.pyplot as plt

In [None]:
ds = xr.open_zarr("../data/gcb/processed/merged_aggregated_dataset_1850_2022.zarr.zip", consolidated=True)

# Data prep
## Shift ELUC by 1 year
According to the [ELUC Data Exploration](../data/gcb/ELUC_data_exploration.ipynb) notebook, ELUC is off by one year when compared to the change in land use, so it is shifted by one year here.

In [None]:
# Move ELUC and ELUC_diff one step later along `time` so they line up with the
# land-use change. `shift` is used instead of `roll` because `roll` wraps
# around: it would copy the final year's values into the first year. With
# `shift` the first year becomes NaN and is masked out by the NaN ELUC_diff
# filter further down.
ds['ELUC'] = ds['ELUC'].shift(time=1)
ds['ELUC_diff'] = ds['ELUC_diff'].shift(time=1)

## Add countries

In [None]:
ds = ds.stack(latlon=('lat', 'lon'))

In [None]:
regionmask.__version__

In [None]:
# Build a mask from the Natural Earth v5.0.0 `countries_110` regions for the
# grid in `ds` and attach it as a `country` coordinate, so the data can be
# filtered by country further down (the train/test split itself happens later).
country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(ds)
ds = ds.assign_coords(country=country_mask)

In [None]:
ds

In [None]:
countries = regionmask.defined_regions.natural_earth_v5_0_0.countries_110
countries

In [None]:
ds.country

In [None]:
np.unique(ds.time)

## Remove NaN ELUC_diff

In [None]:
# Create a boolean mask based on NaN values for ELUC_diff
mask = ds['ELUC_diff'].isnull()

In [None]:
# Filter the dataset based on the mask
ds = ds.where(~mask, drop=True)

## Train and test sets
Create a dataset for a single country to keep it small and easy to train on  
Country: United Kingdom  
Train on data between 1850 and 2007  
Test on data after 2007 (i.e. 2008 to 2021)

In [None]:
countries_df = countries.to_dataframe()
countries_df

In [None]:
countries_df[countries_df.names == 'United Kingdom']

In [None]:
# Region number of the United Kingdom (see the `countries_df` lookup above).
UK_COUNTRY_ID = 143
# Last year (inclusive) of the training period; later years form the test set.
LAST_TRAIN_YEAR = 2007

# NOTE: a multi-country split (train on countries <= 80, test on the rest)
# was considered but is not used; only the United Kingdom is kept here.

# Select the country's grid cells once, then split that subset by time.
uk_ds = ds.where(ds.country == UK_COUNTRY_ID, drop=True)
train_da = uk_ds.where(uk_ds.time <= LAST_TRAIN_YEAR, drop=True)
test_da = uk_ds.where(uk_ds.time > LAST_TRAIN_YEAR, drop=True)

In [None]:
train_da

In [None]:
test_da

## Features

In [None]:
# Every data variable is a candidate feature except the prediction targets
# (ELUC, ELUC_diff) and the cell-area variables, which must not be used.
# Filtering against a set keeps the original variable order and, unlike
# `list.remove`, does not fail if one of the excluded names is absent.
EXCLUDED_VARS = {'ELUC_diff', 'ELUC', 'cell_area', 'cell_area_diff'}
features = [var for var in train_da.data_vars if var not in EXCLUDED_VARS]

In [None]:
features

In [None]:
# Extract features and target variable from the training dataset
X_train = train_da[features]
# y_train = train_da['ELUC_diff']
y_train = train_da['ELUC']

In [None]:
# Extract features and target variable from the test dataset
X_test = test_da[features]
# y_test = test_da['ELUC_diff']
y_test = test_da['ELUC']

In [None]:
X_test

In [None]:
y_test

In [None]:
def convert_to_dataframe(da):
    """Flatten an object exposing ``to_dataframe()`` into a plain DataFrame.

    The index levels of the resulting frame are renamed, in order, to
    ``time``, ``i_lat`` and ``i_lon`` and then turned into regular columns,
    leaving a default integer index.

    Args:
        da: object with a ``to_dataframe()`` method whose result has exactly
            three index levels (assumed to be time, latitude, longitude).

    Returns:
        A DataFrame with ``time``, ``i_lat`` and ``i_lon`` as leading columns.
    """
    level_names = ['time', 'i_lat', 'i_lon']
    return da.to_dataframe().rename_axis(index=level_names).reset_index()

In [None]:
X_test_df = convert_to_dataframe(X_test)

In [None]:
X_test_df.tail(10)

In [None]:
y_test_df = convert_to_dataframe(y_test)
y_test_df.tail()

In [None]:
np.isnan(y_test.values)

# XGBoost (xarray)

In [None]:
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error
# from sklearn.metrics import mean_absolute_error

In [None]:
# X_train = np.column_stack([X_train[feature].values for feature in features])

In [None]:
# model = xgb.XGBRegressor()

In [None]:
# %%time
# model.fit(X_train, y_train)

In [None]:
# X_test = np.column_stack([X_test[feature].values for feature in features])

In [None]:
# preds = model.predict(X_test)
# preds

In [None]:

# mse = mean_squared_error(y_test, preds, squared=True)
# rmse = mean_squared_error(y_test, preds, squared=False)
# mae = mean_absolute_error(y_test, preds)
# print(f"MSE: {mse}")
# print(f"RMSE: {rmse}")
# print(f"MAE: {mae}")

In [None]:
# preds.shape

# XGBoost Train (df)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
predictor = xgb.XGBRegressor()

In [None]:
X_train_df = convert_to_dataframe(X_train)
y_train_df = convert_to_dataframe(y_train)

In [None]:
%%time
predictor.fit(X_train_df[features], y_train_df["ELUC"])

In [None]:
preds = predictor.predict(X_test_df[features])

In [None]:
preds.shape

In [None]:
# y_test_df = y_test_df["ELUC"]

In [None]:
# Error metrics of the predictions against the true ELUC on the test set.
# RMSE is computed as the square root of MSE rather than through the
# `squared` argument of `mean_squared_error`, which is deprecated in recent
# scikit-learn releases.
y_true = y_test_df["ELUC"]
mse = mean_squared_error(y_true, preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, preds)
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

In [None]:
# Per-feature 'weight' importance scores reported by the trained booster.
feature_important = predictor.get_booster().get_score(importance_type='weight')

# One row per feature, highest score first.
feature_importance_df = pd.DataFrame(
    data=list(feature_important.values()),
    index=list(feature_important.keys()),
    columns=["score"],
).sort_values(by="score", ascending=False)

# Horizontal bar chart of (at most) the 40 highest-scoring features.
top_features_df = feature_importance_df.nlargest(40, columns="score")
top_features_df.plot(kind='barh', figsize=(20, 10))

# Evaluate
## Aggregate at the country level

In [None]:
# Names of the land-use type columns.
LAND_FEATURES = [
    'c3ann', 'c3nfx', 'c3per', 'c4ann', 'c4per', 'pastr',
    'primf', 'primn', 'range', 'secdf', 'secdn', 'urban',
]
# Matching `<name>_diff` column for every land-use type, in the same order.
LAND_DIFF_FEATURES = [f"{land_type}_diff" for land_type in LAND_FEATURES]

In [None]:
# Mean of every land-use change column per year over the test rows,
# ordered by year, with `time` back as a regular column.
agg_df = (
    X_test_df.groupby(["time"])[LAND_DIFF_FEATURES]
    .mean()
    .sort_values(by=["time"])
    .reset_index()
)

In [None]:
agg_df[["time"] + LAND_DIFF_FEATURES].plot(title="Mean LUC in UK", x='time')
plt.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
# Work on a copy so that adding the prediction column to `uk_y_df` does not
# silently modify `y_test_df` as well.
uk_y_df = y_test_df.copy()

In [None]:
uk_preds = predictor.predict(X_test_df[features])

In [None]:
uk_y_df["ELUC_Pred"] = uk_preds

In [None]:
# Yearly mean of the true and the predicted ELUC, ordered by year,
# with `time` back as a regular column.
agg_y_df = (
    uk_y_df.groupby(["time"])[["ELUC", "ELUC_Pred"]]
    .mean()
    .sort_values(by=["time"])
    .reset_index()
)

In [None]:
agg_y_df.plot(x="time", y=["ELUC", "ELUC_Pred"], title="ELUC vs Predicted ELUC in UK")

## Single point

In [None]:
# Max ELUC
y_test_df["ELUC"].max()

In [None]:
y_test_df[y_test_df["ELUC"] == y_test_df["ELUC"].max()]

In [None]:
wales_y_df = y_test_df[(y_test_df.i_lat==52.875) & 
                       (y_test_df.i_lon==-4.625)]
wales_y_df

In [None]:
wales_X_df = X_test_df[(X_test_df.i_lat==52.875) & 
                       (X_test_df.i_lon==-4.625)]
wales_X_df

In [None]:
wales_X_df[["time"] + LAND_DIFF_FEATURES].plot(x='time')
plt.legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
wales_y_df[['ELUC', 'time']].plot(x="time", y="ELUC")

In [None]:
# Predict using only the feature columns the model was trained on;
# `wales_X_df` also carries non-feature columns such as `time`, `i_lat`
# and `i_lon`.
wales_preds = predictor.predict(wales_X_df[features])

In [None]:
wales_preds.shape

In [None]:
# `wales_y_df` is a filtered slice of `y_test_df`; `assign` builds a new frame
# with the prediction column instead of writing into that slice in place.
wales_y_df = wales_y_df.assign(ELUC_Pred=wales_preds)

In [None]:
wales_y_df

In [None]:
wales_y_df.plot(x="time", y=["ELUC", "ELUC_Pred"])