# ELUC Predictor

In [None]:
# %pip install zarr
# %pip install regionmask
# %pip install xgboost
# %pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
import regionmask
import xarray as xr

In [None]:
# Open the merged, aggregated dataset (1850-2022 according to the file name)
# from a zipped Zarr store; consolidated=True asks xarray to read the store's
# consolidated metadata.
dataset = xr.open_zarr("processed/merged_aggregated_dataset_1850_2022.zarr.zip", consolidated=True)

In [None]:
# Collapse the 'lat' and 'lon' dimensions into a single stacked 'latlon'
# dimension, so each grid cell is addressed by one index.
dataset = dataset.stack(latlon=('lat', 'lon'))

In [None]:
regionmask.__version__

In [None]:
# Compute the Natural Earth (v5.0.0, 1:110m) country region number for the
# dataset's grid cells and attach it as a `country` coordinate, so the data
# can later be selected by country.
# NOTE(review): the plan previously noted here (train on countries 0-80, test
# on the rest and on the last 15 years of data) no longer matches the split
# used further down, which selects a single country.
country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(dataset)
dataset = dataset.assign_coords(country=country_mask)

In [None]:
dataset

In [None]:
# Keep a handle on the Natural Earth 1:110m country regions (the same region
# set used to build the `country` coordinate) and display it.
countries = regionmask.defined_regions.natural_earth_v5_0_0.countries_110
countries

In [None]:
dataset.country

In [None]:
dataset.groupby(dataset.country).count()

In [None]:
np.unique(dataset.time)

## Remove NaN ELUC_diff

In [None]:
# Create a boolean mask based on NaN values for ELUC_diff
mask = dataset['ELUC_diff'].isnull()

In [None]:
# Filter the dataset based on the mask.
# NOTE(review): with drop=True, xarray only drops coordinate labels whose
# entries are all masked; any other masked entry is set to NaN rather than
# removed. If ELUC_diff has more than one dimension, NaNs can therefore
# survive this step — TODO confirm before training.
dataset = dataset.where(~mask, drop=True)

## Train and test sets
Create a dataset for a single country to keep it small and easy to train on  
Country: United Kingdom  
Train on data between 1850 and 2007  
Test on data after 2007 (i.e. 2008 to 2021)

In [None]:
countries_df = countries.to_dataframe()
countries_df

In [None]:
countries_df[countries_df.names == 'United Kingdom']

In [None]:
# Single-country split: keep only one country's grid cells, then split by year.
# A broader split (train on country numbers <= 80, test on the rest) was used
# previously and is superseded by the split below.
COUNTRY_ID = 143  # presumably the United Kingdom's region number — confirm against the countries_df lookup above
LAST_TRAIN_YEAR = 2007  # train on time <= this value, test on everything after it

# Filter by country once and reuse the result for both splits, instead of
# repeating the same `where` over the full dataset for train and test.
country_da = dataset.where(dataset.country == COUNTRY_ID, drop=True)
train_da = country_da.where(dataset.time <= LAST_TRAIN_YEAR, drop=True)
test_da = country_da.where(dataset.time > LAST_TRAIN_YEAR, drop=True)

In [None]:
train_da

In [None]:
test_da

In [None]:
# Every data variable is a candidate input feature, except the prediction
# target and the cell-area variables, which are deliberately left out.
non_features = (
    'ELUC_diff',       # this is the target
    'cell_area',       # do not use
    'cell_area_diff',  # do not use
)
features = list(train_da.data_vars)
for var_name in non_features:
    # list.remove raises ValueError if the variable is not in the dataset.
    features.remove(var_name)

In [None]:
features

In [None]:
# Extract features and target variable from the training dataset
X_train = train_da[features]
y_train = train_da['ELUC_diff']

In [None]:
# Extract features and target variable from the test dataset
X_test = test_da[features]
y_test = test_da['ELUC_diff']

In [None]:
X_test

In [None]:
y_test

In [None]:
np.isnan(y_test.values)

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Build the training inputs as a plain NumPy array, stacking the feature
# arrays in the order given by `features`.
# Read from `train_da` rather than from `X_train` itself: this cell rebinds
# `X_train` to an ndarray, so indexing `X_train[feature]` would fail if the
# cell were executed a second time.
# NOTE(review): np.column_stack only turns 1-D inputs into columns; 2-D inputs
# are concatenated side by side. If each variable is 2-D (e.g. time x latlon),
# the result is not (n_samples, n_features) — TODO confirm the variable shapes.
X_train = np.column_stack([train_da[feature].values for feature in features])

In [None]:
model = xgb.XGBRegressor()

In [None]:
%%time
model.fit(X_train, y_train)

In [None]:
# Build the test inputs as a plain NumPy array, stacking the feature arrays in
# the order given by `features` (the same order used for training).
# Read from `test_da` rather than from `X_test` itself: this cell rebinds
# `X_test` to an ndarray, so indexing `X_test[feature]` would fail if the cell
# were executed a second time.
# NOTE(review): np.column_stack only turns 1-D inputs into columns; 2-D inputs
# are concatenated side by side. If each variable is 2-D (e.g. time x latlon),
# the result is not (n_samples, n_features) — TODO confirm the variable shapes.
X_test = np.column_stack([test_da[feature].values for feature in features])

In [None]:
preds = model.predict(X_test)

In [None]:
preds

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Score the predictions against the held-out test targets.
# The `squared` keyword of mean_squared_error is deprecated since scikit-learn
# 1.4 and scheduled for removal in 1.6, so compute the default (squared) MSE
# and take its square root for the RMSE. This works on old and new versions.
# NOTE: for a single target this equals the former squared=False result; with
# several outputs the former call averaged per-output RMSEs instead.
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, preds)
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

In [None]:
preds.shape