# Prediction Baseline Methods
This notebook implements four statistical baseline methods for predicting tree cover loss and CO2.

For statistical methods, there are four baseline methods to predict the values:
- Baseline 1: Global mean
- Baseline 2: Local mean
- Baseline 3: Latest value
- Baseline 4: Mixture of local mean and latest value

The "mixture of local mean and latest value" baseline is included because I found that the global mean performs poorly.

- Author: Xinshuang Liu
- Email: xil235@ucsd.edu

In [3]:
from dataset import TreeCoverLossDataset, DriverTypeDataset
import numpy as np
import math
import warnings

warnings.filterwarnings("ignore")

In [4]:
def eval_prediction(y_pred, y_test):
    """Return the mean squared error between predictions and targets.

    Args:
        y_pred: Predicted values; used as-is in the subtraction.
        y_test: Ground-truth values; converted with ``np.array`` first.

    Returns:
        The mean of the element-wise squared differences.
    """
    residual = y_pred - np.array(y_test)
    return np.mean(residual ** 2)

def eval_gp_model(dataset, mode, model, params):
    """Evaluate a baseline on every split of ``dataset`` and return an RMSE.

    For each ``(train_data, test_data, extra)`` triple, column 0 is passed to
    the model as the input and the target column as the value to predict.
    The per-split MSEs are averaged with equal weight (regardless of how many
    test rows each split has) and the square root of that average is returned.

    Args:
        dataset: Iterable of ``(train_data, test_data, extra)`` triples; both
            data items must support ``len()`` and ``[:, column]`` indexing.
        mode: ``"tree_loss"`` selects target column 1; any other value
            selects column 2 (the notebook passes ``"co2"``).
        model: Callable ``model(x_train, y_train, x_test, params)`` returning
            predictions for ``x_test``.
        params: Extra argument forwarded unchanged to ``model``.

    Returns:
        float: Square root of the mean of the per-split MSEs.

    Raises:
        ValueError: If every split is skipped because its train or test part
            is empty, so there is nothing to average.
    """
    target_pos = 1 if mode == "tree_loss" else 2
    mse_list = []
    for train_data, test_data, _ in dataset:
        # Splits without both a train and a test part cannot be scored.
        if len(train_data) == 0 or len(test_data) == 0:
            continue
        x_train = train_data[:, 0]
        y_train = train_data[:, target_pos]
        x_test = test_data[:, 0]
        y_test = test_data[:, target_pos]
        y_pred = model(x_train, y_train, x_test, params)
        mse_list.append(eval_prediction(y_pred, y_test))
    if not mse_list:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError(
            "eval_gp_model: no split with non-empty train and test data to evaluate"
        )
    return math.sqrt(sum(mse_list) / len(mse_list))

## Baseline Method 1: Global Mean

In [5]:
def baseline1(x_train, y_train, x_test, params):
    """Predict the same precomputed global mean for every test point.

    Args:
        x_train: Unused.
        y_train: Unused.
        x_test: Test inputs; only used as the shape template for the output.
        params: Mapping that must contain the key ``'global_mean'``.

    Returns:
        An array shaped like ``x_test`` with every entry equal to the global mean.
    """
    fill_value = params['global_mean']
    template = np.zeros_like(x_test)
    return template + fill_value

In [12]:
def get_target_mean(dataset, pos):
    """Return the mean of column ``pos`` pooled over all training splits.

    Args:
        dataset: Iterable of ``(train_data, test_data, extra)`` triples. Only
            ``train_data`` is read; it is indexed as ``train_data[:, pos]``.
        pos: Column index of the target inside ``train_data``.

    Returns:
        float: Sum of all readable target values divided by their count.

    Raises:
        ValueError: If no training value could be read from any split.
    """
    sum_value = 0.0
    count_value = 0
    for train_data, _test_data, _ in dataset:
        try:
            if len(train_data) == 0:
                continue
            column = train_data[:, pos]
            split_sum = float(column.sum())
            split_count = column.shape[0]
        except (IndexError, TypeError, AttributeError, ValueError):
            # Best-effort: skip splits that cannot be indexed or summed,
            # without swallowing KeyboardInterrupt/SystemExit.
            continue
        # Update both accumulators together so a failure halfway through a
        # split can never add to the sum without adding to the count.
        sum_value += split_sum
        count_value += split_count
    if count_value == 0:
        raise ValueError(
            "get_target_mean: no training values found for column %r" % (pos,)
        )
    return sum_value / count_value


In [7]:
# Baseline 1 on the by-region file. The global mean is recomputed for each
# target column (1 for tree loss, 2 for co2) before it is evaluated.
dataset = TreeCoverLossDataset("TreeCoverLoss_2001-2020_ByRegion.csv", split_train_test=True)

# NOTE: `mse` holds the value returned by eval_gp_model, which is reported as RMSE.
global_mean = get_target_mean(dataset, 1)
mse = eval_gp_model(dataset, "tree_loss", baseline1, {'global_mean': global_mean})
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020_ByRegion, tree loss prediction')
global_mean = get_target_mean(dataset, 2)
mse = eval_gp_model(dataset, "co2", baseline1, {'global_mean': global_mean})
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020_ByRegion, co2 prediction')

RMSE: 470781.58047442767, TreeCoverLoss_2001-2020_ByRegion, tree loss prediction
RMSE: 188727088.85231075, TreeCoverLoss_2001-2020_ByRegion, co2 prediction


In [13]:
# Baseline 1 on the primary-forest file. The global mean is recomputed for
# each target column (1 for tree loss, 2 for co2) before it is evaluated.
dataset = TreeCoverLossDataset("TreeCoverLoss_2001-2020 _InPrimaryForest.csv", split_train_test=True)

# NOTE: `mse` holds the value returned by eval_gp_model, which is reported as RMSE.
global_mean = get_target_mean(dataset, 1)
mse = eval_gp_model(dataset, "tree_loss", baseline1, {'global_mean': global_mean})
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020 _InPrimaryForest, tree loss prediction')
global_mean = get_target_mean(dataset, 2)
mse = eval_gp_model(dataset, "co2", baseline1, {'global_mean': global_mean})
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020 _InPrimaryForest, co2 prediction')

RMSE: 170088.5484400669, TreeCoverLoss_2001-2020 _InPrimaryForest, tree loss prediction
RMSE: 114121367.39829747, TreeCoverLoss_2001-2020 _InPrimaryForest, co2 prediction


## Baseline Method 2: Local Mean

In [18]:
def baseline2(x_train, y_train, x_test, params):
    """Predict the mean of this split's training targets for every test point.

    Args:
        x_train: Unused.
        y_train: Training targets; must provide a ``.mean()`` method.
        x_test: Test inputs; only used as the shape template for the output.
        params: Unused.

    Returns:
        An array shaped like ``x_test`` filled with ``float(y_train.mean())``.
    """
    fill_value = float(y_train.mean())
    template = np.zeros_like(x_test)
    return template + fill_value

In [19]:
# Baseline 2 on the by-region file; baseline2 ignores `params`, so None is passed.
dataset = TreeCoverLossDataset("TreeCoverLoss_2001-2020_ByRegion.csv", split_train_test=True)

# NOTE: `mse` holds the value returned by eval_gp_model, which is reported as RMSE.
mse = eval_gp_model(dataset, "tree_loss", baseline2, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020_ByRegion, tree loss prediction')
mse = eval_gp_model(dataset, "co2", baseline2, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020_ByRegion, co2 prediction')

RMSE: 207933.20525438452, TreeCoverLoss_2001-2020_ByRegion, tree loss prediction
RMSE: 83411307.6336327, TreeCoverLoss_2001-2020_ByRegion, co2 prediction


In [20]:
# Baseline 2 on the primary-forest file; baseline2 ignores `params`, so None is passed.
dataset = TreeCoverLossDataset("TreeCoverLoss_2001-2020 _InPrimaryForest.csv", split_train_test=True)

# NOTE: `mse` holds the value returned by eval_gp_model, which is reported as RMSE.
mse = eval_gp_model(dataset, "tree_loss", baseline2, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020 _InPrimaryForest, tree loss prediction')
mse = eval_gp_model(dataset, "co2", baseline2, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020 _InPrimaryForest, co2 prediction')

RMSE: 78740.57695681795, TreeCoverLoss_2001-2020 _InPrimaryForest, tree loss prediction
RMSE: 52346951.530306235, TreeCoverLoss_2001-2020 _InPrimaryForest, co2 prediction


## Baseline Method 3: Latest Value

In [21]:
def baseline3(x_train, y_train, x_test, params):
    """Predict the last training target of this split for every test point.

    Args:
        x_train: Unused.
        y_train: Training targets; the last element after flattening is used.
            Presumably the rows are in chronological order so that this is
            the most recent observation -- confirm against the dataset class.
        x_test: Test inputs; only used as the shape template for the output.
        params: Unused.

    Returns:
        An array shaped like ``x_test`` filled with the last value of ``y_train``.
    """
    # reshape(-1) flattens both torch tensors and NumPy arrays, whereas
    # ndarray.view(-1) treats -1 as a dtype and raises.
    latest = y_train.reshape(-1)[-1]
    y_pred = np.zeros_like(x_test) + float(latest)
    return y_pred

In [22]:
# Baseline 3 on the by-region file; baseline3 ignores `params`, so None is passed.
dataset = TreeCoverLossDataset("TreeCoverLoss_2001-2020_ByRegion.csv", split_train_test=True)

# NOTE: `mse` holds the value returned by eval_gp_model, which is reported as RMSE.
mse = eval_gp_model(dataset, "tree_loss", baseline3, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020_ByRegion, tree loss prediction')
mse = eval_gp_model(dataset, "co2", baseline3, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020_ByRegion, co2 prediction')

RMSE: 233968.2737682843, TreeCoverLoss_2001-2020_ByRegion, tree loss prediction
RMSE: 78785990.06862164, TreeCoverLoss_2001-2020_ByRegion, co2 prediction


In [24]:
# Baseline 3 on the primary-forest file; baseline3 ignores `params`, so None is passed.
dataset = TreeCoverLossDataset("TreeCoverLoss_2001-2020 _InPrimaryForest.csv", split_train_test=True)

# NOTE: `mse` holds the value returned by eval_gp_model, which is reported as RMSE.
mse = eval_gp_model(dataset, "tree_loss", baseline3, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020 _InPrimaryForest, tree loss prediction')
mse = eval_gp_model(dataset, "co2", baseline3, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020 _InPrimaryForest, co2 prediction')

RMSE: 77365.35585083769, TreeCoverLoss_2001-2020 _InPrimaryForest, tree loss prediction
RMSE: 51592066.54038596, TreeCoverLoss_2001-2020 _InPrimaryForest, co2 prediction


## Baseline Method 4: Mixture of Local Mean and Latest Value

In [25]:
def baseline4(x_train, y_train, x_test, params):
    """Predict the average of the local-mean and latest-value baselines.

    All arguments are forwarded unchanged to ``baseline2`` and ``baseline3``.

    Returns:
        The element-wise mean of the two baseline predictions.
    """
    mean_pred = baseline2(x_train, y_train, x_test, params)
    latest_pred = baseline3(x_train, y_train, x_test, params)
    return (mean_pred + latest_pred) / 2

In [26]:
# Baseline 4 on the by-region file; `params` is passed as None.
dataset = TreeCoverLossDataset("TreeCoverLoss_2001-2020_ByRegion.csv", split_train_test=True)

# NOTE: `mse` holds the value returned by eval_gp_model, which is reported as RMSE.
mse = eval_gp_model(dataset, "tree_loss", baseline4, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020_ByRegion, tree loss prediction')
mse = eval_gp_model(dataset, "co2", baseline4, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020_ByRegion, co2 prediction')

RMSE: 219458.53290041632, TreeCoverLoss_2001-2020_ByRegion, tree loss prediction
RMSE: 80387962.65496935, TreeCoverLoss_2001-2020_ByRegion, co2 prediction


In [27]:
# Baseline 4 on the primary-forest file; `params` is passed as None.
dataset = TreeCoverLossDataset("TreeCoverLoss_2001-2020 _InPrimaryForest.csv", split_train_test=True)

# NOTE: `mse` holds the value returned by eval_gp_model, which is reported as RMSE.
mse = eval_gp_model(dataset, "tree_loss", baseline4, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020 _InPrimaryForest, tree loss prediction')
mse = eval_gp_model(dataset, "co2", baseline4, None)
print(f'RMSE: {mse}, TreeCoverLoss_2001-2020 _InPrimaryForest, co2 prediction')

RMSE: 76739.78856327367, TreeCoverLoss_2001-2020 _InPrimaryForest, tree loss prediction
RMSE: 50891653.39616664, TreeCoverLoss_2001-2020 _InPrimaryForest, co2 prediction
