# Notebook - Evaluation Pipeline

### Import modules

In [1]:
import matplotlib.pyplot as plt
import json
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.utils.data as Data
from tqdm import tqdm
from sklearn import metrics
import torch
import omegaconf
from sklearn.preprocessing import StandardScaler
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
plt.rc("font", family = "Times New Roman")

In [2]:
# set seed
torch.cuda.manual_seed(1)
# Switch to GPU for Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Clear GPU caches
torch.cuda.empty_cache()
# print(torch.cuda.memory_summary())

### Parameters Setup

In [3]:
# === Load configuration file
config_path = "../config/config.yaml"
config = omegaconf.OmegaConf.load(config_path)
trained_model_folder_no = config.evaluation.folder_no
save_format = "svg"

In [4]:
batch_size = 20000 # 20000
split_ratio = 0.8
model_param_path = "../trained_model/Train-Result-"+ str(trained_model_folder_no)+ "/trained_model.pth"
source_data_path = "../trained_model/Train-Result-"+ str(trained_model_folder_no)+ "/feature_dataset.parquet"
region_dict_path = "../raw_dataset/region_segmentation.json"

## A. Data Preparation

### Load Dataset

In [5]:
# import dataset
dataset = pd.read_parquet(source_data_path)
# get x and y
x_indic = dataset[["lat","lon","month","sequence"]]

x = dataset[['fuel_load_cwdc', 'fuel_load_deadcrootc', 'fuel_wetness', 'fuel_temperature', 'climate_wind', 'climate_tbot', 'climate_rh2m', 'climate_rain', 'human_density', 'light_frequency',"burned_area_mom","burned_area_yoy","burned_area_mom_conv","burned_area_yoy_conv","lat","month"]]
y = dataset["burned_area"]
# convert to numpy
x_indic = np.array(x_indic)
x = np.array(x)
y = np.array(y).reshape(-1, 1)
print(x.shape, y.shape)

(96271, 16) (96271, 1)


### Normalization

In [6]:
scaler_X = StandardScaler()
scaler_y = StandardScaler()
x_stand = scaler_X.fit_transform(x)
y_stand = scaler_y.fit_transform(y)

### Train Test Split

In [7]:
train_x = x_stand[0:int(split_ratio*len(x_stand)),:]
train_y = y_stand[0:int(split_ratio*len(y_stand)),:]
test_x = x_stand[int(split_ratio*len(x_stand)):x_stand.shape[0],:]
test_y = y_stand[int(split_ratio*len(y_stand)):y_stand.shape[0],:]

### Dataloader

In [8]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data_inputs, data_targets):
        self.inputs = torch.tensor(data_inputs).float()
        self.label = torch.tensor(data_targets).float()

    def __getitem__(self, idx):
        x = self.inputs[idx]
        y = self.label[idx]
        return x, y

    def __len__(self):
        return len(self.label)

In [9]:
trainset = Dataset(train_x, train_y)
testset = Dataset(test_x, test_y)
TrainDataLoader = Data.DataLoader(trainset, batch_size=batch_size, shuffle=True, drop_last=False)
TestDataLoader = Data.DataLoader(testset, batch_size=batch_size, shuffle=False, drop_last=False)

In [10]:
for i,(v,j) in enumerate(TrainDataLoader):
    print(v.shape,j.shape)
    break

torch.Size([20000, 16]) torch.Size([20000, 1])


### Model Architecture

In [11]:
# Define the neural network
class network(nn.Module):
    def __init__(self):
        super(network, self).__init__()
        self.layer1 = nn.Linear(train_x.shape[1], train_x.shape[1]*2)
        self.layer2 = nn.Linear(train_x.shape[1]*2, 32)
        self.layer3 = nn.Linear(32, 16)
        self.layer4 = nn.Linear(16, 4)
        self.layer5 = nn.Linear(4, 1)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(0.15)

    def forward(self, x):
        x = self.layer1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.layer2(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.layer3(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.layer4(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.layer5(x)

        return x

In [12]:
net = network().to(device)
net

network(
  (layer1): Linear(in_features=16, out_features=32, bias=True)
  (layer2): Linear(in_features=32, out_features=32, bias=True)
  (layer3): Linear(in_features=32, out_features=16, bias=True)
  (layer4): Linear(in_features=16, out_features=4, bias=True)
  (layer5): Linear(in_features=4, out_features=1, bias=True)
  (activation): Tanh()
  (dropout): Dropout(p=0.15, inplace=False)
)

### Load pre-trained model

In [13]:
state_dict = torch.load(model_param_path)
net.load_state_dict(state_dict)

<All keys matched successfully>

### Generate model predictions

In [14]:
# prepare data
val_x = torch.tensor(x_stand).float()
val_y = torch.tensor(y_stand).float()
lat_indicator = x_indic[:,0]
lon_indicator = x_indic[:,1]
month_indicator = x_indic[:,2]
year_indicator = x_indic[:,3]

# create lists
month_list = []
y_true_list = []
y_pred_list = []
region_list = []
year_list = []

# validation mode
net.eval()
for idx in tqdm(range(val_x.shape[0])):
    month_list.append(month_indicator[idx])
    region_list.append([lat_indicator[idx],lon_indicator[idx]])
    year_list.append(year_indicator[idx])
    x_input = val_x[idx,:].to(device)
    y_input = val_y[idx,-1].to(device).detach().cpu().numpy().squeeze()
    y_true_list.append(y_input)
    y_pred = net(x_input)
    y_pred = y_pred.detach().cpu().numpy().squeeze()
    y_pred_list.append(y_pred)

100%|██████████| 96271/96271 [01:22<00:00, 1167.11it/s]


## B. Performance Evaluation

### Evaluation Panel Preparation

In [15]:
# Import JSON file
with open(region_dict_path,'r', encoding='UTF-8') as f:
    region_dict = json.load(f)

In [16]:
# Lambda Function to substitute Lat and Lon with Region names
def region_labeler(x):
    for region_name in region_dict.keys():
        if x in region_dict[region_name]:
            return region_name

# month_dict_convertor
month_dict = {1:"Jan",2:"Feb",3:"Mar",4:"Apr",5:"May",6:"Jun",7:"Jul",8:"Aug",9:"Sep",10:"Oct",11:"Nov",12:"Dec"}
# Lambda Function to substitute years seq to years 
year_dict_temp = list(set(year_list))
year_dict= {}
sep_lst = []
start_year = 2001
for i,v in enumerate(year_dict_temp):
    if (i+1) % 12 != 0:
        sep_lst.append(v)
    else:
        sep_lst.append(v)
        year_dict[start_year] = sep_lst
        sep_lst = []
        start_year += 1
def year_labeler(x):
    for yrs in year_dict.keys():
        if x in year_dict[yrs]:
            return yrs

In [17]:
# Reverse Standization
true_log_value = scaler_y.inverse_transform(np.array(y_true_list).reshape(-1,1))
pred_log_value = scaler_y.inverse_transform(np.array(y_pred_list).reshape(-1,1))

evaluator = pd.DataFrame(columns=["y_true","y_pred","month","region","year"])
# model predictions
evaluator["y_true"] = y_true_list
evaluator["y_pred"] = y_pred_list
# inverse standardization
evaluator["y_true_transform_log"] = true_log_value
evaluator["y_pred_transform_log"] = pred_log_value

evaluator["y_true_transform_origin"] = np.exp(evaluator["y_true_transform_log"])
evaluator["y_pred_transform_origin"] = np.exp(evaluator["y_pred_transform_log"])

# month labeling
evaluator["month"] = month_list
evaluator["month_label"] = evaluator["month"].apply(lambda x: month_dict[x])
# year labeling
evaluator["year"] = year_list
evaluator["year_label"] = evaluator["year"].apply(lambda x: year_labeler(x))
# region labeling
evaluator["region"] = region_list
evaluator["region_label"] = evaluator["region"].apply(lambda x: region_labeler(x))
# drop na values
evaluator.dropna(inplace=True)

### 1. Accuracy, Correlation Scores

In [18]:
print("-> Accuracy Score (R-sqaured): ", metrics.r2_score(y_true_list,y_pred_list))
print("-> Pearson Correlation Score (Rho): ", np.corrcoef(y_true_list,y_pred_list)[0][1])

-> Accuracy Score (R-sqaured):  0.777916531379282
-> Pearson Correlation Score (Rho):  0.8832095152035105


### 2. Evaluation by Years

In [19]:
evaluation_year = evaluator.copy()
evaluation_year = evaluation_year[['y_true', 'y_pred','y_true_transform_log',
                                   'y_pred_transform_log', 'y_true_transform_origin',
                                   'y_pred_transform_origin', 'year_label']]
group_data = pd.DataFrame(evaluation_year.groupby(by="year_label").sum())

In [20]:
plt.plot(group_data["y_true_transform_log"], label = "Ln(True Burned Area)", color = "darkred")
plt.plot(group_data["y_pred_transform_log"], label = "Ln(Predicted Burned Area)")
plt.fill_between(group_data.index, group_data["y_true_transform_log"].tolist(), group_data["y_pred_transform_log"].tolist(),
                 facecolor = "dimgray",alpha=.5, linewidth=0)
plt.legend(loc = "best",fontsize = 10)
plt.xlabel("Year")
plt.ylabel("Burned Area")
plt.grid(False)
plt.savefig("Evaluation_Burned Area_By_Year.svg",bbox_inches="tight")
plt.close()

### 3. Evaluation by Months

In [21]:
evaluation_month = evaluator.copy()
del evaluation_month["region"]
group_data = pd.DataFrame(evaluation_month.groupby(by="month").sum()).sort_values(by="month")
group_data["month_label"] = group_data.index
group_data["month_label"] = group_data["month_label"].apply(lambda x: month_dict[x])
group_data.set_index("month_label",inplace=True,drop=True)

In [22]:
plt.plot(group_data["y_true_transform_log"], label = "Ln(True Burned Area)", color = "darkred")
plt.plot(group_data["y_pred_transform_log"], label = "Ln(Predicted Burned Area)")
plt.fill_between(group_data.index, group_data["y_true_transform_log"].tolist(), group_data["y_pred_transform_log"].tolist(),
                 facecolor = "dimgray", alpha=.5, linewidth=0)
plt.title("Performance Evaluation by Month (Log Transformed)",fontsize = 11)
plt.grid(linestyle = '--', linewidth = 0.5, axis='x')
plt.xlabel("Month")
plt.ylabel("Burned Area")
plt.legend(loc = "best",fontsize = 10)
plt.grid(False)
plt.savefig("Evaluation_Burned Area_By_Month."+save_format,bbox_inches="tight")
plt.close()

### 4. Evaluation By Region & Year

In [23]:
evaluation_region = evaluator.copy()
group_data = pd.DataFrame(evaluation_region.groupby(by=["year","region_label"]).sum())
group_data["region"] = group_data.index
group_data["region"] = group_data["region"].apply(lambda x: x[1])
region_num_mapper = {}
for i,v in enumerate(set(group_data["region"].tolist())):
    region_num_mapper[v] = i
group_data["region_label"] = group_data["region"].apply(lambda x: region_num_mapper[x])

In [24]:
x_line = np.linspace(group_data["y_pred_transform_log"].min()-12, group_data["y_pred_transform_log"].max(), 100)
y_line = x_line
plt.plot(x_line, y_line, 'r--',c="k")
# Iterations to draw scatter points
region_list = region_num_mapper.keys()
for i, reg in enumerate(region_list):
    temp_df = group_data.loc[group_data["region"] == reg]
    plt.scatter(temp_df["y_true_transform_log"],temp_df["y_pred_transform_log"], marker="+",s=450)
plt.xlim(np.min(x_line), np.max(x_line))
plt.ylim(np.min(y_line), np.max(y_line))
plt.xlabel("True Ln(Burned Area)")
plt.ylabel("CNN-Fire Ln(Burned Area)")
plt.legend(labels = group_data["region"], bbox_to_anchor=(1.05, 0), loc = 3, borderaxespad = 0)
plt.grid(False)
plt.savefig("Evaluation_Burned Area_Compare."+save_format,bbox_inches="tight")
plt.close()

### 5. Evaluation by Region

In [25]:
evaluation_region = evaluator.copy()
group_data = pd.DataFrame(evaluation_region.groupby(by=["region_label"]).sum())
group_data["region"] = group_data.index
group_data["region"] = group_data["region"].apply(lambda x: x[1])
region_num_mapper = {}
for i,v in enumerate(set(group_data["region"].tolist())):
    region_num_mapper[v] = i
group_data["region_label"] = group_data["region"].apply(lambda x: region_num_mapper[x])

In [26]:
labels = group_data.index.tolist()
true_area = group_data["y_true_transform_log"].tolist()
pred_area = group_data["y_pred_transform_log"].tolist()
x = np.arange(len(group_data.index.tolist()))  # the label locations
width = 0.35  # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, true_area, width, label="Ln(True Burned Area)", color = "steelblue")
rects2 = ax.bar(x + width/2, pred_area, width, label="Ln(Predicted Burned Area)", color = "dimgrey")
ax.set_ylabel('Burned Area')
ax.set_xlabel('Region')
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize = 8)
plt.grid(False)
ax.legend()
plt.savefig("Evaluation_Burned Area_By_Region."+save_format,bbox_inches="tight")
plt.close()

### 6. Multi - comparison

In [28]:
dnn_df = pd.read_parquet("../trained_model/Train-Result-"+ str(trained_model_folder_no)+ "/baseline_evaluation_dnn.parquet")
dt_df = pd.read_parquet("../trained_model/Train-Result-"+ str(trained_model_folder_no)+ "/baseline_evaluation_dt.parquet")
kn_df = pd.read_parquet("../trained_model/Train-Result-"+ str(trained_model_folder_no)+ "/baseline_evaluation_kn.parquet")
lasso_df = pd.read_parquet("../trained_model/Train-Result-"+ str(trained_model_folder_no)+ "/baseline_evaluation_lasso.parquet")
lr_df = pd.read_parquet("../trained_model/Train-Result-"+ str(trained_model_folder_no)+ "/baseline_evaluation_lr.parquet")
ridge_df = pd.read_parquet("../trained_model/Train-Result-"+ str(trained_model_folder_no)+ "/baseline_evaluation_ridge.parquet")
dnn_df["month_label"] = dnn_df["month"].apply(lambda x: month_dict[x])
dt_df["month_label"] = dt_df["month"].apply(lambda x: month_dict[x])
kn_df["month_label"] = kn_df["month"].apply(lambda x: month_dict[x])
lasso_df["month_label"] = lasso_df["month"].apply(lambda x: month_dict[x])
lr_df["month_label"] = lr_df["month"].apply(lambda x: month_dict[x])
ridge_df["month_label"] = ridge_df["month"].apply(lambda x: month_dict[x])

In [29]:
order_dict = {"Jan":1,"Feb":2,"Mar":3,"Apr":4,"May":5,"Jun":6,"Jul":7,"Aug":8,"Sep":9,"Oct":10,"Nov":11,"Dec":12}

In [30]:
dnn_df = dnn_df.groupby("month_label").agg({"y_pred_transform_log":"sum"})
dnn_df["month"] = dnn_df.index
dnn_df["month_order"] = dnn_df["month"].apply(lambda x: order_dict[x])
dnn_df.sort_values(by="month_order",inplace=True)

dt_df = dt_df.groupby("month_label").agg({"y_pred_transform_log":"sum"})
dt_df["month"] = dt_df.index
dt_df["month_order"] = dt_df["month"].apply(lambda x: order_dict[x])
dt_df.sort_values(by="month_order",inplace=True)

kn_df = kn_df.groupby("month_label").agg({"y_pred_transform_log":"sum"})
kn_df["month"] = kn_df.index
kn_df["month_order"] = kn_df["month"].apply(lambda x: order_dict[x])
kn_df.sort_values(by="month_order",inplace=True)

lasso_df = lasso_df.groupby("month_label").agg({"y_pred_transform_log":"sum"})
lasso_df["month"] = lasso_df.index
lasso_df["month_order"] = lasso_df["month"].apply(lambda x: order_dict[x])
lasso_df.sort_values(by="month_order",inplace=True)

lr_df = lr_df.groupby("month_label").agg({"y_pred_transform_log":"sum"})
lr_df["month"] = lr_df.index
lr_df["month_order"] = lr_df["month"].apply(lambda x: order_dict[x])
lr_df.sort_values(by="month_order",inplace=True)

ridge_df = ridge_df.groupby("month_label").agg({"y_pred_transform_log":"sum"})
ridge_df["month"] = ridge_df.index
ridge_df["month_order"] = ridge_df["month"].apply(lambda x: order_dict[x])
ridge_df.sort_values(by="month_order",inplace=True)

evaluation_month = evaluator.copy()
del evaluation_month["region"]
cnn_df = pd.DataFrame(evaluation_month.groupby(by="month").sum()).sort_values(by="month")
cnn_df["month_label"] = cnn_df.index
cnn_df["month_label"] = cnn_df["month_label"].apply(lambda x: month_dict[x])
cnn_df.set_index("month_label",inplace=True,drop=True)
cnn_df["month"] = cnn_df.index
cnn_df["month_order"] = cnn_df["month"].apply(lambda x: order_dict[x])
cnn_df.sort_values(by="month_order",inplace=True)

In [31]:
consolidated_df = pd.DataFrame(columns = ["month","CNN","DNN","Decision Tree","K-Nearst Neighbour","Lasso Regression","Linear Regression","Ridge Regression"])
consolidated_df["month"] = ridge_df["month"].tolist()
consolidated_df["CNN"] = cnn_df["y_pred_transform_log"].tolist()
consolidated_df["DNN"] = dnn_df["y_pred_transform_log"].tolist()
consolidated_df["Decision Tree"] = dt_df["y_pred_transform_log"].tolist()
consolidated_df["K-Nearst Neighbour"] = kn_df["y_pred_transform_log"].tolist()
consolidated_df["Lasso Regression"] = lasso_df["y_pred_transform_log"].tolist()
consolidated_df["Linear Regression"] = lr_df["y_pred_transform_log"].tolist()
consolidated_df["Ridge Regression"] = ridge_df["y_pred_transform_log"].tolist()
consolidated_df["Observation"] = cnn_df["y_true_transform_log"].tolist()
consolidated_df.set_index("month",drop=True,inplace=True)
consolidated_df["min"] = consolidated_df.min(axis = 1) - 1000
consolidated_df["max"] = consolidated_df.max(axis = 1) + 1000
consolidated_df

Unnamed: 0_level_0,CNN,DNN,Decision Tree,K-Nearst Neighbour,Lasso Regression,Linear Regression,Ridge Regression,Observation,min,max
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Jan,57353.6875,56107.152344,56674.975101,57024.217368,56404.678937,56757.560422,56757.407344,56480.835938,55107.152344,58353.6875
Feb,61983.910156,60469.253906,61824.603005,61882.528961,61051.345306,61404.092557,61403.993042,61746.945312,59469.253906,62983.910156
Mar,66666.734375,64935.128906,65976.370875,66286.443475,65540.165779,65076.59905,65076.860723,65834.835938,63935.128906,67666.734375
Apr,83709.726562,80392.15625,83465.723914,85135.438493,82948.819176,80124.284941,80124.99753,85079.421875,79124.284941,86135.438493
May,117086.65625,112716.765625,119403.455403,121513.14188,118345.209865,112951.872609,112953.39298,121706.242188,111716.765625,122706.242188
Jun,142666.296875,139524.28125,142083.302348,145749.831486,143766.369728,138137.842198,138139.626002,145657.953125,137137.842198,146749.831486
Jul,158057.203125,155658.453125,156601.565484,158235.213219,160932.686404,155749.696733,155751.23,158230.265625,154658.453125,161932.686404
Aug,155047.84375,152942.9375,152119.00398,155729.58255,161333.531349,155040.045253,155041.367051,153158.96875,151119.00398,162333.531349
Sep,118442.03125,117052.09375,114481.142481,117592.112016,125177.207219,119346.099104,119346.45046,114596.53125,113481.142481,126177.207219
Oct,75782.976562,75066.757812,73603.056049,74878.640942,80117.006821,76511.789185,76511.421531,73385.15625,72385.15625,81117.006821


In [32]:
plt.plot(consolidated_df["Observation"], label = "Observation")
plt.scatter(x = consolidated_df.index, y = consolidated_df["DNN"], marker = ".", s = 100, label = "NN-Fire", color = "k")
plt.scatter(x = consolidated_df.index, y = consolidated_df["CNN"], marker = ".", s = 100, label = "CNN-Fire", color = "cadetblue")
plt.scatter(x = consolidated_df.index, y = consolidated_df["Decision Tree"], marker = ".", s = 100, label = "Decision Tree-Fire")
plt.scatter(x = consolidated_df.index, y = consolidated_df["K-Nearst Neighbour"], marker = ".", s = 100, label = "KNN-Fire")
plt.scatter(x = consolidated_df.index, y = consolidated_df["Lasso Regression"], marker = ".", s = 100, label = "Lasso Regression-Fire")
plt.scatter(x = consolidated_df.index, y = consolidated_df["Linear Regression"], marker = ".", s = 100, label = "Linear Regression-Fire")
plt.scatter(x = consolidated_df.index, y = consolidated_df["Ridge Regression"], marker = ".", s = 100, label = "Ridge Regression-Fire")

plt.fill_between(consolidated_df.index, consolidated_df["min"], consolidated_df["max"], facecolor = "darkgray",alpha=.5, linewidth=0)
plt.legend(bbox_to_anchor=(1.05, 0.28), loc = 3, borderaxespad = 0)
plt.xlabel("Month")
plt.ylabel("Burned Area")
plt.grid(False)
plt.savefig("Evaluation_Burned Area_Baseline_Compare."+save_format,bbox_inches="tight")
plt.close()