# Evaluating classifier results

In [None]:
from evaluate_results import get_score_df, load_histo_file
from typing import List
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

Results files are loaded from paths specified in `config.yaml`. See `example_config.yaml` for an example of how this should be structured. The results files themselves can be downloaded from the shared results folder on Google Drive.

In [None]:
# Parse the notebook configuration; the file handle is only needed for parsing.
with open('config.yaml') as config_file:
    config = yaml.safe_load(config_file)

# Directory holding the results files, and the names of the files to load.
RESULTS_DIR = config["results_dir"]
RESULTS_PATHS = config["results_files"]

Load the results, expecting them to be in the same format as the files listed above.

In [None]:
# Map each configured results filename to the data loaded from RESULTS_DIR.
results = {}
for filename in RESULTS_PATHS:
    results[filename] = load_histo_file(os.path.join(RESULTS_DIR, filename))

Now we merge the visual interpretation data in. This requires you to have the data at the appropriate path. Again, this data file can be downloaded from Drive, this time from the labels directory.

Alternatively, you can create the relevant CSVs yourself from the xlsx files provided by the client by running the scripts `xlsx_to_csv.py` and `validation_test_split.py` successively.

**NB: We specifically use "validation_complete.csv" because the test data is intended to be reserved only for evaluation of the system at the end of the project.**

In [None]:
labels_df = pd.read_csv("label_CSVs/validation_complete.csv")

# Rows corresponding to unstocked forest (as of 2018) are dropped, because one
# cannot reliably determine tree cover from land use in those cases.
# The shape is printed before and after so the number of dropped rows is visible.
print(labels_df.shape)

natural_is_stocked = (
    labels_df["Sub-Categories if Naturally regenerated forest"]
    != "Temporarily unstocked forest"
)
planted_is_stocked = (
    labels_df["Sub-Categories if Planted forest"]
    != "Temporarily unstocked planted forest"
)
labels_df = labels_df.loc[natural_is_stocked & planted_is_stocked, :]

print(labels_df.shape)

# Inner-join every results frame with the labels on the plot identifier, so
# only plots present in both the results and the filtered labels remain.
results_w_labels = {}
for path, df in results.items():
    results_w_labels[path] = df.merge(
        labels_df, how="inner", left_on="plotID", right_on="pl_plotid"
    )


The block below prints the mean absolute errors, precision, recall, etc. of the different results files.

Precision and recall are calculated while treating the model as a binary forest loss or forest gain detector.

The F-5.0 column refers to an [$F_\beta$ score](https://en.wikipedia.org/wiki/F-score) with $\beta=5$. The score unifies precision and recall into a single metric. The value of $\beta$ determines how many times more important recall is compared to precision. If you wish to use a different beta, you can pass it to `get_score_df` using the keyword argument `beta`.

In [None]:
# Print the name of each results file followed by its score table.
for path in results_w_labels:
    print(path)
    scores = get_score_df(results_w_labels[path])
    print(scores)

## Saving the cleaned data

If you wish, you can run the block below to save the processed results files with the proper percentages as CSVs.

In [None]:
# Directory that receives one CSV per merged results frame.
CLEAN_DIR = "cleaned_results/"

# exist_ok avoids the check-then-create race and makes re-running the cell safe.
os.makedirs(CLEAN_DIR, exist_ok=True)

for path, df in results_w_labels.items():
    out_path = CLEAN_DIR + path
    # A results filename may contain a sub-directory; make sure it exists
    # under CLEAN_DIR before writing, otherwise to_csv raises.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path)

## Visualizing results

If you get a warning like `UserWarning: 38.3% of the points cannot be placed` with the swarmplots, just decrease the value of the `size` parameter passed to the `sns.swarmplot` function.

In [None]:
# Stack the merged frames of all results files into one combined frame.
merged_frames = list(results_w_labels.values())
total_df = pd.concat(merged_frames)

In [None]:
# Score table computed over the combined frame of all results files.
get_score_df(total_df)

## Plotting deforestation

First, we plot the predicted deforestation against the actual deforestation according to the visual interpreters. For clarity, hexas for which there was no deforestation according to the visual interpreters (less than 10% forest loss) are visualized separately as a histogram below.

You may want to adjust the definition of `df` below to match whatever set of results you wish to visualize.

In [None]:
# Frame to visualize; point `df` at another frame to plot a different set of
# results. Everything below reads from `df`, so the change takes effect.
df = total_df
fig, axes = plt.subplots(2, 1)
fig.set_size_inches(18.5, 8)

# Keep only the rows with at least 10% actual forest loss in each period;
# the remaining rows are shown separately as a histogram.
w_change_2000 = df.loc[df["% Forest Loss 2000-2010"] >= 10, :]
w_change_2010 = df.loc[df["% Forest Loss 2010-2018"] >= 10, :]

# Top panel: 2000-2010, actual loss on x against predicted loss on y.
sns.swarmplot(x=w_change_2000["% Forest Loss 2000-2010"], y=w_change_2000["deforestation 2000-2010"], size=2, ax=axes[0])
axes[0].set_ylabel("Predicted deforestation % 2000-2010")
axes[0].set_xlabel("Actual deforestation % 2000-2010")

# Bottom panel: the same comparison for 2010-2018.
sns.swarmplot(x=w_change_2010["% Forest Loss 2010-2018"], y=w_change_2010["deforestation 2010-2018"], size=2, ax=axes[1])
axes[1].set_ylabel("Predicted deforestation % 2010-2018")
axes[1].set_xlabel("Actual deforestation % 2010-2018")

fig.suptitle("Actual vs predicted deforestation for all hexas where there was deforestation", fontsize=16)
plt.show()

In [None]:
# Show how many rows fall in the "deforestation" subset for each period.
for changed_rows in (w_change_2000, w_change_2010):
    print(changed_rows.shape)

In [None]:
# Frame to visualize; point `df` at another frame to plot a different set of
# results. Everything below reads from `df`, so the change takes effect.
df = total_df
fig, axes = plt.subplots(2, 1)
fig.set_size_inches(18.5, 8)

# Rows with less than 10% actual forest loss are treated as "no deforestation".
w_no_change_2000 = df.loc[df["% Forest Loss 2000-2010"] < 10, :]
w_no_change_2010 = df.loc[df["% Forest Loss 2010-2018"] < 10, :]

# Horizontal histograms: predicted deforestation on y, count on x.
sns.histplot(y=w_no_change_2000["deforestation 2000-2010"], ax=axes[0], bins=20)
sns.histplot(y=w_no_change_2010["deforestation 2010-2018"], ax=axes[1], bins=20)

# Give both panels the same x-range, wide enough for either one. Comparing the
# (left, right) tuples with max() would be lexicographic and could pick the
# narrower range, so the two bounds are combined separately.
left_0, right_0 = axes[0].get_xlim()
left_1, right_1 = axes[1].get_xlim()
x_lim = (min(left_0, left_1), max(right_0, right_1))
y_lim = (0, 100)
for ax in axes:
    ax.set_xlim(x_lim)
    ax.set_ylim(y_lim)

axes[0].set_ylabel("Predicted deforestation % 2000-2010")
axes[1].set_ylabel("Predicted deforestation % 2010-2018")
fig.suptitle("Histogram of predictions for hexas where there was no actual deforestation", fontsize=16)
plt.show()

Both the swarm plot and histogram of one time period, in one figure:

In [None]:
# Period to visualize; used to build both the label and prediction column names.
period = "2010-2018"

# Frame to visualize; point `df` at another frame to plot a different set of
# results. Everything below reads from `df`, so the change takes effect.
df = total_df
fig, axes = plt.subplots(2, 1)
fig.set_size_inches(18, 9)

# Split on the 10% actual-forest-loss threshold: rows at or above it go to the
# swarm plot, rows below it go to the histogram.
actual_col = "% Forest Loss " + period
predicted_col = "deforestation " + period
w_no_change = df.loc[df[actual_col] < 10, :]
w_change = df.loc[df[actual_col] >= 10, :]

sns.swarmplot(x=w_change[actual_col], y=w_change[predicted_col], size=2, ax=axes[0])
sns.histplot(y=w_no_change[predicted_col], ax=axes[1], bins=50)

# Both panels show predicted percentages on y, so they share the 0-100 range.
y_lim = (0, 100)
axes[0].set_ylim(y_lim)
axes[1].set_ylim(y_lim)
axes[0].set_ylabel("Predicted deforestation % " + period, fontsize=14)
axes[1].set_ylabel("Predicted deforestation % " + period, fontsize=14)
axes[0].set_xlabel("Actual deforestation % " + period, fontsize=14)
axes[1].set_xlabel("Count", fontsize=14)
axes[0].set_title("Actual vs predicted deforestation for all hexas where there was deforestation", fontsize=16)
axes[1].set_title("Histogram of predictions for hexas where there was no actual deforestation", fontsize=16)
plt.tight_layout()
plt.show()

## Visualizing amount of forest cover in 2018

In [None]:
# Year suffix of the prediction column to plot ("forest <year>").
# NOTE(review): the section heading mentions 2018 while this is "2010", and the
# "% of Forest" label column carries no year -- confirm which year is intended.
year = "2010"

df = total_df
actual_forest = df["% of Forest"]
predicted_forest = df["forest " + year]

# One wide panel: actual forest cover on x against predicted cover on y.
fig, ax = plt.subplots(1, 1)
fig.set_size_inches(18.5, 8)
sns.swarmplot(x=actual_forest, y=predicted_forest, size=3, ax=ax)
ax.set_xlabel("Actual forest % " + year)
ax.set_ylabel("Predicted forest % " + year)
fig.suptitle("Actual vs predicted forest cover for all hexas", fontsize=16)
plt.show()