This notebook produces the confusion matrix for the data and country you have specified in config.yaml.

In [None]:
from pathlib import Path
import pandas as pd

# from scipy.spatial import distance
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from IPython.display import display, HTML

from common import get_comparison, load_results, get_config, exclude_unstocked

In [None]:
def add_labels(original_df, cf_type):
    """Return a copy of ``original_df`` with ``label_g`` and ``label_p`` added.

    Every value of the ``loss 2010-2018 g`` and ``loss 2010-2018 p`` columns
    is passed through ``get_label`` with ``cf_type`` as the labelling mode.
    The input frame itself is left untouched.
    """
    # (new label column, source loss column); presumably g = gold, p = predicted.
    column_pairs = (
        ("label_g", "loss 2010-2018 g"),
        ("label_p", "loss 2010-2018 p"),
    )
    labelled = original_df.copy()
    for label_column, loss_column in column_pairs:
        labelled[label_column] = labelled[loss_column].apply(
            get_label, args=(cf_type,)
        )
    return labelled

def get_label(x, type="binary", threshold=5):
    """Map a numeric loss value to a categorical label.

    Args:
        x: Loss value to classify.
        type: ``"binary"`` yields ``"yes"``/``"no"``; any other value selects
            the four-level scheme ``"zero"``/``"low"``/``"moderate"``/``"high"``.
            The name shadows the builtin but is kept so existing callers
            that pass it by keyword keep working.
        threshold: Smallest value that counts as loss at all.

    Returns:
        The label string for ``x``.
    """
    if type == "binary":
        # Phrase the test positively: NaN makes every comparison False, so a
        # missing value now lands in "no" (consistent with the multi-level
        # branch, where NaN falls through to "zero") instead of being
        # reported as deforestation.
        # An exact zero is never a loss, even when threshold <= 0.
        if x >= threshold and x != 0:
            return "yes"
        return "no"

    if x >= 25:
        return "high"
    if x >= 15:
        return "moderate"
    if x >= threshold:
        return "low"
    return "zero"

In [None]:
def get_predictions_and_results():
    """Load predictions and gold labels and return their joined comparison.

    Predictions come from ``load_results()``; the gold labels are read from
    ``label_CSVs/validation_complete.csv``. Both are handed to
    ``get_comparison`` (called with ``"inner"``), and the ``plotID`` and
    ``pl_plotid`` columns of the result are cast to ``int``.
    """
    gold_csv = Path("label_CSVs") / "validation_complete.csv"

    predicted = load_results()
    gold = pd.read_csv(gold_csv)

    comparison = get_comparison(predicted, gold, "inner")
    for id_column in ("plotID", "pl_plotid"):
        comparison[id_column] = comparison[id_column].astype(int)

    return comparison

def get_confusion_matrix(original_df, cf_type="binary"):
    """Compute the confusion matrix of gold labels against predicted labels.

    Args:
        original_df: Frame accepted by ``add_labels``; it is not modified.
        cf_type: ``"binary"`` for the yes/no scheme, any other value for the
            four-level scheme.

    Returns:
        The array returned by ``sklearn.metrics.confusion_matrix`` with
        ``label_g`` as rows and ``label_p`` as columns. Classes are ordered
        ``no, yes`` (binary) or ``zero, low, moderate, high`` (otherwise).
    """
    # add_labels already works on its own copy, so no extra copy is needed.
    df = add_labels(original_df, cf_type)

    # Pin the class order explicitly. Without `labels`, scikit-learn sorts
    # the classes alphabetically (high, low, moderate, zero) and drops
    # classes that do not occur, so the matrix would not line up with the
    # tick labels used when plotting.
    if cf_type == "binary":
        label_order = ["no", "yes"]
    else:
        label_order = ["zero", "low", "moderate", "high"]

    return confusion_matrix(df["label_g"], df["label_p"], labels=label_order)

In [None]:
df = get_predictions_and_results()
# Filtered copy used for the "unstocked excluded" confusion matrix.
df_exclude_unstocked = exclude_unstocked(df)
cf_type = "binary"

df = add_labels(df, cf_type)

# Outcome counts for the "yes" class.
tp = df[(df["label_p"] == "yes") & (df["label_g"] == "yes")].shape[0]
fp = df[(df["label_p"] == "yes") & (df["label_g"] == "no")].shape[0]
fn = df[(df["label_p"] == "no") & (df["label_g"] == "yes")].shape[0]

# A ratio is undefined when its denominator is zero (no predicted positives
# or no actual positives); report NaN instead of raising ZeroDivisionError.
precision = tp / (tp + fp) if tp + fp else float("nan")
recall = tp / (tp + fn) if tp + fn else float("nan")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [None]:
# False negatives: rows labelled "yes" in label_g but "no" in label_p.
df.loc[(df["label_g"] == "yes") & (df["label_p"] == "no")]

In [None]:
# Baseline: always predict loss.
# Every row is predicted "yes", so each actual "yes" is a true positive,
# each actual "no" is a false positive, and there are no false negatives.
tp = df[(df["label_g"] == "yes")].shape[0]
fp = df[(df["label_g"] == "no")].shape[0]
fn = 0

# Guard the ratios: an empty frame or one without any actual "yes" rows
# would otherwise raise ZeroDivisionError.
precision = tp / (tp + fp) if tp + fp else float("nan")
recall = tp / (tp + fn) if tp + fn else float("nan")
print("Baseline (always predict deforestation)")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [None]:
# Load the notebook configuration (presumably config.yaml; confirm in common.get_config)
# and pick the country name that is shown in the confusion-matrix titles.
config = get_config()
country_name = config["confusion_matrix_country"]

In [None]:
# Confusion matrix for the frame returned by exclude_unstocked().
cm = get_confusion_matrix(df_exclude_unstocked, cf_type)

# Human-readable class names for the tick labels.
labels = (
    ["no deforestation", "deforestation"]
    if cf_type == "binary"
    else ["zero", "low", "moderate", "high"]
)

display(HTML("<h2>Unstocked excluded</h2>"))
sns.set(font_scale=1.4)
fig, ax = plt.subplots(figsize=(6, 4))

blue_cmap = sns.color_palette("light:b", as_cmap=True)
g = sns.heatmap(cm, annot=True, fmt="g", ax=ax, cmap=blue_cmap)
# Vertically centre the y tick labels on their cells.
g.set_yticklabels(labels=g.get_yticklabels(), va="center")

# Axis captions and title
ax.set(
    xlabel="Predicted labels",
    ylabel="True labels",
    title=f"Confusion Matrix for {country_name}",
)

# Swap the numeric tick labels for the class names.
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)

In [None]:
# Confusion matrix for the full comparison frame (no unstocked filtering).
cm = get_confusion_matrix(df, cf_type)

# Human-readable class names for the tick labels.
if cf_type == "binary":
    labels = ["no deforestation", "deforestation"]
else:
    labels = ["zero", "low", "moderate", "high"]

# This cell plots the unfiltered `df`, so the heading must not claim that
# unstocked plots were excluded (that is the previous cell).
display(HTML("<h2>Unstocked included</h2>"))
sns.set(font_scale=1.4)
fig, ax = plt.subplots(figsize=(6, 4))

g = sns.heatmap(
    cm, annot=True, fmt="g", ax=ax, cmap=sns.color_palette("light:b", as_cmap=True)
)
# Vertically centre the y tick labels on their cells.
g.set_yticklabels(labels=g.get_yticklabels(), va="center")

# labels, title and ticks
ax.set_xlabel("Predicted labels")
ax.set_ylabel("True labels")
ax.set_title(f"Confusion Matrix for {country_name}")

ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)