In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
# %load_ext cudf.pandas
# import pandas as pd
# print(pd)


In [None]:
# Optimizations
# GDAL optimizations
import multiprocessing as mp
import os

# Leave two cores free for the rest of the system, but never go below one
# thread: on 1- or 2-core machines `cpu_count - 2` would be 0 or negative.
cpu_count: int = mp.cpu_count()
num_cores: int = max(1, cpu_count - 2)
# GDAL configuration options, passed through environment variables.
os.environ["GDAL_NUM_THREADS"] = f"{num_cores}"
os.environ["GDAL_CACHEMAX"] = "1024"


## Libraries

In [None]:
# Imports
from pathlib import Path
import numpy as np
import riskmapjnr as rmj


## Set user parameters

In [None]:
# Project name; used below as the name of the project sub-folder inside the
# data folder.
project_name = "test"


In [None]:
# Three reference years: years[0]-years[1] is the calibration period and
# years[1]-years[2] the validation period (see the period dictionaries below).
years = [2015, 2020, 2024]
# NOTE(review): not referenced by any cell visible in this notebook.
tree_cover_threshold = 10
# Must appear in the forest cover change file name; values noted by the
# author: "gfc", "tmf".
forest_source = "gfc"


In [None]:
# Coarse grid cell sizes passed to the validation as `csize_coarse_grid`.
coarse_grid_cell_size_pixels: list[int] = [300]
# Words used to select the model folders inside the project folder.
models_to_compare: list[str] = ["rmj_bm", "rmj_mw", "far_rf", "far_icar"]
# Words used to select prediction/deforestation-rate files by file name.
periods: list[str] = ["calibration", "validation"]


In [None]:
# coarse_grid_cell_size_pixels = [50, 100]
# models_to_compare = ["rmj_bm", "rmj_mw", "rf", "icar", "glm", "user"]


## Connect folders

In [None]:
# The data folder lives next to the notebook's working directory
# (parent of the current working directory / "data").
root_folder: Path = Path.cwd().parent
downloads_folder: Path = root_folder / "data"
downloads_folder.mkdir(parents=True, exist_ok=True)


In [None]:
# Folder layout of the project inside the data folder.
project_folder = downloads_folder / project_name
processed_data_folder = project_folder / "data"
evaluation_folder = project_folder / "evaluation"

# Create the folders up front; existing folders are left untouched.
for _folder in (project_folder, processed_data_folder, evaluation_folder):
    _folder.mkdir(parents=True, exist_ok=True)


## Select prediction files

In [None]:
def list_folders(directory):
    """
    List the sub-folders located directly inside a directory.

    Parameters:
        directory (str or Path): The directory whose immediate sub-folders
            are listed.

    Returns:
        list: ``Path`` objects (not plain names) of the sub-folders, sorted so
              that the result does not depend on the file-system listing order.
              If the directory does not exist or cannot be read, a message is
              printed and an empty list is returned.
    """
    try:
        # Path.iterdir() yields entries in arbitrary order, hence the sort.
        return sorted(
            entry for entry in Path(directory).iterdir() if entry.is_dir()
        )
    except FileNotFoundError:
        print(f"The directory {directory} does not exist.")
        return []
    except OSError as e:
        # Other file-system problems, e.g. the path is a file or is unreadable.
        print(f"An error occurred: {e}")
        return []


In [None]:
def filter_folders(input_folders, filter_words, exclude_words=None):
    """
    Select folders by name using include and exclude words.

    The comparison is case-insensitive and looks for the words as substrings
    of the folder name (``folder.name``).

    Parameters:
        input_folders (list): Folder paths (objects with a ``name`` attribute).
        filter_words (list): A folder is kept only if at least one of these
            words occurs in its name.
        exclude_words (list, optional): A folder is dropped if any of these
            words occurs in its name. Defaults to None (nothing excluded).
    Returns:
        list: The folders that passed both checks, in their original order.
    """
    wanted = [word.lower() for word in filter_words]
    unwanted = [word.lower() for word in (exclude_words or [])]

    kept = []
    for folder in input_folders:
        lowered_name = folder.name.lower()
        if not any(word in lowered_name for word in wanted):
            continue
        if any(word in lowered_name for word in unwanted):
            continue
        kept.append(folder)
    return kept


In [None]:
def list_files_by_extension(folder_path, file_extensions, recursive=False):
    """
    List all files with the given extensions inside a folder.

    Parameters:
    folder_path (str or Path): The folder in which to search for files.
    file_extensions (list of str): Extensions to look for, including the dot
        (e.g., ['.shp', '.tif']). Matching is case-insensitive.
    recursive (bool): Whether sub-folders are searched as well.
    Returns:
    list: File paths (as strings) with one of the extensions. Entries of each
        folder are visited in sorted order, so the result is reproducible
        between runs. If the path is not a directory, or a file-system error
        occurs, a message is printed and the files found so far are returned.
    """
    folder_path = Path(folder_path)
    # Normalise the extensions once instead of on every comparison.
    wanted_suffixes = {ext.lower() for ext in file_extensions}
    matching_files = []
    try:
        if not folder_path.is_dir():
            print(f"The provided path '{folder_path}' is not a directory.")
            return matching_files

        # iterdir() yields entries in arbitrary order; sort for determinism.
        for entry in sorted(folder_path.iterdir()):
            if entry.is_file():
                if entry.suffix.lower() in wanted_suffixes:
                    matching_files.append(str(entry))
            elif recursive and entry.is_dir():
                matching_files.extend(
                    list_files_by_extension(entry, file_extensions, recursive)
                )
    except OSError as e:
        print(f"An error occurred: {e}")
    return matching_files


In [None]:
def filter_files(input_files, filter_words, exclude_words=None, include_all=True):
    """
    Filters a list of files based on include and exclude words.

    Only the file name (last path component) is inspected, case-insensitively,
    and words are matched as substrings.

    Parameters:
        input_files (list): List of file paths to be filtered.
        filter_words (list): Words looked for in the file names for inclusion.
        exclude_words (list, optional): A file is dropped if any of these words
            occurs in its name. Defaults to None (nothing excluded).
        include_all (bool, optional): If True, all filter words must be present
            in the file name. If False, at least one of them must be present.
            Defaults to True.
    Returns:
        list: Filtered list of files, in their original order.
    """
    filter_words = [word.lower() for word in filter_words]
    exclude_words = [word.lower() for word in (exclude_words or [])]

    # all(): every include word is required; any(): a single one is enough.
    # With no filter words, all() keeps every file and any() keeps none.
    matches_includes = all if include_all else any

    filtered_files = []
    for file in input_files:
        name = Path(file).name.lower()
        if not matches_includes(word in name for word in filter_words):
            continue
        if any(word in name for word in exclude_words):
            continue
        filtered_files.append(file)
    return filtered_files


In [None]:
def filter_out_ipynb_checkpoints(input_files):
    """
    Filters out Jupyter checkpoint copies and the combined ``indices_all`` table.

    A file is removed when one of its path components is '.ipynb_checkpoints',
    when one of its path components is 'indices_all', or when its name without
    extension is 'indices_all' (e.g. 'indices_all.csv').

    Parameters:
        input_files (list): List of file paths to be filtered.
    Returns:
        list: Filtered list of files, in their original order.
    """
    filtered_files = []
    for file in input_files:
        path = Path(file)
        if ".ipynb_checkpoints" in path.parts:
            continue
        # Path.parts holds whole components, so "indices_all.csv" has to be
        # recognised through its stem.
        if path.stem == "indices_all" or "indices_all" in path.parts:
            continue
        filtered_files.append(file)
    return filtered_files


In [None]:
# Find which of the requested models have a folder inside the project folder.
directory_path = project_folder
folders = list_folders(directory_path)
# Folders whose name contains "data" are excluded (this also covers "data_raw").
available_models = filter_folders(folders, models_to_compare, ["data", "data_raw"])
print("Models_available:", available_models)


In [None]:
folders = list_folders(project_folder)
available_models = filter_folders(folders, models_to_compare, ["data", "data_raw"])


def _prediction_files(model_folder):
    """Return the .tif files of a model folder whose name mentions a period."""
    tif_files = list_files_by_extension(model_folder, [".tif"], True)
    return filter_files(tif_files, periods, None, False)


def _defrate_files(model_folder):
    """Return the 'defrate' .csv files of a model folder that mention a period."""
    csv_files = list_files_by_extension(model_folder, [".csv"], True)
    with_period = filter_files(csv_files, periods, None, False)
    with_defrate = filter_files(with_period, ["defrate"])
    return filter_out_ipynb_checkpoints(with_defrate)


# One list of files per model folder; a later cell flattens both lists.
available_prediction_files = [
    _prediction_files(model_folder) for model_folder in available_models
]
available_defrate_files = [
    _defrate_files(model_folder) for model_folder in available_models
]


In [None]:
# Flatten the per-model lists of paths into one flat list each.
# Re-running this cell without re-running the previous one fails with a
# TypeError, because the lists then already contain strings instead of lists.
available_prediction_files = sum(available_prediction_files, [])
available_defrate_files = sum(available_defrate_files, [])


In [None]:
# Lookup tables keyed by (identifier, subsystem, period); each value is a list
# of paths. dict1 is filled from the prediction files and dict2 from the
# deforestation-rate files further down in this cell.
dict1 = {}
dict2 = {}


# Function to extract the subsystem/model part from path
def extract_subsystem(path, project_dir=None):
    """Return the path component that directly follows the project folder.

    The expected layout is ``.../{project}/{subsystem}/...``, where the
    subsystem is a model folder such as ``rmj_bm``.

    Parameters:
        path (str or Path): Path of a prediction or deforestation-rate file.
        project_dir (str, optional): Name of the project folder to look for.
            When omitted, the module-level ``project_name`` is used if it is
            defined, otherwise "test" (the previously hard-coded value).

    Returns:
        str or None: The component right after the first occurrence of the
        project folder, or None if the folder is absent or is the last
        component of the path.
    """
    if project_dir is None:
        # Follow the notebook's project setting instead of a fixed folder name.
        project_dir = globals().get("project_name", "test")

    path_parts = Path(path).parts
    try:
        project_index = path_parts.index(project_dir)
    except ValueError:
        # The project folder is not part of this path.
        return None

    if project_index + 1 < len(path_parts):
        return path_parts[project_index + 1]
    return None


# Function to extract the period (validation/calibration/historical) from path
def extract_period(path):
    """Return the first path component that names a period.

    Only whole path components are compared, so a period that merely appears
    inside a file name (e.g. ``prob_validation.tif``) is not detected.

    Parameters:
        path (str or Path): Path of a prediction or deforestation-rate file.

    Returns:
        str or None: "validation", "calibration" or "historical", whichever
        component comes first in the path, or None if none is present.
    """
    known_periods = ("validation", "calibration", "historical")
    return next(
        (part for part in Path(path).parts if part in known_periods), None
    )


# Group the prediction rasters by (identifier, subsystem, period).
for path in available_prediction_files:
    # The identifier is the file name without extension and without a leading
    # "prob_"; names that lack the prefix are kept unchanged.
    identifier = Path(path).stem.removeprefix("prob_")
    key = (identifier, extract_subsystem(path), extract_period(path))
    dict1.setdefault(key, []).append(path)

# Group the deforestation-rate tables with the same kind of key.
for path in available_defrate_files:
    # Strip "defrate_cat_" only when it is really there; slicing by length
    # would cut 12 arbitrary characters off a name without the prefix.
    identifier = Path(path).stem.removeprefix("defrate_cat_")
    key = (identifier, extract_subsystem(path), extract_period(path))
    dict2.setdefault(key, []).append(path)


# Final matching dictionary: (identifier, subsystem, period) -> (tiff, csv).
# Keys missing from either lookup table are skipped. When several files share
# a key, each assignment overwrites the previous one, so the pair that remains
# is the last raster together with the last table.
models_dict = {}
for key, tiff_paths in dict1.items():
    csv_paths = dict2.get(key)
    if not tiff_paths or not csv_paths:
        continue
    models_dict[key] = (tiff_paths[-1], csv_paths[-1])


In [None]:
# Report every matched pair of prediction raster and deforestation-rate table.
print("Final matching dictionary with all attributes:")
for (identifier, subsystem, period), (tiff_path, csv_path) in models_dict.items():
    print(f"Identifier '{identifier}', Subsystem '{subsystem}', Period '{period}':")
    print(f"  TIFF: {tiff_path}")
    print(f"  CSV:  {csv_path}")
    print()


## Select forest cover change file

In [None]:
# List the raster files located directly in the processed data folder
# (sub-folders are not searched: `recursive` keeps its default of False).
input_raster_files = list_files_by_extension(processed_data_folder, [".tiff", ".tif"])


In [None]:
# The forest cover change raster must carry all of these words in its file
# name and none of the excluded ones.
_fcc_required_words = ["forest", "loss", forest_source] + [str(num) for num in years]
_fcc_excluded_words = ["distance", "edge"]
_fcc_candidates = filter_files(
    input_raster_files, _fcc_required_words, _fcc_excluded_words
)
if not _fcc_candidates:
    # Fail with an explanation instead of a bare IndexError on `[0]`.
    raise FileNotFoundError(
        "No forest cover change raster found in "
        f"'{processed_data_folder}' whose name contains all of "
        f"{_fcc_required_words} and none of {_fcc_excluded_words}."
    )
# As before, the first matching file is used.
forest_change_file = _fcc_candidates[0]


## Periods dictionaries

In [None]:
# Calibration period: from the first to the second reference year.
# NOTE(review): "defor_value" is not read by any cell visible here; presumably
# the raster value(s) marking deforestation in each period — confirm.
calibration_dict = {
    "period": "calibration",
    "initial_year": years[0],
    "final_year": years[1],
    "defor_value": 1,
    "time_interval": years[1] - years[0],
}
# Validation period: from the second to the third reference year.
validation_dict = {
    "period": "validation",
    "initial_year": years[1],
    "final_year": years[2],
    "defor_value": 1,
    "time_interval": years[2] - years[1],
}
# Historical period: the full span from the first to the third reference year.
historical_dict = {
    "period": "historical",
    "initial_year": years[0],
    "final_year": years[2],
    "defor_value": [1, 2],
    "time_interval": years[2] - years[0],
}


In [None]:
# Build the main dictionary: period name -> settings of that period.
period_dict = {
    calibration_dict["period"]: calibration_dict,
    validation_dict["period"]: validation_dict,
    historical_dict["period"]: historical_dict,
}


## Compare models

In [None]:
import forestatrisk as far


def compare_models(
    fcc_file,
    csizes_val,
    val_periods,
    val_models,
    available_prediction_files,
    available_defrate_files,
    period_dict,
):
    """Run ``far.validation_udef_arp`` for every cell size, period and model.

    NOTE: a later cell defines another ``compare_models`` with a different
    signature, which replaces this one once that cell has been run.

    For each combination, the first prediction file and the first
    deforestation-rate file whose names contain both the model and the period
    are used (an IndexError is raised when there is none). Results are written
    to ``evaluation_folder / period`` as ``indices_*.csv``, ``pred_obs_*.csv``
    and ``pred_obs_*.png``.

    Parameters:
        fcc_file: Forest cover change raster passed on as ``fcc_file``.
        csizes_val (list): Coarse grid cell sizes (``csize_coarse_grid``).
        val_periods (list): Period names; each must be a key of ``period_dict``.
        val_models (list): Model names to look for in the file names.
        available_prediction_files (list): Candidate risk map files.
        available_defrate_files (list): Candidate deforestation-rate tables.
        period_dict (dict): Maps a period name to a dict with "time_interval".
    """
    for csize_val in csizes_val:
        for period in val_periods:
            period_output_folder = evaluation_folder / period
            period_output_folder.mkdir(parents=True, exist_ok=True)
            for model in val_models:
                wanted_words = [model, period]
                riskmap_file = filter_files(
                    available_prediction_files, wanted_words, None, True
                )[0]
                defrate_file = filter_files(
                    available_defrate_files, wanted_words, None, True
                )[0]
                time_interval = period_dict[period]["time_interval"]

                # Common suffix of the three output file names.
                run_tag = f"{model}_{period}_{csize_val}"
                far.validation_udef_arp(
                    fcc_file=fcc_file,
                    period=period,
                    time_interval=time_interval,
                    riskmap_file=riskmap_file,
                    tab_file_defor=defrate_file,
                    csize_coarse_grid=csize_val,
                    indices_file_pred=period_output_folder / f"indices_{run_tag}.csv",
                    tab_file_pred=period_output_folder / f"pred_obs_{run_tag}.csv",
                    fig_file_pred=period_output_folder / f"pred_obs_{run_tag}.png",
                    verbose=False,
                )


In [None]:
import forestatrisk as far
from typing import Dict


def compare_models(
    fcc_file: Path,
    csizes_val: list[int],
    models_dict: Dict,
    period_dict: Dict,
):
    """Run ``far.validation_udef_arp`` for every matched model and cell size.

    Results go to ``evaluation_folder / period`` as ``indices_*.csv``,
    ``pred_obs_*.csv`` and ``pred_obs_*.png``. A combination is skipped when
    its indices file already exists, so finished runs are not recomputed.

    Parameters:
        fcc_file: Forest cover change raster passed on as ``fcc_file``.
        csizes_val: Coarse grid cell sizes (``csize_coarse_grid``).
        models_dict: Maps (identifier, subsystem, period) to a
            (risk map file, deforestation-rate table) pair.
        period_dict: Maps a period name to a dict with "time_interval".
    """
    for csize_val in csizes_val:
        for (identifier, subsystem, period), (
            riskmap_file,
            defrate_file,
        ) in models_dict.items():
            period_output_folder = evaluation_folder / period
            period_output_folder.mkdir(parents=True, exist_ok=True)

            # Common suffix of the three output file names.
            run_tag = f"{identifier}_{csize_val}"
            indices_file = period_output_folder / f"indices_{run_tag}.csv"
            if indices_file.exists():
                continue

            far.validation_udef_arp(
                fcc_file=fcc_file,
                period=period,
                time_interval=period_dict[period]["time_interval"],
                riskmap_file=riskmap_file,
                tab_file_defor=defrate_file,
                csize_coarse_grid=csize_val,
                indices_file_pred=indices_file,
                tab_file_pred=period_output_folder / f"pred_obs_{run_tag}.csv",
                fig_file_pred=period_output_folder / f"pred_obs_{run_tag}.png",
                verbose=False,
            )


In [None]:
# Evaluate every matched (risk map, deforestation-rate table) pair for each
# coarse grid cell size.
compare_models(
    forest_change_file,
    coarse_grid_cell_size_pixels,
    models_dict,
    period_dict,
)


## Join all the indices

In [None]:
# Collect every .csv below the evaluation folder (sub-folders included), keep
# those with "indices" in the file name, then apply the checkpoint filter.
evaluation_csv_files = list_files_by_extension(evaluation_folder, [".csv"], True)
indices_csv_files = filter_files(evaluation_csv_files, ["indices"], None, False)
indices_csv_files_clean = filter_out_ipynb_checkpoints(indices_csv_files)


In [None]:
import pandas as pd


from pathlib import Path


def extract_info_from_filename(filepath):
    """
    Extracts period and model from a given filename.

    The file name (without extension) is split on underscores and is expected
    to look like ``indices_<model>_<period>_<number>``; the model itself may
    contain underscores.

    Args:
        filepath (str): The full path to the file.

    Returns:
        tuple: A tuple containing (period, model).

    Raises:
        IndexError: If the name has no numeric part and no underscore.
    """
    parts = Path(filepath).stem.split("_")

    # Look from the right for the numeric part; index 0 is never examined.
    for i in range(len(parts) - 1, 0, -1):
        if parts[i].isdigit():
            # The period sits right before the number; everything between the
            # first part ('indices') and the period is the model name.
            period = parts[i - 1]
            model = "_".join(parts[1 : i - 1])
            return period, model

    # No numeric part: the last part is the period, the one before it the model.
    return parts[-1], parts[-2]


def combine_model_results(indices_files_list, output_folder=None):
    """Combine the indices tables of all models into ``indices_all.csv``.

    Every existing file is read, tagged with the model and period taken from
    its file name, and the rows are concatenated, sorted by cell size, period
    and model, and reduced to the reporting columns.

    Parameters:
        indices_files_list (list): Paths of the per-model indices .csv files.
            Missing files and a file named ``indices_all.csv`` (the output of
            a previous run) are skipped.
        output_folder (str or Path, optional): Folder receiving
            ``indices_all.csv``. Defaults to the module-level
            ``evaluation_folder``.

    Raises:
        ValueError: If no usable indices file is left to combine.
    """
    if output_folder is None:
        output_folder = evaluation_folder
    output_file = Path(output_folder) / "indices_all.csv"

    indices_list = []
    for file in indices_files_list:
        file_path = Path(file)
        if not file_path.is_file():
            continue
        # Never merge a previously written combined table into itself.
        if file_path.name == output_file.name:
            continue
        period, model = extract_info_from_filename(file)
        df = pd.read_csv(file)
        df["model"] = model
        df["period"] = period
        indices_list.append(df)

    if not indices_list:
        raise ValueError("No indices files found to combine.")

    # Concatenate once, after all files have been read.
    indices = pd.concat(indices_list, axis=0)
    # sort_values returns a new frame, so the result has to be kept.
    indices = indices.sort_values(by=["csize_coarse_grid", "period", "model"])
    indices = indices[
        [
            "csize_coarse_grid",
            "csize_coarse_grid_ha",
            "ncell",
            "period",
            "model",
            "MedAE",
            "R2",
            "RMSE",
            "wRMSE",
        ]
    ]
    indices.to_csv(
        output_file,
        sep=",",
        header=True,
        index=False,
        index_label=False,
    )


In [None]:
# Write the combined table (indices_all.csv in the evaluation folder).
combine_model_results(indices_csv_files_clean)
