In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
# %load_ext cudf.pandas
# import pandas as pd
# print(pd)


In [3]:
# import cuml
# cuml.accel.install()


In [4]:
# Optimizations
# GDAL optimizations
import multiprocessing as mp
import os

# Leave two cores free for the rest of the system, but never go below one
# worker: on 1- or 2-core machines `cpu_count - 2` would be 0 or negative,
# which is not a meaningful thread count to hand to GDAL.
cpu_count: int = mp.cpu_count()
num_cores: int = max(1, cpu_count - 2)
os.environ["GDAL_NUM_THREADS"] = f"{num_cores}"
# Size of GDAL's raster block cache (presumably interpreted as megabytes for
# a value this small -- confirm against the GDAL configuration docs).
os.environ["GDAL_CACHEMAX"] = "1024"


## Libraries

In [5]:
from pathlib import Path
import numpy as np
import pandas as pd
import riskmapjnr as rmj
from tabulate import tabulate
from patsy import dmatrices


## Set user parameters

In [6]:
# Project identifier; used further down as the name of the project sub-folder
# inside the data directory.
project_name = "test"


In [7]:
# Forest data source key searched for in raster file names (author's note: gfc, tmf)
forest_source = "gfc"
tree_cover_threshold = 10
# Three reference years: the period dictionaries below index them as
# years[0], years[1] and years[2].
years = [2015, 2020, 2024]
string_years = list(map(str, years))


In [8]:
random_seed = 1  # seed handed to the random forest training cells below
n_trees = 100  # number of trees requested when training the random forest


## Connect folders

In [9]:
# The repository root is taken to be the parent of the current working
# directory; all inputs and outputs live under its "data" sub-folder.
root_folder: Path = Path.cwd().parent
downloads_folder: Path = root_folder.joinpath("data")
downloads_folder.mkdir(parents=True, exist_ok=True)


In [10]:
# Project layout: <data>/<project_name>/{data, far_samples, far_rf}
project_folder = downloads_folder.joinpath(project_name)
processed_data_folder = project_folder.joinpath("data")
sampling_folder = project_folder.joinpath("far_samples")
rf_model = project_folder.joinpath("far_rf")

# Create every folder up front so later cells can write into them directly.
for output_dir in (project_folder, processed_data_folder, sampling_folder, rf_model):
    output_dir.mkdir(parents=True, exist_ok=True)


## Helper functions

In [11]:
def list_folders(directory):
    """
    Return the sub-directories found directly inside `directory`.

    Parameters:
        directory (str or Path): The directory to inspect (not recursive).

    Returns:
        list: `pathlib.Path` objects, one per sub-directory, in the order
              produced by `Path.iterdir()`. When the directory is missing or
              any other error occurs, a message is printed and an empty list
              is returned instead of raising.
    """
    try:
        return [child for child in Path(directory).iterdir() if child.is_dir()]
    except FileNotFoundError:
        print(f"The directory {directory} does not exist.")
    except Exception as error:
        # Best-effort helper: report the problem and fall through to [].
        print(f"An error occurred: {error}")
    return []


In [12]:
def filter_folders(input_folders, filter_words, exclude_words=None):
    """
    Select folders by words contained in their name (case-insensitive).

    Parameters:
        input_folders (list): Path-like objects exposing a `.name` attribute.
        filter_words (list): A folder is kept only if its name contains at
            least one of these words.
        exclude_words (list, optional): A folder is dropped if its name
            contains any of these words. Defaults to None (no exclusion).

    Returns:
        list: The folders that passed both tests, in their input order.
    """
    wanted = [word.lower() for word in filter_words]
    unwanted = [word.lower() for word in (exclude_words or [])]

    def _is_selected(folder):
        # Compare on the lower-cased final path component only.
        name = folder.name.lower()
        if not any(word in name for word in wanted):
            return False
        return not any(word in name for word in unwanted)

    return [folder for folder in input_folders if _is_selected(folder)]


In [13]:
def list_files_by_extension(folder_path, file_extensions, recursive=False):
    """
    Collect the files of a folder whose suffix is one of `file_extensions`.

    Parameters:
        folder_path (str or Path): Folder to scan.
        file_extensions (list of str): Suffixes to accept, dot included
            (e.g. ['.shp', '.tif']); compared case-insensitively.
        recursive (bool): When True, sub-directories are scanned as well.

    Returns:
        list: Matching file paths as strings, in directory-listing order.
              If `folder_path` is not a directory or an error occurs, a
              message is printed and whatever was collected so far is returned.
    """
    found = []
    try:
        root = Path(folder_path)

        if not root.is_dir():
            print(f"The provided path '{root}' is not a directory.")
            return found

        for item in root.iterdir():
            if item.is_file():
                suffix = item.suffix.lower()
                if any(suffix == ext.lower() for ext in file_extensions):
                    found.append(str(item))
            elif recursive and item.is_dir():
                # Descend and merge the matches of the sub-directory.
                found.extend(list_files_by_extension(item, file_extensions, recursive))
    except Exception as error:
        print(f"An error occurred: {error}")
    return found


In [14]:
def filter_files(input_files, filter_words, exclude_words=None, include_all=True):
    """
    Select files by words contained in their file name (case-insensitive).

    Only the final path component is inspected, not the parent folders.

    Parameters:
        input_files (list): File paths (str or Path) to filter.
        filter_words (list): Words looked up in each file name.
        exclude_words (list, optional): A file is dropped if its name contains
            any of these words. Defaults to None (no exclusion).
        include_all (bool, optional): If True, every filter word must be
            present in the file name; if False, one is enough. Defaults to True.

    Returns:
        list: The entries of `input_files` that passed, in their input order.
    """
    required = [word.lower() for word in filter_words]
    banned = [word.lower() for word in (exclude_words or [])]
    # `all` demands every required word, `any` settles for a single one.
    combine = all if include_all else any

    selected = []
    for file in input_files:
        name = Path(file).name.lower()
        if not combine(word in name for word in required):
            continue
        if any(word in name for word in banned):
            continue
        selected.append(file)
    return selected


In [15]:
def filter_out_ipynb_checkpoints(input_files):
    """
    Drop files that live inside a Jupyter '.ipynb_checkpoints' folder.

    A file is removed when one of its path components is exactly
    '.ipynb_checkpoints'.

    Parameters:
        input_files (list): File paths (str or Path) to filter.

    Returns:
        list: The remaining entries, in their input order.
    """
    kept = []
    for file in input_files:
        if ".ipynb_checkpoints" in Path(file).parts:
            continue
        kept.append(file)
    return kept


## Select forest cover change file

In [16]:
# Collect every GeoTIFF sitting directly in the processed data folder,
# ignoring copies kept in Jupyter checkpoint folders.
input_raster_files = filter_out_ipynb_checkpoints(
    input_files=list_files_by_extension(
        folder_path=processed_data_folder,
        file_extensions=[".tiff", ".tif"],
    )
)


In [17]:
# Forest cover change raster whose name carries the source and every
# reference year; distance/edge derivatives are excluded. The first match wins.
forest_change_file = filter_files(
    input_files=input_raster_files,
    filter_words=["forest", "loss", forest_source, *map(str, years)],
    exclude_words=["distance", "edge"],
)[0]


## Periods dictionaries

In [18]:
# One settings record per modelling period. "train_period" names the period
# whose samples/model are reused; years come from the `years` list above.
calibration_dict = dict(
    period="calibration",
    train_period="calibration",
    initial_year=years[0],
    final_year=years[1],
    defor_value=1,
    time_interval=years[1] - years[0],
)
validation_dict = dict(
    period="validation",
    train_period="calibration",
    initial_year=years[1],
    final_year=years[2],
    defor_value=1,
    time_interval=years[2] - years[1],
)
historical_dict = dict(
    period="historical",
    train_period="historical",
    initial_year=years[0],
    final_year=years[2],
    defor_value=[1, 2],
    time_interval=years[2] - years[0],
)
forecast_dict = dict(
    period="forecast",
    train_period="historical",
    initial_year=years[0],
    final_year=years[2],
    defor_value=[1, 2],
    time_interval=years[2] - years[0],
)


In [19]:
# Build the main lookup: period name -> settings dictionary of that period
period_dictionaries = {
    settings["period"]: settings
    for settings in (
        calibration_dict,
        validation_dict,
        historical_dict,
        forecast_dict,
    )
}


## Select input files based on period

In [20]:
def get_fcc_files(input_raster_files, period_dict, period):
    """
    Locate the forest cover change (fcc) raster of a period.

    Relies on the notebook-level `years` and `forest_source` variables.

    Parameters:
        input_raster_files (list): Candidate raster file paths.
        period_dict (dict): Mapping period name -> period settings; the
            settings must provide "period", "initial_year" and "final_year".
        period (str): Key of the period to look up in `period_dict`.

    Returns:
        dict: {"period": <period name>, "fcc": <path of the first match>}.

    Raises:
        KeyError: If `period` is not a key of `period_dict`.
        IndexError: If no raster matches the period.
    """
    period_dictionary = period_dict[period]
    initial_year = period_dictionary["initial_year"]
    final_year = period_dictionary["final_year"]

    # Every reference year outside the period is its own exclusion word.
    # Joining them into one "a, b" string never matches a file name, and an
    # empty join ("") matches *every* name and would filter everything out.
    excluded_years = [str(year) for year in set(years) - {initial_year, final_year}]

    forest_loss_files = filter_files(
        input_raster_files,
        [forest_source, str(initial_year), str(final_year), "forest", "loss"],
        [*excluded_years, "edge"],
        True,
    )

    # Variable name -> file path; the first matching raster is used.
    return {
        "period": period_dictionary["period"],
        "fcc": forest_loss_files[0],
    }


In [21]:
def get_samples_for_period(period, sampling_folder):
    """
    Return the path of the sample file used to train the model of `period`.

    The training period is read from the notebook-level
    `period_dictionaries[period]["train_period"]`; the result is
    `<sampling_folder>/<train_period>/sample.txt`.
    """
    train_period = period_dictionaries[period]["train_period"]
    return sampling_folder / train_period / "sample.txt"


In [22]:
def get_csize_for_period(period, sampling_folder):
    """
    Return the path of the 'csize_icar.txt' file of the training period.

    The training period is read from the notebook-level
    `period_dictionaries[period]["train_period"]`; the result is
    `<sampling_folder>/<train_period>/csize_icar.txt`.
    """
    train_period = period_dictionaries[period]["train_period"]
    return sampling_folder / train_period / "csize_icar.txt"


In [23]:
import re


def extract_raw_variables(formula: str) -> set:
    """
    Extract the raw variable (column) names referenced by a Patsy-style formula.

    Names are recognised syntactically instead of through a fixed keyword
    list, so any transformation is handled, not only I(), scale() and C():

        - an identifier followed by "(" is a function call, not a column
          (I, C, scale, poly, bs, Treatment, ...);
        - dotted names such as `np.log` are module/attribute accesses;
        - `name=` inside a call is a keyword-argument name;
        - quoted string literals are data, not columns;
        - numeric literals are ignored.

    Examples:
        "I(1 - fcc) + trial ~ scale(altitude) + C(pa)"
        → returns {'fcc', 'trial', 'altitude', 'pa'}

        "y ~ poly(x, 2) + np.log(z)"
        → returns {'y', 'x', 'z'}

    Parameters:
        formula (str): The formula to analyse.

    Returns:
        set: The distinct variable names found.
    """
    # Remove quoted literals first so their content is never taken for a name.
    without_strings = re.sub(r"\"[^\"]*\"|'[^']*'", " ", formula)

    variable_pattern = re.compile(
        r"(?<![A-Za-z0-9_.])"  # not the tail of a number, name or attribute
        r"[A-Za-z_][A-Za-z0-9_]*"  # the identifier itself
        r"(?![A-Za-z0-9_])"  # ... taken as a whole (no partial match)
        r"(?!\s*[.(])"  # not a function call or a module prefix
        r"(?!\s*=(?!=))"  # not a keyword-argument name (but allow '==')
    )
    return set(variable_pattern.findall(without_strings))


## Train rf based on period

In [24]:
# Right-hand side terms of the model formula: the continuous predictors are
# wrapped in patsy's scale(), while `pa` enters as a categorical factor via C().
variables_for_training = [
    *(
        f"scale({name})"
        for name in (
            "altitude",
            "dist_edge",
            "dist_river",
            "dist_road",
            "dist_town",
            "slope",
        )
    ),
    "C(pa)",
]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
import joblib


def train_rf_from_formula(
    formula: str,
    dataset_file: str,
    out_file: str = "rf_model.joblib",
    random_state: int = 42,
    n_estimators: int = 50,
    min_samples_leaf: int = 2,
    max_depth: int = 15,
    n_jobs: int = -1,
):
    """
    Train a Random Forest classifier from a Patsy formula and a CSV sample file.

    Preprocessing:
        - Reads the CSV file
        - Drops rows with missing values
        - Adds a constant 'trial' column equal to 1
        - Keeps only the columns referenced by the formula (plus 'trial')
        - Validates that those columns exist

    The first column of the formula's left-hand side is used as the response;
    the full right-hand side design matrix (intercept column included) is used
    as the predictor matrix.

    Parameters:
        formula (str): Patsy-style formula (e.g., 'target ~ var1 + C(var2)')
        dataset_file (str or Path): CSV file holding the training samples
        out_file (str or Path): Path to save the trained model via joblib
        random_state (int): Random seed for reproducibility
        n_estimators (int): Number of trees in forest
        min_samples_leaf (int): Minimum samples per leaf node
        max_depth (int): Max depth of each tree
        n_jobs (int): Number of parallel jobs; set to 1 for QGIS safety

    Returns:
        dict: Keys "model", "predictions" (probability of the second class on
              the training data), "deviance" and "formula".

    Raises:
        ValueError: If the file cannot be read, no usable row is left, a
            column required by the formula is missing, or the response holds
            fewer than two classes.
    """

    # Read the dataset from the text file
    print(f"📊 Loading data from {dataset_file}...")
    try:
        dataset = pd.read_csv(dataset_file)
    except Exception as e:
        # Keep the original traceback attached for debugging.
        raise ValueError(f"Failed to read dataset file: {e}") from e

    if dataset.empty:
        raise ValueError("Dataset is empty after loading.")

    # Apply required preprocessing
    print("🧹 Preprocessing data: dropping missing values and adding 'trial' column...")
    dataset = dataset.dropna(axis=0)  # Drop any rows with NA
    if dataset.empty:
        raise ValueError("Dataset is empty after dropping rows with missing values.")
    dataset["trial"] = 1  # Add trial column as 1

    # Extract raw variable names used in the formula (ignoring I(), scale(), C())
    raw_variables = extract_raw_variables(formula)

    # 'trial' is always kept because the formula's left-hand side refers to it
    required_vars = raw_variables | {"trial"}

    # Check which required variables are missing from dataset
    missing_vars = sorted(var for var in required_vars if var not in dataset.columns)

    if missing_vars:
        raise ValueError(f"Missing columns in dataset: {missing_vars}")

    print(len(dataset))

    # Keep only the relevant columns (sorted for a reproducible column order)
    dataset = dataset[sorted(required_vars)]

    print(
        f"💾 Filtered dataset to {len(dataset.columns)} variables: {list(dataset.columns)}"
    )

    # Parse formula using patsy
    y, x = dmatrices(formula, data=dataset, NA_action="drop")
    if len(y) != len(x):
        raise ValueError(
            f"Inconsistent sample sizes after dmatrices: Y={len(y)}, X={len(x)}"
        )

    # Response = first left-hand side term; predictors = the design matrix
    # as built by patsy (nothing is stripped, the intercept column stays in).
    Y = y[:, 0]
    X = x

    # predict_proba(...)[:, 1] below needs two classes; fail early and clearly.
    if np.unique(np.asarray(Y)).size < 2:
        raise ValueError(
            "The response contains a single class; cannot train a classifier."
        )

    # Fit Random Forest model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=n_jobs,  # honour the caller's choice instead of forcing -1
    )
    model.fit(X, Y)

    # Predicted probability of the second class on the training data
    pred_proba = model.predict_proba(X)[:, 1]

    # Compute deviance (twice the summed log loss)
    deviance = 2 * log_loss(Y, pred_proba, normalize=False)

    # Bundle the model with its metadata
    model_data = {
        "model": model,
        "predictions": pred_proba,
        "deviance": deviance,
        "formula": formula,
        "dataset_shape": dataset.shape,
    }

    # Save model with joblib
    joblib.dump(model_data, out_file, compress=3)

    print(f"✅ Random Forest trained and saved to: {out_file}")
    return {
        "model": model,
        "predictions": pred_proba,
        "deviance": deviance,
        "formula": formula,
    }


In [29]:
def train_rf_period(
    variables, period, sampling_folder, model_folder, random_seed, n_estimators
):
    """
    Train and save the random forest model of one period.

    The formula is "I(1-fcc) + trial ~ <variables joined by ' + '>". Samples
    come from `get_samples_for_period`, and the model is written to
    `<model_folder>/<period>/rf_model.joblib`.

    Parameters:
        variables (list of str): Right-hand side terms of the formula.
        period (str): Period name; also the name of the output sub-folder.
        sampling_folder (Path): Folder holding the per-period sample files.
        model_folder (Path): Root folder for the trained models.
        random_seed (int): Seed passed to the random forest.
        n_estimators (int): Number of trees.

    Returns:
        dict: The result of `train_rf_from_formula` (model, predictions,
              deviance, formula), so that callers binding the return value
              get the trained model instead of None.
    """
    # Build the formula
    rf_formula = "I(1-fcc) + trial ~ " + " + ".join(variables)
    # Get samples
    samples_path = get_samples_for_period(period, sampling_folder)
    # Create period folder
    period_output_folder = model_folder / period
    period_output_folder.mkdir(parents=True, exist_ok=True)
    # Set output file
    model_output_file = period_output_folder / "rf_model.joblib"
    # Train RF and hand the result back to the caller
    return train_rf_from_formula(
        rf_formula,
        samples_path,
        model_output_file,
        random_state=random_seed,
        n_estimators=n_estimators,
    )


In [30]:
# Train the random forest on the calibration-period samples
rf_trined_calibration = train_rf_period(
    variables=variables_for_training,
    period="calibration",
    sampling_folder=sampling_folder,
    model_folder=rf_model,
    random_seed=random_seed,
    n_estimators=n_trees,
)


📊 Loading data from /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/far_samples/calibration/sample.txt...
🧹 Preprocessing data: dropping missing values and adding 'trial' column...
19990
💾 Filtered dataset to 9 variables: ['dist_road', 'fcc', 'pa', 'trial', 'altitude', 'dist_river', 'slope', 'dist_town', 'dist_edge']


✅ Random Forest trained and saved to: /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/far_rf/calibration/rf_model.joblib


In [31]:
# Train the random forest on the historical-period samples
rf_trined_historical = train_rf_period(
    variables=variables_for_training,
    period="historical",
    sampling_folder=sampling_folder,
    model_folder=rf_model,
    random_seed=random_seed,
    n_estimators=n_trees,
)


📊 Loading data from /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/far_samples/historical/sample.txt...
🧹 Preprocessing data: dropping missing values and adding 'trial' column...
19990
💾 Filtered dataset to 9 variables: ['dist_road', 'fcc', 'pa', 'trial', 'altitude', 'dist_river', 'slope', 'dist_town', 'dist_edge']
✅ Random Forest trained and saved to: /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/far_rf/historical/rf_model.joblib


## Select input files based on period

In [32]:
def get_period_variable_files(input_raster_files, period_dict, period):
    """
    Map every model variable to the raster file to use for a given period.

    Files are selected by words contained in their name (see `filter_files`);
    the first match is used for each variable. Relies on the notebook-level
    `years` and `forest_source` variables.

    Parameters:
        input_raster_files (list): Candidate raster file paths.
        period_dict (dict): Mapping period name -> period settings; the
            settings must provide "period", "initial_year" and "final_year".
        period (str): 'calibration', 'validation', 'historical' or 'forecast'.

    Returns:
        dict: Keys "period", "altitude", "slope", "pa", "subj", "dist_river",
              "dist_road", "dist_town", "fcc", "dist_edge" and "forest".

    Raises:
        KeyError: If `period` is not a key of `period_dict`.
        ValueError: If `period` is not one of the four supported names.
        IndexError: If no raster is found for one or more variables.
    """
    # Period-independent variables
    altitude_files = filter_files(input_raster_files, ["altitude"], None, False)
    slope_files = filter_files(input_raster_files, ["slope"], None, False)
    # NOTE(review): "pa" is a very short substring and matches any file name
    # containing it -- confirm it cannot pick up an unrelated raster.
    wdpa_files = filter_files(input_raster_files, ["pa"], None, False)
    subj_files = filter_files(input_raster_files, ["subj"], None, False)

    # Distance to rivers and roads
    rivers_files = filter_files(
        input_raster_files, ["rivers", "reprojected", "distance"], None, True
    )
    road_files = filter_files(
        input_raster_files, ["roads", "reprojected", "distance"], None, True
    )

    # Period-dependent variables
    period_dictionary = period_dict[period]
    initial_year = str(period_dictionary["initial_year"])
    final_year = str(period_dictionary["final_year"])

    # Every reference year outside the period is its own exclusion word.
    # Joining them into one "a, b" string never matches a file name, and an
    # empty join ("") matches *every* name and would filter everything out.
    excluded_years = [
        str(year)
        for year in set(years)
        - {period_dictionary["initial_year"], period_dictionary["final_year"]}
    ]
    forest_loss_files = filter_files(
        input_raster_files,
        [forest_source, initial_year, final_year, "forest", "loss"],
        [*excluded_years, "edge"],
        True,
    )
    town_files = filter_files(
        input_raster_files,
        [initial_year, "town", "reprojected", "distance"],
        None,
        True,
    )

    # Forest cover and forest edge are taken at the start of the period,
    # except for the forecast, which starts from the final year.
    if period in ["calibration", "validation", "historical"]:
        forest_year = initial_year
    elif period == "forecast":
        forest_year = final_year
    else:
        raise ValueError(
            f"Invalid period: {period}. Must be 'calibration', 'validation', 'historical', or 'forecast'"
        )
    # NOTE(review): these words also match "..._loss_<y1>_<y2>_reprojected"
    # rasters that contain the same year, so the first match may be a forest
    # *loss* file rather than the forest cover file -- confirm the intended
    # raster is selected.
    forest_files = filter_files(
        input_raster_files,
        [forest_source, forest_year, "forest", "reprojected"],
        ["edge"],
        True,
    )
    forest_edge_files = filter_files(
        input_raster_files,
        [forest_source, forest_year, "forest", "reprojected", "edge"],
        None,
        True,
    )

    candidates = {
        "altitude": altitude_files,
        "slope": slope_files,
        "pa": wdpa_files,
        "subj": subj_files,
        "dist_river": rivers_files,
        "dist_road": road_files,
        "dist_town": town_files,
        "fcc": forest_loss_files,
        "dist_edge": forest_edge_files,
        "forest": forest_files,
    }
    # Same exception type as plain `[0]` indexing, but naming the culprits.
    missing = [name for name, files in candidates.items() if not files]
    if missing:
        raise IndexError(
            f"No raster file found for variable(s) {missing} (period '{period}')"
        )

    # Variable name -> path of the first matching raster
    variable_file_mapping = {"period": period_dictionary["period"]}
    for name, files in candidates.items():
        variable_file_mapping[name] = files[0]
    return variable_file_mapping


In [33]:
def get_trained_model(period_dictionaries, period, model_folder):
    """
    Return the path of the saved model to use for `period`.

    The model is searched in `<model_folder>/<train_period>`, where
    `train_period` comes from `period_dictionaries[period]`. The first
    '.pickle' or '.joblib' file listed there is returned (IndexError if none).
    """
    train_period = period_dictionaries[period]["train_period"]
    candidates = list_files_by_extension(
        model_folder / train_period, [".pickle", ".joblib"]
    )
    return candidates[0]


In [34]:
def get_design_info(formula_icar, dataset_file):
    """
    Build the patsy design information of a formula from a sample file.

    The CSV file is read, rows with missing values are dropped and a constant
    'trial' column (1) is added before the formula is evaluated.

    Returns:
        tuple: (design info of the response, design info of the predictors).
    """
    samples = pd.read_csv(dataset_file).dropna(axis=0)
    samples["trial"] = 1
    response, predictors = dmatrices(
        formula_icar, data=samples, eval_env=0, NA_action="drop"
    )
    return (response.design_info, predictors.design_info)


In [35]:
# Standard library imports
import os
import sys
import uuid

# Third party imports
import numpy as np
from osgeo import gdal
import pandas as pd
from patsy.build import build_design_matrices

# Local application imports
from forestatrisk.misc import rescale, makeblock


# predict_raster
def predict_raster(
    model,
    _x_design_info,
    period_dict_files="data",
    input_forest_raster="data/forest.tif",
    output_file="predictions.tif",
    blk_rows=128,
    verbose=True,
):
    """Predict the spatial probability of deforestation from a
    statistical model.

    This function predicts the spatial probability of deforestation
    from a statistical model. Computation are done by block and
    can be performed on large geographical areas.

    :param model: The model (glm, rf) to predict from. Must have a
        model.predict_proba() function.
    :param _x_design_info: Design matrix information from patsy.
    :param period_dict_files: Mapping from variable name to raster
        path. The "period" and "forest" entries are ignored; every
        other entry is stacked as a raster band, in alphabetical order
        of the keys, and named after its key. The string default is a
        legacy placeholder: a mapping must be passed.
    :param input_forest_raster: Path to forest raster (1 for forest).
    :param output_file: Name of the output raster file for predictions.
    :param blk_rows: If > 0, number of rows for computation by block.
    :param verbose: Logical. Whether to print messages or not. Default
        to ``True``.
    :raises ValueError: If a band of the stacked rasters has no usable
        NoData value.

    """

    # Mask on forest
    if verbose:
        print(f"Using {input_forest_raster} file")
    fmaskR = gdal.Open(input_forest_raster)
    fmaskB = fmaskR.GetRasterBand(1)

    # Landscape variables from forest raster
    gt = fmaskR.GetGeoTransform()
    ncol = fmaskR.RasterXSize
    nrow = fmaskR.RasterYSize
    Xmin = gt[0]
    Xmax = gt[0] + gt[1] * ncol
    Ymin = gt[3] + gt[5] * nrow
    Ymax = gt[3]

    # Raster list
    # Sorted variable keys, leaving out the 'period' and 'forest' entries
    sorted_keys = sorted(
        [key for key in period_dict_files.keys() if key not in ["period", "forest"]]
    )

    # Retrieve the corresponding file paths based on the sorted keys
    raster_list = [period_dict_files[key] for key in sorted_keys]
    # Column names of the per-block table: one per band, then the pixel
    # coordinates and the forest mask appended in the block loop below.
    var_names = sorted_keys + ["X", "Y", "fmask"]

    # Make vrt with gdalbuildvrt
    if verbose:
        print("Make virtual raster with variables as raster bands")
    param = gdal.BuildVRTOptions(
        resolution="user",
        outputBounds=(Xmin, Ymin, Xmax, Ymax),
        xRes=gt[1],
        yRes=-gt[5],
        separate=True,
    )
    rand_uuid = uuid.uuid4()
    vrt_file = f"/vsimem/var_{rand_uuid}.vrt"
    cback = gdal.TermProgress_nocb if verbose else 0
    gdal.BuildVRT(vrt_file, raster_list, options=param, callback=cback)
    stack = gdal.Open(vrt_file)
    nband = stack.RasterCount
    proj = stack.GetProjection()

    # List of nodata values
    bandND = np.zeros(nband)
    for k in range(nband):
        # Check the value BEFORE storing it: assigning None into a float
        # array raises, and an identity test on a numpy scalar is never true.
        nodata = stack.GetRasterBand(k + 1).GetNoDataValue()
        if nodata is None or np.isnan(nodata):
            raise ValueError(
                f"NoData value is not specified for input raster file {k}"
            )
        bandND[k] = nodata
    bandND = bandND.astype(np.float32)

    # Make blocks
    blockinfo = makeblock(vrt_file, blk_rows=blk_rows)
    nblock = blockinfo[0]
    nblock_x = blockinfo[1]
    x = blockinfo[3]
    y = blockinfo[4]
    nx = blockinfo[5]
    ny = blockinfo[6]
    if verbose:
        print(f"Divide region in {nblock} blocks")

    # Raster of predictions
    if verbose:
        print("Create a raster file on disk for projections")
    driver = gdal.GetDriverByName("GTiff")
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass
    Pdrv = driver.Create(
        output_file,
        ncol,
        nrow,
        1,
        gdal.GDT_UInt16,
        ["COMPRESS=DEFLATE", "PREDICTOR=2", "BIGTIFF=YES"],
    )
    Pdrv.SetGeoTransform(gt)
    Pdrv.SetProjection(proj)
    Pband = Pdrv.GetRasterBand(1)
    Pband.SetNoDataValue(0)

    # Predict by block
    # Message
    if verbose:
        print("Predict deforestation probability by block")
    # Loop on blocks of data
    for b in range(nblock):
        # Position in 1D-arrays
        px = b % nblock_x
        py = b // nblock_x
        # Number of pixels
        npix = nx[px] * ny[py]
        # Data for one block of the stack (shape = (nband, nrow, ncol))
        data = stack.ReadAsArray(x[px], y[py], nx[px], ny[py])
        data = data.astype(float)  # From uint to float
        # Add a dimension if there is only one variable. This must happen
        # before the per-band loop, otherwise data[i] would be a row.
        if len(data.shape) == 2:
            data = data[np.newaxis, :, :]
        # Replace ND values with -9999
        for i in range(nband):
            data[i][data[i] == bandND[i]] = -9999
        # Coordinates of the center of the pixels of the block
        X_col = (
            gt[0] + x[px] * gt[1] + (np.arange(nx[px]) + 0.5) * gt[1]
        )  # +0.5 for center of pixels
        X = np.repeat(X_col[np.newaxis, :], ny[py], axis=0)
        X = X[np.newaxis, :, :]
        Y_row = (
            gt[3] + y[py] * gt[5] + (np.arange(ny[py]) + 0.5) * gt[5]
        )  # +0.5 for center of pixels
        Y = np.repeat(Y_row[:, np.newaxis], nx[px], axis=1)
        Y = Y[np.newaxis, :, :]
        # Forest mask: everything that is not forest (1) becomes -9999
        fmaskA = fmaskB.ReadAsArray(x[px], y[py], nx[px], ny[py])
        fmaskA = fmaskA.astype(float)  # From uint to float
        fmaskA[np.nonzero(fmaskA != 1)] = -9999
        fmaskA = fmaskA[np.newaxis, :, :]
        # Concatenate forest mask with stack
        data = np.concatenate((data, X, Y, fmaskA), axis=0)
        # Transpose and reshape to 2D array (one row per pixel)
        data = data.transpose(1, 2, 0)
        data = data.reshape(npix, nband + 3)
        # Observations without NA
        w = np.nonzero(~(data == -9999).any(axis=1))
        # Remove observations with NA
        data = data[w]
        # Transform into a pandas DataFrame
        df = pd.DataFrame(data)
        df.columns = var_names
        # Add fake cell column for _x_design_info
        df["cell"] = 0
        # Predict
        pred = np.zeros(npix)  # Initialize with nodata value (0)
        if len(w[0]) > 0:
            # Design matrix with the same layout as the one used for training
            (x_new,) = build_design_matrices([_x_design_info], df)
            X_new = x_new
            # Get predictions into an array
            p = model.predict_proba(X_new)[:, 1]
            # Rescale (forestatrisk.misc.rescale) before storing in the
            # UInt16 band; 0 stays reserved for nodata
            pred[w] = rescale(p)
        # Assign prediction to raster
        pred = pred.reshape(ny[py], nx[px])
        Pband.WriteArray(pred, x[px], y[py])

    # Compute statistics
    if verbose:
        print("Compute statistics")
    Pband.FlushCache()  # Write cache data to disk
    Pband.ComputeStatistics(False)

    # Dereference driver
    Pband = None
    del Pdrv

    # Close the inputs and free the in-memory VRT; /vsimem files persist
    # for the whole process lifetime unless they are unlinked explicitly.
    stack = None
    fmaskB = None
    fmaskR = None
    gdal.Unlink(vrt_file)


# End


## Apply rf based on period

In [40]:
import pickle
import joblib
from patsy import dmatrices
import forestatrisk


def apply_rf_period(
    period_dictionaries,
    period,
    model_folder,
    processed_data_folder,
    sampling_folder,
):
    """
    Predict the deforestation risk map of a period with its trained model.

    Steps:
        - load the model saved for the period's training period;
        - select the raster files of the period;
        - rebuild the patsy design information from the training samples;
        - write the prediction raster `<model_folder>/<period>/rf_<period>.tif`;
        - write the deforestation rate per category next to it
          (`defrate_cat_rf_<period>.csv`).

    Relies on the notebook-level `forest_change_file` variable for the
    deforestation-rate step.

    Parameters:
        period_dictionaries (dict): Mapping period name -> period settings.
        period (str): Name of the period to predict.
        model_folder (Path): Root folder of the trained models and outputs.
        processed_data_folder (str or Path): Folder holding the input rasters.
        sampling_folder (Path): Folder holding the per-period sample files.

    Returns:
        dict: {"period", "prediction_file", "defrate_file"} with the paths of
              the files written (as strings).
    """
    period_dictionary = period_dictionaries[period]
    period_output_folder = model_folder / period
    period_output_folder.mkdir(parents=True, exist_ok=True)
    # Plain strings are passed on to GDAL/os below.
    prediction_output = str(
        period_output_folder / f"rf_{period_dictionary['period']}.tif"
    )

    # Load the saved model bundle (dict with "model" and "formula").
    # NOTE: joblib.load unpickles the file -- only load trusted models.
    model = get_trained_model(period_dictionaries, period, model_folder)
    with open(model, "rb") as file:
        model_f = joblib.load(file)

    formula = model_f.get("formula")

    # Input rasters for this period
    input_raster_files = filter_out_ipynb_checkpoints(
        list_files_by_extension(processed_data_folder, [".tiff", ".tif"])
    )
    variable_files = get_period_variable_files(
        input_raster_files, period_dictionaries, period
    )
    forest_raster = variable_files["forest"]

    # Only the predictor-side design information is needed for prediction.
    samples = get_samples_for_period(period, sampling_folder)
    (_, x_design_info) = get_design_info(formula, samples)
    time_interval = period_dictionary["time_interval"]
    predict_raster(
        model_f.get("model"),
        x_design_info,
        variable_files,
        forest_raster,
        prediction_output,
        blk_rows=256,
        verbose=True,
    )

    # defrate_per_cat
    print("Calculate deforestation rate per cathegory")
    defrate_output = str(
        period_output_folder / f"defrate_cat_rf_{period_dictionary['period']}.csv"
    )
    forestatrisk.defrate_per_cat(
        forest_change_file,
        prediction_output,
        time_interval,
        period,
        defrate_output,
        128,
        False,
    )

    return {
        "period": period,
        "prediction_file": prediction_output,
        "defrate_file": defrate_output,
    }


In [41]:
# Risk map for the calibration period
rf_predict_calibration = apply_rf_period(
    period_dictionaries=period_dictionaries,
    period="calibration",
    model_folder=rf_model,
    processed_data_folder=processed_data_folder,
    sampling_folder=sampling_folder,
)


Using /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/data/test_forest_gfc_10_loss_2015_2015_reprojected.tif file
Make virtual raster with variables as raster bands
Divide region in 32 blocks
Create a raster file on disk for projections
Predict deforestation probability by block
Compute statistics
Calculate deforestation rate per cathegory


In [42]:
# Risk map for the validation period (uses the calibration model)
rf_predict_validation = apply_rf_period(
    period_dictionaries=period_dictionaries,
    period="validation",
    model_folder=rf_model,
    processed_data_folder=processed_data_folder,
    sampling_folder=sampling_folder,
)


Using /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/data/test_forest_gfc_10_loss_2015_2020_reprojected.tif file
Make virtual raster with variables as raster bands
Divide region in 32 blocks
Create a raster file on disk for projections
Predict deforestation probability by block
Compute statistics
Calculate deforestation rate per cathegory


In [43]:
# Risk map for the historical period
rf_predict_historical = apply_rf_period(
    period_dictionaries=period_dictionaries,
    period="historical",
    model_folder=rf_model,
    processed_data_folder=processed_data_folder,
    sampling_folder=sampling_folder,
)


Using /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/data/test_forest_gfc_10_loss_2015_2015_reprojected.tif file
Make virtual raster with variables as raster bands
Divide region in 32 blocks
Create a raster file on disk for projections
Predict deforestation probability by block
Compute statistics
Calculate deforestation rate per cathegory


In [44]:
# Risk map for the forecast period (uses the historical model)
rf_predict_forecast = apply_rf_period(
    period_dictionaries=period_dictionaries,
    period="forecast",
    model_folder=rf_model,
    processed_data_folder=processed_data_folder,
    sampling_folder=sampling_folder,
)


Using /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/data/test_forest_gfc_10_2024_reprojected.tif file
Make virtual raster with variables as raster bands
Divide region in 32 blocks
Create a raster file on disk for projections
Predict deforestation probability by block
Compute statistics
Calculate deforestation rate per cathegory
