In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
# %load_ext cudf.pandas
# import pandas as pd
# print(pd)


In [None]:
# import cuml
# cuml.accel.install()


In [None]:
# Optimizations
# GDAL optimizations
import multiprocessing as mp
import os

# Leave two cores free for the rest of the system, but never request fewer than
# one GDAL worker thread (cpu_count - 2 is <= 0 on one- or two-core machines).
cpu_count: int = mp.cpu_count()
num_cores: int = max(1, cpu_count - 2)
os.environ["GDAL_NUM_THREADS"] = f"{num_cores}"
# GDAL raster block cache size (small values are read by GDAL as megabytes).
os.environ["GDAL_CACHEMAX"] = "1024"


## Libraries

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import riskmapjnr as rmj
from patsy import dmatrices


In [None]:
# Add root to path
import sys

sys.path.append("..")
from component.script.utilities.file_filter import (
    list_files_by_extension,
    filter_files_by_keywords,
    filter_files_by_keywords_strict,
)


## Set user parameters

In [None]:
# Project identifier; used below as the name of the project sub-folder inside
# the data folder.
project_name = "test"


In [None]:
# Forest data source: "gfc" or "tmf".
forest_source = "gfc"
tree_cover_threshold = 10
# Three reference years; at least three are required by create_full_period_dict.
years = [2015, 2020, 2024]
# Same years as strings.
string_years = list(map(str, years))


In [None]:
# Variables whose raster files are looked up without any year in the search
# terms (see create_full_period_dict -> find_file(dynamic=False)).
static_variables = ["altitude", "slope", "pa", "subj", "dist_rivers", "dist_roads"]
# Variables whose raster files are looked up together with the year(s) of the
# modelling period (find_file(dynamic=True)).
dynamic_variables = [
    "forest",
    "deforestation",
    "forest_edge",
    "dist_towns",
]


In [None]:
# Settings forwarded to train_icar_period / train_icar_from_formula below.
# csize = 10
prior_vrho = -1  # passed to forestatrisk as priorVrho
mcmc = 6000  # rounded down to a multiple of 1000 inside train_icar_from_formula
burnin = 4000
thin = 1  # NOTE: train_icar_from_formula recomputes thin from mcmc
beta_start = -99
random_seed = 1
csize_interpolate = 0.1  # passed to forestatrisk.interpolate_rho as csize_new


In [None]:
# Suffix embedded in every output file name written by this notebook
# (e.g. icar_model_<id>.pickle, summary_icar_<id>.txt, rho_<id>.tif).
model_identifier_name = "v1"


## Connect folders

In [None]:
# The project root is the parent of the current working directory; all project
# data lives under <root>/data, which is created if it does not exist yet.
root_folder: Path = Path.cwd().parent
downloads_folder: Path = root_folder / "data"
downloads_folder.mkdir(parents=True, exist_ok=True)


In [None]:
# Declare the project folder layout first ...
project_folder = downloads_folder / project_name
processed_data_folder = project_folder / "data"
sampling_folder = project_folder / "far_samples"
icar_model_folder = project_folder / "far_icar"

# ... then make sure every folder exists (same creation order as declared).
for _folder in (
    project_folder,
    processed_data_folder,
    sampling_folder,
    icar_model_folder,
):
    _folder.mkdir(parents=True, exist_ok=True)


## Helper functions

In [None]:
def list_files_by_extension(folder_path, file_extensions, recursive=False):
    """
    List the files in a folder that end with one of the given extensions.

    Note: this definition shadows the helper of the same name imported from
    ``component.script.utilities.file_filter`` earlier in this notebook.

    Parameters:
        folder_path: Folder to search (str or Path).
        file_extensions: Iterable of extensions, with or without the leading
            dot (".tif" and "tif" are treated the same).
        recursive (bool): If True, sub-folders are searched as well.

    Returns:
        list[Path]: Matching paths, excluding anything located inside a
        ".ipynb_checkpoints" folder. Order follows the order of
        ``file_extensions`` and, within one extension, the glob order.
    """
    folder = Path(folder_path)
    # Pick the search method once; it does not depend on the extension.
    search = folder.rglob if recursive else folder.glob

    files = []
    for ext in file_extensions:
        # Normalise so both branches use the same "*.ext" pattern. The original
        # recursive branch produced "*..tif" for ".tif" and matched nothing.
        suffix = ext if ext.startswith(".") else f".{ext}"
        files.extend(search(f"*{suffix}"))

    # Jupyter checkpoint copies must never be picked up as inputs.
    return [f for f in files if ".ipynb_checkpoints" not in Path(f).parts]


## Select forest cover change file

In [None]:
# List all raster files in the processed data folder
# List all raster files in the processed data folder and keep the first one
# returned by filter_files_by_keywords for the "defostack" keyword.
input_raster_files = list_files_by_extension(processed_data_folder, [".tiff", ".tif"])
defostack_candidates = filter_files_by_keywords(input_raster_files, ["defostack"])
if not defostack_candidates:
    # Fail with an explicit message instead of an opaque IndexError on [0].
    raise FileNotFoundError(
        f"No 'defostack' raster (.tif/.tiff) found in {processed_data_folder}"
    )
forest_change_file = defostack_candidates[0]
forest_change_file


## Periods dictionaries

In [None]:
import re
from pathlib import Path


def create_full_period_dict(
    years: list[int],
    period: str,
    processed_data_folder: Path,
    static_variables: list[str],
    dynamic_variables: list[str],
):
    """
    Create a comprehensive dictionary for a given modeling period.

    The result combines the period metadata (years, time interval, training
    period, deforestation value) with one entry per variable holding the raster
    file found for it in ``processed_data_folder`` (or None when nothing is
    found). Static variables are searched by name only; dynamic variables are
    searched by name plus the relevant year(s).

    Parameters:
        years: At least three years; indices 0, 1 and 2 are used.
        period: One of "calibration", "validation", "historical", "forecast".
        processed_data_folder: Folder scanned for ".tif"/".tiff" rasters.
        static_variables: Names of period-independent variables.
        dynamic_variables: Names of year-dependent variables.

    Returns:
        dict: Period metadata plus ``{variable_name: file or None}`` entries.

    Raises:
        ValueError: If fewer than three years are given or ``period`` is unknown.
    """

    if len(years) < 3:
        raise ValueError("The 'years' list must contain at least three elements.")

    # Which positions of `years` delimit each period, which model it is trained
    # with, and which year the dynamic variables are taken from.
    configs = {
        "calibration": {
            "train_period": "calibration",
            "initial_idx": 0,
            "final_idx": 1,
            "defor_value": 1,
            "var_idx": 0,
        },
        "validation": {
            "train_period": "calibration",
            "initial_idx": 1,
            "final_idx": 2,
            "defor_value": 1,
            "var_idx": 1,
        },
        "historical": {
            "train_period": "historical",
            "initial_idx": 0,
            "final_idx": 2,
            "defor_value": [1, 2],
            "var_idx": 0,
        },
        "forecast": {
            "train_period": "historical",
            "initial_idx": 0,
            "final_idx": 2,
            "defor_value": [1, 2],
            "var_idx": 2,
        },
    }

    if period not in configs:
        raise ValueError(f"Unknown period '{period}'. Must be one of: {list(configs)}.")

    c = configs[period]

    # --- Base period dictionary ---
    period_dict = {
        "period": period,
        "train_period": c["train_period"],
        "initial_year": years[c["initial_idx"]],
        "final_year": years[c["final_idx"]],
        "defor_value": c["defor_value"],
        "time_interval": years[c["final_idx"]] - years[c["initial_idx"]],
        "var_year": years[c["var_idx"]],
    }

    # Years as strings, because they are used as file-name search terms.
    # (The former `exclude_years` local was removed: it subtracted these strings
    # from a set of ints, which removed nothing, and it was never used.)
    initial_year = str(period_dict["initial_year"])
    final_year = str(period_dict["final_year"])
    var_year = str(period_dict["var_year"])
    period_name = str(period_dict["period"])

    variable_file_mapping = {"period": period}
    input_raster_files = list_files_by_extension(
        processed_data_folder, [".tiff", ".tif"]
    )

    # --- Modular file search ---
    def _is_token_separate_in_name(token: str, name: str) -> bool:
        """
        Return True if `token` occurs in `name` as a standalone word.

        "Standalone" means the token is not directly preceded or followed by an
        ASCII letter or digit, so "_", ".", "-" and path separators all count as
        word boundaries. Purely numeric tokens (e.g. years) are matched as a
        plain substring instead.
        """
        if token.isdigit():  # years or other numbers: plain substring search
            return token in name
        # Look-arounds ensure the token is not glued to letters or digits.
        pattern = rf"(?<![0-9A-Za-z]){re.escape(token)}(?![0-9A-Za-z])"
        return re.search(pattern, name) is not None

    def _strict_candidate_filter(candidates, tokens):
        """
        Keep only the candidates whose lower-cased path contains every token as
        a standalone word (see _is_token_separate_in_name).
        """
        filtered = []
        for p in candidates:
            s = str(p).lower()
            if all(_is_token_separate_in_name(tok.lower(), s) for tok in tokens):
                filtered.append(p)
        return filtered

    def find_file(var_name, dynamic=False):
        """
        Find the raster file for `var_name`, or return None if there is none.

        For dynamic variables the search terms also include the year(s) of the
        period (the variable year for "forecast").
        """
        parts = var_name.split("_")
        # Single-word names (e.g. "forest") pass "distance"/"edge" as exclusion
        # terms, presumably so derived layers such as "forest_edge" are skipped.
        exclude_terms = ["distance", "edge"] if len(parts) == 1 else None

        if not dynamic:
            # NOTE(review): `include_terms` aliases `parts` here, so the
            # "distance" term appended below is also required by the strict
            # filter for static variables (but not for dynamic ones). Kept as
            # is to preserve the existing file matching -- confirm intent.
            include_terms = parts
        elif period_name == "forecast":
            include_terms = [*parts, var_year]
        elif "deforestation" in parts:
            # Deforestation layers are identified by both period years.
            include_terms = [*parts, initial_year, final_year]
        else:
            include_terms = [*parts, initial_year]

        # "dist_*" variables are also searched with the full word "distance".
        if "dist" in parts and "distance" not in include_terms:
            include_terms.append("distance")

        files = filter_files_by_keywords(
            input_raster_files, include_terms, False, exclude_terms, True
        )
        # Forecast fallback: retry with the middle year when nothing matched
        # the forecast year.
        if not files and period_name == "forecast":
            include_terms = [*parts, str(years[1])]
            files = filter_files_by_keywords(
                input_raster_files, include_terms, False, exclude_terms, True
            )
        if not files:
            return None

        # A single candidate is unambiguous.
        if len(files) == 1:
            return files[0]

        # Several candidates: keep those matching every token as a whole word
        # and return the first one (heuristic). None if the strict filter
        # rejects them all.
        strict = _strict_candidate_filter(files, parts)
        return strict[0] if strict else None

    # --- Period-independent variables ---
    for var in static_variables:
        variable_file_mapping[var] = find_file(var, dynamic=False)

    # --- Multi-temporal variables ---
    for var in dynamic_variables:
        variable_file_mapping[var] = find_file(var, dynamic=True)

    # --- Final merge ---
    period_dict.update(variable_file_mapping)
    return period_dict


In [None]:
# Build the four period dictionaries in one pass; the order of the names on the
# left matches the order of the period labels on the right.
calibration_dict, validation_dict, historical_dict, forecast_dict = (
    create_full_period_dict(
        years,
        period_label,
        processed_data_folder,
        static_variables,
        dynamic_variables,
    )
    for period_label in ("calibration", "validation", "historical", "forecast")
)


In [None]:
# Main dictionary: each period dictionary indexed by its own "period" label.
period_dictionaries = {
    entry["period"]: entry
    for entry in (calibration_dict, validation_dict, historical_dict, forecast_dict)
}


## Training formula


In [None]:
def generate_formula(
    dependent_variable: str,
    independent_variables_continuous: list[str] | None = None,
    independent_variables_categorical: list[str] | None = None,
) -> str:
    """
    Generate a regression formula string with scaled continuous variables
    and categorical variables using Patsy-style syntax.

    Example:
        generate_formula("y", ["age", "weight"], ["sex", "breed"])
        -> "I(y) + trial ~ scale(age) + scale(weight) + C(sex) + C(breed)"
    """
    independent_variables_continuous = independent_variables_continuous or []
    independent_variables_categorical = independent_variables_categorical or []

    parts = []
    if independent_variables_continuous:
        parts += [f"scale({x})" for x in independent_variables_continuous]
    if independent_variables_categorical:
        parts += [f"C({x})" for x in independent_variables_categorical]

    rhs = " + ".join(parts) if parts else "1"  # intercept-only model if empty
    formula = f"I({dependent_variable}) + trial ~ {rhs}"
    return formula


In [None]:
# Optional hand-written formula. Leave as None to use the formula generated
# from the variable lists below; the commented line shows the expected syntax.
# user_formula = "I(1-fcc) + trial ~ scale(altitude) + scale(dist_edge) + scale(dist_river) + scale(dist_road) + scale(dist_town) + scale(slope) + C(pa)"
user_formula = None


In [None]:
# Response expression placed inside I(...) by generate_formula.
dependant_variable = "1-deforestation"
# Predictors wrapped in scale(...).
independent_variables_continuous = [
    "altitude",
    "forest_edge",
    "dist_rivers",
    "dist_roads",
    "dist_towns",
    "slope",
]
# Predictors wrapped in C(...).
independant_variable_categorical = ["pa"]

calculated_formula = generate_formula(
    dependant_variable,
    independent_variables_continuous,
    independant_variable_categorical,
)


In [None]:
# A user-supplied formula takes precedence over the generated one.
training_formula = calculated_formula if user_formula is None else user_formula
training_formula


## Train the iCAR model for each training period

In [None]:
import pickle
import pandas as pd
import forestatrisk
from component.script.far_helpers import extract_variables


def train_icar_from_formula(
    formula: str,
    dataset_file: str,
    raster_path: str,
    csize: int,
    prior_vrho: int = -1,
    mcmc: int = 6000,
    burnin: int = 3000,
    thin: int = 1,
    beta_start=-99,
    random_state: int = 42,
    model_file: str = "icar_model.pickle",
    summary_file: str = "summary_icar.txt",
    mcmc_file: str = "mcmc.pdf",
):
    """
    Train a Bayesian iCAR model using the forestatrisk library.

    Parameters:
        formula (str): Patsy-style formula WITHOUT the trailing "+ cell" term,
            e.g. 'I(1-fcc) + trial ~ scale(var1) + C(var2)'. "+ cell" is
            appended before the model is fitted.
        dataset_file (str | Path): CSV file with the sample table. It must
            contain every variable used in the formula plus a "cell" column.
        raster_path (str): Raster passed to ``forestatrisk.cellneigh`` to build
            the spatial neighbourhood.
        csize: Cell size passed to ``forestatrisk.cellneigh``.
        prior_vrho: Passed to forestatrisk as ``priorVrho`` (default -1).
        mcmc (int): Number of MCMC iterations. Values >= 1000 are rounded down
            to a multiple of 1000.
        burnin (int): Number of burn-in iterations.
        thin (int): Accepted for interface compatibility only; the thinning
            interval is always recomputed from ``mcmc`` (``mcmc / 1000`` when
            ``mcmc >= 1000``, otherwise 1).
        beta_start: Starting value for the coefficients, passed through.
        random_state (int): Seed passed to the sampler as ``seed``.
        model_file (str | Path): Output pickle with the model results.
        summary_file (str | Path): Output text file with ``str(model)``.
        mcmc_file (str | Path): Reserved for the MCMC diagnostic plots, which
            are currently disabled; nothing is written to it.

    Returns:
        dict: ``formula``, ``rho``, ``betas``, ``Vrho``, ``deviance`` and
        ``samples_path`` (the same content that is pickled to ``model_file``).

    Raises:
        ValueError: If the dataset cannot be read, is empty (before or after
            dropping rows with missing values), lacks required columns, or the
            design matrices have inconsistent lengths.
    """

    # Read the dataset from the text file
    print(f"📊 Loading data from {dataset_file}...")
    try:
        dataset = pd.read_csv(dataset_file)
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise ValueError(f"Failed to read dataset file: {e}") from e

    if dataset.empty:
        raise ValueError("Dataset is empty after loading.")

    # Apply required preprocessing
    print("🧹 Preprocessing data: dropping missing values and adding 'trial' column...")
    dataset = dataset.dropna(axis=0)  # Drop any rows with NA
    if dataset.empty:
        raise ValueError("No rows left after dropping missing values.")
    dataset["trial"] = 1  # One Bernoulli trial per sampled observation

    # Raw variable names used in the formula (ignoring I(), scale(), C()).
    # extract_variables is assumed to return a set -- it is combined with `|`.
    raw_variables = extract_variables(formula, "all")

    # `trial` is referenced by the formula and `cell` is appended to it below.
    required_vars = raw_variables | {"trial", "cell"}

    missing_vars = [var for var in required_vars if var not in dataset.columns]
    if missing_vars:
        raise ValueError(f"Missing columns in dataset: {missing_vars}")

    print(len(dataset))

    # Keep only the relevant columns (sorted for a reproducible column order).
    dataset = dataset[sorted(required_vars)]

    print(
        f"💾 Filtered dataset to {len(dataset.columns)} variables: {list(dataset.columns)}"
    )

    # Step 1: Sanity-check that the formula can be evaluated on the dataset.
    y, x = dmatrices(formula, data=dataset, NA_action="drop")
    if len(y) != len(x):
        raise ValueError(
            f"Inconsistent sample sizes after dmatrices: Y={len(y)}, X={len(x)}"
        )

    # Step 2: Compute the spatial neighbourhood (adjacency structure) of cells
    nneigh, adj = forestatrisk.cellneigh(
        raster=raster_path,
        csize=csize,
        rank=1,
    )
    print(f"✅ Spatial neighborhood computed")

    # Step 3: Round mcmc down to thousands and derive thin from it, so that
    # mcmc / thin == 1000 whenever mcmc >= 1000.
    if mcmc >= 1000:
        mcmc = int(mcmc // 1000) * 1000
        thin = int(mcmc / 1000)
    else:
        thin = 1

    # Step 4: Train iCAR model via forestatrisk
    mod_icar = forestatrisk.model_binomial_iCAR(
        suitability_formula=formula + " + cell",
        data=dataset,
        n_neighbors=nneigh,
        neighbors=adj,
        priorVrho=prior_vrho,
        burnin=burnin,
        mcmc=mcmc,
        thin=thin,
        beta_start=beta_start,
        # Previously `random_state` was accepted but never used, so the
        # library's default seed was always applied.
        seed=random_state,
    )

    print(f"✅ iCAR model trained.")

    # Step 5: Extract results and save model metadata (pickle)
    model_data = {
        "formula": mod_icar.suitability_formula,
        "rho": mod_icar.rho,
        "betas": mod_icar.betas,
        "Vrho": mod_icar.Vrho,
        "deviance": mod_icar.deviance,
        "samples_path": dataset_file,
    }

    with open(model_file, "wb") as file:
        pickle.dump(model_data, file)
    print(f"💾 iCAR model metadata saved to: {model_file}")

    # Step 6: Save summary
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(str(mod_icar))
    print(f"📄 Summary saved to: {summary_file}")

    # Step 7: Plot MCMC diagnostics (disabled)
    # figs = mod_icar.plot(
    #     output_file= str(mcmc_file),
    #     plots_per_page=3,
    #     figsize=(10, 6),
    #     dpi=80
    # )
    # for fig in figs:
    #     plt.close(fig)
    # print(f"📊 MCMC diagnostics saved to: {str(mcmc_file)}")

    # Step 8: Return results
    return model_data


In [None]:
def get_samples_for_period(period, sample_name: str = "sample.txt"):
    """
    Return the path of the sample table used to train the model of `period`.

    The folder is chosen from the period's "train_period" entry, not from the
    period name itself.
    """
    training_period = period_dictionaries[period]["train_period"]
    return sampling_folder / training_period / sample_name


In [None]:
def get_csize_for_period(period, csize_name: str = "csize_icar.txt"):
    """
    Read the cell size stored next to the samples of the period's training
    period and return it as an int (the fractional part is truncated).
    """
    training_period = period_dictionaries[period]["train_period"]
    csize_path = sampling_folder / training_period / csize_name
    # Undecodable bytes are ignored, as before; float() tolerates surrounding
    # whitespace such as a trailing newline.
    raw_text = csize_path.read_text(encoding="utf-8", errors="ignore")
    return int(float(raw_text))


In [None]:
def train_icar_period(
    formula,
    period_dictionaries,
    period,
    sample_path,
    model_folder,
    csize,
    prior_vrho,
    mcmc,
    burnin,
    thin,
    beta_start,
    random_seed,
    csize_interpolate,
):
    """
    Train the iCAR model for one period and interpolate its spatial effects.

    Outputs are written to ``model_folder / period``: the pickled model, its
    text summary and the interpolated rho raster, each suffixed with the
    notebook-level ``model_identifier_name``.

    Returns:
        dict: The model data returned by ``train_icar_from_formula``.
    """
    # Raster of the response variable named inside I(...) in the formula
    response_name = extract_variables(formula, "I").pop()
    response_raster = period_dictionaries[period][response_name]
    print(response_raster)

    # Output folder for this period
    out_dir = model_folder / period
    out_dir.mkdir(parents=True, exist_ok=True)

    # Output files
    model_output = out_dir / f"icar_model_{model_identifier_name}.pickle"
    summary_file = out_dir / f"summary_icar_{model_identifier_name}.txt"
    mcmc_file = out_dir / f"mcmc_{model_identifier_name}.pdf"
    rho_file = out_dir / f"rho_{model_identifier_name}.tif"

    # Train the iCAR model
    icar_trained = train_icar_from_formula(
        formula=formula,
        dataset_file=sample_path,
        raster_path=response_raster,
        csize=csize,
        prior_vrho=prior_vrho,
        mcmc=mcmc,
        burnin=burnin,
        thin=thin,
        beta_start=beta_start,
        random_state=random_seed,
        model_file=model_output,
        summary_file=summary_file,
        mcmc_file=mcmc_file,
    )

    # Read rho back from the pickle that was just written
    with open(str(model_output), "rb") as handle:
        rho = pickle.load(handle)["rho"]

    # Interpolate rho from the model cell size to the finer target cell size
    forestatrisk.interpolate_rho(
        rho=rho,
        input_raster=response_raster,
        output_file=str(rho_file),
        csize_orig=csize,
        csize_new=csize_interpolate,
    )
    return icar_trained


In [None]:
# Train the model for the calibration period
period_t = "calibration"

samples = get_samples_for_period(period_t, "sample.txt")
samples_csize = get_csize_for_period(period_t, "csize_icar.txt")

calibration_icar = train_icar_period(
    formula=training_formula,
    period_dictionaries=period_dictionaries,
    period=period_t,
    sample_path=samples,
    model_folder=icar_model_folder,
    csize=samples_csize,
    prior_vrho=prior_vrho,
    mcmc=mcmc,
    burnin=burnin,
    thin=thin,
    beta_start=beta_start,
    random_seed=random_seed,
    csize_interpolate=csize_interpolate,
)


In [None]:
# Train the model for the historical period
period_t = "historical"

samples = get_samples_for_period(period_t, "sample.txt")
samples_csize = get_csize_for_period(period_t, "csize_icar.txt")

historical_icar = train_icar_period(
    formula=training_formula,
    period_dictionaries=period_dictionaries,
    period=period_t,
    sample_path=samples,
    model_folder=icar_model_folder,
    csize=samples_csize,
    prior_vrho=prior_vrho,
    mcmc=mcmc,
    burnin=burnin,
    thin=thin,
    beta_start=beta_start,
    random_seed=random_seed,
    csize_interpolate=csize_interpolate,
)


## Apply the iCAR model to each period

In [None]:
from patsy.build import build_design_matrices
from forestatrisk.misc import invlogit, makeblock


# predict_binomial_iCAR
def predict_binomial_iCAR(model_betas, _x_design_info, new_data, rhos):
    """Predict probabilities from iCAR coefficients and given spatial effects.

    The spatial random effects are supplied directly through ``rhos`` rather
    than taken from a fitted model object.

    :param model_betas: Coefficients multiplied with the design matrix.
    :param _x_design_info: Patsy design info used to build the design matrix
        for ``new_data``.
    :param new_data: Pandas DataFrame including explicative variables.
    :param rhos: Spatial random effects for each observation (row) in new_data.
    :return: Predictions (probabilities).

    """

    # Exactly one design matrix is expected for the single design info given.
    [design_matrix] = build_design_matrices([_x_design_info], new_data)
    # Drop the last column (presumably the "cell" term -- confirm) so only the
    # columns matching `model_betas` remain.
    fixed_effects = design_matrix[:, :-1]
    linear_predictor = np.dot(fixed_effects, model_betas) + rhos
    return invlogit(linear_predictor)


In [None]:
# Standard library imports
import os
import sys
import uuid

# Third party imports
import numpy as np
from osgeo import gdal
import pandas as pd

# Local application imports
from forestatrisk.misc import rescale, makeblock


# predict_raster
def predict_raster_icar(
    model,
    _x_design_info,
    period_dict="data",
    input_cell_raster="output/rho.tif",
    output_file="predictions.tif",
    blk_rows=128,
    verbose=True,
):
    """Predict the spatial probability of deforestation from a pickled
    iCAR model.

    Computations are done by block of rows so that large geographical
    areas can be processed.

    :param model: Path to the pickled model data; the "formula" and
        "betas" entries are read from it.
    :param _x_design_info: Design matrix information from patsy.
    :param period_dict: Period dictionary mapping each predictor name
        (and "forest") to its raster file.
    :param input_cell_raster: Path to the raster of spatial random
        effects (rho).
    :param output_file: Name of the output raster file for predictions
        (UInt16 GeoTIFF, 0 = NoData). An existing file is replaced.
    :param blk_rows: If > 0, number of rows for computation by block.
    :param verbose: Logical. Whether to print messages or not. Default
        to ``True``.
    :raises ValueError: If a required raster is missing (None) in
        ``period_dict``.
    :raises FileNotFoundError: If the forest raster cannot be opened.

    """

    # Read model and extract data
    model_pickle = pd.read_pickle(model)
    formula = model_pickle.get("formula")
    model_betas = model_pickle.get("betas")
    predictors_variable = extract_variables(formula, "predictors")
    predictors_variable.remove("cell")

    sorted_variables = sorted(predictors_variable)

    # Fail early, with the variable names, when a raster was not found for the
    # period instead of handing None to GDAL.
    missing_rasters = [key for key in sorted_variables if period_dict[key] is None]
    if missing_rasters:
        raise ValueError(f"No raster file found for variables: {missing_rasters}")

    # File paths in the same (sorted) order as the variable names; GDAL is
    # given plain strings so pathlib.Path values are accepted too.
    raster_list = [str(period_dict[key]) for key in sorted_variables]
    raster_list.append(str(input_cell_raster))

    # Get forest layer from period dictionary
    input_forest_raster = period_dict["forest"]
    if input_forest_raster is None:
        raise ValueError("No raster file found for variable: 'forest'")

    # Mask on forest
    if verbose:
        print(f"Using {input_forest_raster} file")
    fmaskR = gdal.Open(str(input_forest_raster))
    if fmaskR is None:
        raise FileNotFoundError(f"Cannot open forest raster: {input_forest_raster}")
    fmaskB = fmaskR.GetRasterBand(1)

    # Landscape variables from forest raster
    gt = fmaskR.GetGeoTransform()
    ncol = fmaskR.RasterXSize
    nrow = fmaskR.RasterYSize
    Xmin = gt[0]
    Xmax = gt[0] + gt[1] * ncol
    Ymin = gt[3] + gt[5] * nrow
    Ymax = gt[3]

    # Column names of the per-block table: predictors, then rho, then the
    # forest mask that is concatenated to each block below.
    var_names = [*sorted_variables, "rho", "fmask"]

    # Make vrt with gdalbuildvrt
    if verbose:
        print("Make virtual raster with variables as raster bands")
    param = gdal.BuildVRTOptions(
        resolution="user",
        outputBounds=(Xmin, Ymin, Xmax, Ymax),
        xRes=gt[1],
        yRes=-gt[5],
        separate=True,
    )
    rand_uuid = uuid.uuid4()
    vrt_file = f"/vsimem/var_{rand_uuid}.vrt"
    cback = gdal.TermProgress_nocb if verbose else 0
    gdal.BuildVRT(vrt_file, raster_list, options=param, callback=cback)
    stack = gdal.Open(vrt_file)
    nband = stack.RasterCount
    proj = stack.GetProjection()

    # List of nodata values.
    # The previous check compared the stored NumPy float against None/np.nan
    # by identity, which can never be true, so it never fired. The raw value
    # is tested now; a missing NoData value is reported and stored as NaN
    # (which matches no pixel), i.e. processing continues as it always did.
    bandND = np.zeros(nband)
    for k in range(nband):
        band = stack.GetRasterBand(k + 1)
        nodata = band.GetNoDataValue()
        if nodata is None or np.isnan(nodata):
            print(f"NoData value is not specified for input raster file {k}")
            nodata = np.nan
        bandND[k] = nodata
    bandND = bandND.astype(np.float32)

    # Make blocks
    blockinfo = makeblock(vrt_file, blk_rows=blk_rows)
    nblock = blockinfo[0]
    nblock_x = blockinfo[1]
    x = blockinfo[3]
    y = blockinfo[4]
    nx = blockinfo[5]
    ny = blockinfo[6]
    if verbose:
        print(f"Divide region in {nblock} blocks")

    # Raster of predictions
    if verbose:
        print("Create a raster file on disk for projections")
    driver = gdal.GetDriverByName("GTiff")
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass
    Pdrv = driver.Create(
        str(output_file),
        ncol,
        nrow,
        1,
        gdal.GDT_UInt16,
        ["COMPRESS=DEFLATE", "PREDICTOR=2", "BIGTIFF=YES"],
    )
    Pdrv.SetGeoTransform(gt)
    Pdrv.SetProjection(proj)
    Pband = Pdrv.GetRasterBand(1)
    Pband.SetNoDataValue(0)

    # Predict by block
    # Message
    if verbose:
        print("Predict deforestation probability by block")
    # Loop on blocks of data
    for b in range(nblock):
        # Position in 1D-arrays
        px = b % nblock_x
        py = b // nblock_x
        # Number of pixels
        npix = nx[px] * ny[py]
        # Data for one block of the stack (shape = (nband,nrow,ncol))
        data = stack.ReadAsArray(x[px], y[py], nx[px], ny[py])
        data = data.astype(float)
        if data.ndim == 2:
            # A single-band stack is read as a 2D array; restore the band axis.
            data = data[np.newaxis, :, :]
        # Replace ND values with -9999
        for i in range(nband):
            data[i][np.nonzero(data[i] == bandND[i])] = -9999
        # Forest mask
        fmaskA = fmaskB.ReadAsArray(x[px], y[py], nx[px], ny[py])
        fmaskA = fmaskA.astype(float)  # From uint to float
        fmaskA[np.nonzero(fmaskA != 1)] = -9999
        fmaskA = fmaskA[np.newaxis, :, :]
        # Concatenate forest mask with stack
        data = np.concatenate((data, fmaskA), axis=0)
        # Transpose and reshape to 2D array
        data = data.transpose(1, 2, 0)
        data = data.reshape(npix, nband + 1)
        # Observations without NA
        w = np.nonzero(~(data == -9999).any(axis=1))
        # Remove observations with NA
        data = data[w]
        # Transform into a pandas DataFrame
        df = pd.DataFrame(data)
        df.columns = var_names
        # Add fake cell column for _x_design_info
        df["cell"] = 0
        # Predict
        pred = np.zeros(npix)  # Initialize with nodata value (0)
        if len(w[0]) > 0:
            # Get predictions into an array; rho is the second-to-last column
            # (the last one is the forest mask).
            p = predict_binomial_iCAR(
                model_betas, _x_design_info, new_data=df, rhos=data[:, -2]
            )
            # Rescale and return to pred
            pred[w] = rescale(p)
        # Assign prediction to raster
        pred = pred.reshape(ny[py], nx[px])
        Pband.WriteArray(pred, x[px], y[py])

    # Compute statistics
    if verbose:
        print("Compute statistics")
    Pband.FlushCache()  # Write cache data to disk
    Pband.ComputeStatistics(False)

    # Dereference driver
    Pband = None
    del Pdrv

    # Release the input datasets and free the in-memory VRT, which would
    # otherwise stay in /vsimem for the lifetime of the kernel.
    fmaskB = None
    fmaskR = None
    stack = None
    gdal.Unlink(vrt_file)


# End


In [None]:
def get_trained_model(period_dictionaries, period, model_name):
    """
    Return the path of the model file `model_name` inside the folder of the
    training period associated with `period`.
    """
    training_period = period_dictionaries[period]["train_period"]
    return icar_model_folder / training_period / model_name


In [None]:
def get_rho_file(period_dictionaries, period):
    """
    Return, as a string, the path of the interpolated rho raster stored in the
    folder of the training period associated with `period`.
    """
    training_period = period_dictionaries[period]["train_period"]
    rho_name = f"rho_{model_identifier_name}.tif"
    return str(icar_model_folder / training_period / rho_name)


In [None]:
import pickle
from patsy import dmatrices
import forestatrisk
from component.script.far_helpers import get_design_info


def apply_icar_period(
    period_dictionaries,
    period,
    model,
):
    """
    Apply a trained iCAR model to one period.

    Writes the prediction raster to ``icar_model_folder / period`` and then
    computes the deforestation rate per category with forestatrisk, saved as a
    CSV next to it. Nothing is returned.

    Parameters:
        period_dictionaries: Mapping of period name to period dictionary.
        period: Period to predict for.
        model: Path to the pickled model data.
    """
    period_dictionary = period_dictionaries[period]
    period_label = period_dictionary["period"]

    period_output_folder = icar_model_folder / period
    period_output_folder.mkdir(parents=True, exist_ok=True)

    prediction_output = (
        period_output_folder / f"icar_{period_label}_{model_identifier_name}.tif"
    )

    # Load the model data to recover the formula and the training samples
    with open(model, "rb") as handle:
        model_f = pickle.load(handle)
    formula = model_f.get("formula")
    samples = model_f.get("samples_path")

    input_cell_raster = get_rho_file(period_dictionaries, period)

    # Only the predictor-side design info is needed for prediction
    _, x_design_info = get_design_info(formula, samples)
    time_interval = period_dictionary["time_interval"]

    predict_raster_icar(
        model=model,
        _x_design_info=x_design_info,
        period_dict=period_dictionary,
        input_cell_raster=input_cell_raster,
        output_file=prediction_output,
        blk_rows=128,
        verbose=True,
    )

    # Deforestation rate per category
    print("Calculate deforestation rate per cathegory")
    defrate_output = str(
        period_output_folder
        / f"defrate_cat_icar_{period_label}_{model_identifier_name}.csv"
    )
    forestatrisk.defrate_per_cat(
        forest_change_file,
        str(prediction_output),
        time_interval,
        period,
        defrate_output,
        128,
        False,
    )


In [None]:
# Predict over the calibration period
period_c = "calibration"

model = get_trained_model(
    period_dictionaries=period_dictionaries,
    period=period_c,
    model_name=f"icar_model_{model_identifier_name}.pickle",
)

icar_predict_calibration = apply_icar_period(
    period_dictionaries=period_dictionaries,
    period=period_c,
    model=model,
)


In [None]:
# Predict over the validation period
period_c = "validation"

model = get_trained_model(
    period_dictionaries=period_dictionaries,
    period=period_c,
    model_name=f"icar_model_{model_identifier_name}.pickle",
)

icar_predict_validation = apply_icar_period(
    period_dictionaries=period_dictionaries,
    period=period_c,
    model=model,
)


In [None]:
# Predict over the historical period
period_c = "historical"

model = get_trained_model(
    period_dictionaries=period_dictionaries,
    period=period_c,
    model_name=f"icar_model_{model_identifier_name}.pickle",
)

icar_predict_historical = apply_icar_period(
    period_dictionaries=period_dictionaries,
    period=period_c,
    model=model,
)


In [None]:
# Predict over the forecast period
period_c = "forecast"

model = get_trained_model(
    period_dictionaries=period_dictionaries,
    period=period_c,
    model_name=f"icar_model_{model_identifier_name}.pickle",
)

icar_predict_forecast = apply_icar_period(
    period_dictionaries=period_dictionaries,
    period=period_c,
    model=model,
)
