In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
# %load_ext cudf.pandas
# import pandas as pd
# print(pd)


In [3]:
# import cuml
# cuml.accel.install()


In [4]:
# Optimizations
# GDAL optimizations: the environment variables below are set before
# `osgeo.gdal` is imported further down in the notebook.
import multiprocessing as mp
import os
import sys

# Make the parent folder importable (needed for the `component.*` imports).
# Guarded so that re-running the cell does not keep growing sys.path.
if ".." not in sys.path:
    sys.path.append("..")

cpu_count: int = mp.cpu_count()
# Leave two cores free for the rest of the system, but never go below one
# thread: on 1-2 core machines the plain subtraction would give 0 or -1.
num_cores: int = max(1, cpu_count - 2)
os.environ["GDAL_NUM_THREADS"] = f"{num_cores}"
# GDAL reads small GDAL_CACHEMAX values (< 100000) as megabytes.
os.environ["GDAL_CACHEMAX"] = "1024"


## Libraries

In [5]:
from pathlib import Path
import numpy as np
import pandas as pd
import riskmapjnr as rmj
from patsy import dmatrices


In [6]:
# Add root to path
import sys

sys.path.append("..")
from component.script.utilities.file_filter import (
    list_files_by_extension,
    filter_files_by_keywords,
    filter_files_by_keywords_strict,
)


## Set user parameters

In [7]:
# Project name; used below as the name of the project sub-folder inside the data folder.
project_name = "test"


In [8]:
# Forest data source identifier; options noted by the author: "gfc", "tmf".
forest_source = "gfc"
# NOTE(review): presumably the tree-cover percentage threshold used when the
# forest rasters were produced (it appears in the raster file names) - confirm.
tree_cover_threshold = 10
# Reference years, in chronological order. create_full_period_dict requires at
# least three: calibration spans years[0]-years[1], validation spans
# years[1]-years[2], historical and forecast span years[0]-years[2].
years = [2015, 2020, 2024]


In [9]:
# Variables looked up by name only (one raster regardless of the period).
static_variables = ["altitude", "slope", "pa", "subj", "dist_rivers", "dist_roads"]
# Variables looked up by name plus the year(s) of the period being processed.
dynamic_variables = [
    "forest",
    "deforestation",
    "forest_edge",
    "dist_towns",
]


In [10]:
# Suffix used in the names of the saved model pickle, prediction raster and
# deforestation-rate table.
model_identifier_name = "v1"
# Seed passed to the model training helper for reproducibility.
random_seed = 1


## Connect folders

In [11]:
# Root folder: the parent of the current working directory.
root_folder: Path = Path.cwd().parent
# Top-level data folder (created if it does not exist yet).
downloads_folder: Path = root_folder / "data"
downloads_folder.mkdir(parents=True, exist_ok=True)


In [12]:
# Per-project folder tree; every folder is created if it does not exist yet.
project_folder = downloads_folder / project_name
project_folder.mkdir(parents=True, exist_ok=True)
# Input rasters (searched for .tif/.tiff files further down).
processed_data_folder = project_folder / "data"
processed_data_folder.mkdir(parents=True, exist_ok=True)
# Sample files, organised in one sub-folder per training period.
sampling_folder = project_folder / "far_samples"
sampling_folder.mkdir(parents=True, exist_ok=True)
# Trained GLM pickles and prediction outputs, one sub-folder per period.
glm_model_folder = project_folder / "far_glm"
glm_model_folder.mkdir(parents=True, exist_ok=True)


## Helper functions

In [13]:
def list_files_by_extension(folder_path, file_extensions, recursive=False):
    """
    List the files of a folder whose name ends with one of the given extensions.

    Note: this definition shadows the ``list_files_by_extension`` imported
    from ``component.script.utilities.file_filter`` earlier in the notebook.

    Parameters:
        folder_path: Folder to search (str or Path).
        file_extensions: Iterable of extensions, with or without the leading
            dot (".tif" and "tif" are treated the same).
        recursive (bool): When True, sub-folders are searched as well.

    Returns:
        list[Path]: Matching paths, grouped by extension in the order given.
        Anything located under a ".ipynb_checkpoints" folder is excluded.
    """
    folder = Path(folder_path)
    search = folder.rglob if recursive else folder.glob

    files = []
    for ext in file_extensions:
        # Build the same "*.ext" pattern for both modes. The recursive branch
        # previously produced "*..tif" for dotted extensions and matched nothing.
        suffix = ext if ext.startswith(".") else f".{ext}"
        files.extend(search(f"*{suffix}"))

    return [f for f in files if ".ipynb_checkpoints" not in f.parts]


## Select forest cover change file

In [14]:
# List all raster files in the processed data folder
input_raster_files = list_files_by_extension(processed_data_folder, [".tiff", ".tif"])
# Keep the rasters selected by the "defostack" keyword and fail with a clear
# message when there is none, instead of an opaque IndexError on `[0]`.
defostack_candidates = filter_files_by_keywords(input_raster_files, ["defostack"])
if not defostack_candidates:
    raise FileNotFoundError(
        f"No 'defostack' raster (.tif/.tiff) found in {processed_data_folder}"
    )
forest_change_file = defostack_candidates[0]
forest_change_file


PosixPath('/home/jose/workspace/deforisk-jupyter-nb-v2/data/test/data/test_defostack_gfc_10_2015_2020_2024_reprojected.tif')

## Period dictionaries

In [15]:
import re
from pathlib import Path


def create_full_period_dict(
    years: list[int],
    period: str,
    processed_data_folder: Path,
    static_variables: list[str],
    dynamic_variables: list[str],
):
    """
    Build the configuration dictionary for one modelling period.

    The result combines the period settings (years, deforestation value,
    time interval) with the raster file selected for every requested variable.

    Parameters:
        years: At least three years; indices 0, 1 and 2 are used.
        period: One of "calibration", "validation", "historical", "forecast".
        processed_data_folder: Folder searched (non-recursively) for
            .tif/.tiff rasters.
        static_variables: Variable names looked up by name only.
        dynamic_variables: Variable names looked up by name plus the year(s)
            of the period.

    Returns:
        dict with the keys "period", "train_period", "initial_year",
        "final_year", "defor_value", "time_interval", "var_year", plus one key
        per variable name whose value is the selected file, or None when no
        file was selected.

    Raises:
        ValueError: If fewer than three years are given or `period` is unknown.
    """

    if len(years) < 3:
        raise ValueError("The 'years' list must contain at least three elements.")

    # Per-period settings. The *_idx entries are positions in `years`;
    # "train_period" names the period whose samples/model are used.
    configs = {
        "calibration": {
            "train_period": "calibration",
            "initial_idx": 0,
            "final_idx": 1,
            "defor_value": 1,
            "var_idx": 0,
        },
        "validation": {
            "train_period": "calibration",
            "initial_idx": 1,
            "final_idx": 2,
            "defor_value": 1,
            "var_idx": 1,
        },
        "historical": {
            "train_period": "historical",
            "initial_idx": 0,
            "final_idx": 2,
            "defor_value": [1, 2],
            "var_idx": 0,
        },
        "forecast": {
            "train_period": "historical",
            "initial_idx": 0,
            "final_idx": 2,
            "defor_value": [1, 2],
            "var_idx": 2,
        },
    }

    if period not in configs:
        raise ValueError(f"Unknown period '{period}'. Must be one of: {list(configs)}.")

    c = configs[period]

    # --- Base period dictionary ---
    period_dict = {
        "period": period,
        "train_period": c["train_period"],
        "initial_year": years[c["initial_idx"]],
        "final_year": years[c["final_idx"]],
        "defor_value": c["defor_value"],
        "time_interval": years[c["final_idx"]] - years[c["initial_idx"]],
        "var_year": years[c["var_idx"]],
    }

    # Years as strings, because they are matched against file names.
    initial_year = str(period_dict["initial_year"])
    final_year = str(period_dict["final_year"])
    var_year = str(period_dict["var_year"])
    period_name = str(period_dict["period"])

    variable_file_mapping = {"period": period}
    input_raster_files = list_files_by_extension(
        processed_data_folder, [".tiff", ".tif"]
    )

    # --- Modular file search ---
    def _is_token_separate_in_name(token: str, name: str) -> bool:
        """
        Return True when `token` occurs in `name` as a separate word, i.e. not
        glued to a letter or digit on either side ("_" counts as a separator).
        Purely numeric tokens (years) are matched as plain substrings.
        """
        if token.isdigit():  # years or other numbers: plain substring search
            return token in name
        # Regex ensuring the token is not attached to letters or digits.
        pattern = rf"(?<![0-9A-Za-z]){re.escape(token)}(?![0-9A-Za-z])"
        return re.search(pattern, name) is not None

    def _strict_candidate_filter(candidates, tokens):
        """
        Keep only the candidates whose lower-cased path contains every token
        as a separate word (see _is_token_separate_in_name).
        """
        filtered = []
        for p in candidates:
            s = str(p).lower()
            if all(_is_token_separate_in_name(tok.lower(), s) for tok in tokens):
                filtered.append(p)
        return filtered

    def find_file(var_name, dynamic=False):
        """
        Select the raster file for `var_name`, or return None.

        For dynamic variables the search terms also include the year(s) of
        the period.
        """
        parts = var_name.split("_")
        # Single-word names must not pick up the derived distance/edge rasters.
        if len(parts) == 1:
            exclude_terms = ["distance", "edge"]
        else:
            exclude_terms = None

        if dynamic:
            if period_name != "forecast":
                # Deforestation files carry both years of the period; other
                # dynamic variables only the initial year.
                if "deforestation" in parts:
                    include_terms = [*parts, initial_year, final_year]
                else:
                    include_terms = [*parts, initial_year]
            else:
                include_terms = [*parts, var_year]
        else:
            # NOTE(review): `include_terms` aliases `parts` here, so the
            # "distance" term appended below is also used by the strict
            # filter for static variables (but not for dynamic ones).
            # Kept as is - confirm whether this asymmetry is intended.
            include_terms = parts

        # "dist_*" variables: also require the word "distance".
        if "dist" in parts and "distance" not in include_terms:
            include_terms.append("distance")

        # NOTE(review): the positional flags (False, ..., True) are options of
        # filter_files_by_keywords, which is defined outside this notebook.
        files = filter_files_by_keywords(
            input_raster_files, include_terms, False, exclude_terms, True
        )
        # Forecast fallback: retry with the second year when nothing matched.
        if not files and period_name == "forecast":
            include_terms = [*parts, str(years[1])]
            files = filter_files_by_keywords(
                input_raster_files, include_terms, False, exclude_terms, True
            )
        if not files:
            return None

        # A single match is unambiguous.
        if len(files) == 1:
            return files[0]
        strict = _strict_candidate_filter(files, parts)
        if strict:
            # Several may remain; take the first one (heuristic).
            return strict[0]
        # Ambiguous matches and none survived the strict filter.
        return None

    # --- Period-independent variables ---
    for var in static_variables:
        variable_file_mapping[var] = find_file(var, dynamic=False)

    # --- Multi-temporal variables ---
    for var in dynamic_variables:
        variable_file_mapping[var] = find_file(var, dynamic=True)

    # --- Final merge ---
    period_dict.update(variable_file_mapping)
    return period_dict


In [16]:
# Build the four period dictionaries with identical inputs; only the period
# name changes. They are created in this order: calibration, validation,
# historical, forecast.
calibration_dict, validation_dict, historical_dict, forecast_dict = (
    create_full_period_dict(
        years,
        period_label,
        processed_data_folder,
        static_variables,
        dynamic_variables,
    )
    for period_label in ("calibration", "validation", "historical", "forecast")
)


In [17]:
# Main lookup table: period name -> period dictionary
period_dictionaries = {
    one_period["period"]: one_period
    for one_period in (
        calibration_dict,
        validation_dict,
        historical_dict,
        forecast_dict,
    )
}


## Training formula


In [18]:
def generate_formula(
    dependent_variable: str,
    independent_variables_continuous: list[str] | None = None,
    independent_variables_categorical: list[str] | None = None,
) -> str:
    """
    Generate a regression formula string with scaled continuous variables
    and categorical variables using Patsy-style syntax.

    Example:
        generate_formula("y", ["age", "weight"], ["sex", "breed"])
        -> "I(y) + trial ~ scale(age) + scale(weight) + C(sex) + C(breed)"
    """
    independent_variables_continuous = independent_variables_continuous or []
    independent_variables_categorical = independent_variables_categorical or []

    parts = []
    if independent_variables_continuous:
        parts += [f"scale({x})" for x in independent_variables_continuous]
    if independent_variables_categorical:
        parts += [f"C({x})" for x in independent_variables_categorical]

    rhs = " + ".join(parts) if parts else "1"  # intercept-only model if empty
    formula = f"I({dependent_variable}) + trial ~ {rhs}"
    return formula


In [19]:
# Optional hand-written Patsy formula. Leave it as None to use the formula
# generated from the variable lists in the next cells. Example:
# user_formula = "I(1-fcc) + trial ~ scale(altitude) + scale(dist_edge) + scale(dist_river) + scale(dist_road) + scale(dist_town) + scale(slope) + C(pa)"
user_formula = None


In [20]:
from component.script.far_helpers import generate_patsy_formula


# Response expression passed to the formula helper.
dependant_variable = "1-deforestation"
# Predictors passed as continuous variables.
independent_variables_continuous = [
    "altitude",
    "forest_edge",
    "dist_rivers",
    "dist_roads",
    "dist_towns",
    "slope",
]
# Predictors passed as categorical variables.
independant_variable_categorical = ["pa"]

# Formula built from the lists above; used unless `user_formula` is set.
calculated_formula = generate_patsy_formula(
    dependant_variable,
    independent_variables_continuous,
    independant_variable_categorical,
)


In [21]:
# A user-supplied formula takes precedence over the generated one.
training_formula = calculated_formula if user_formula is None else user_formula
training_formula


'I(1-deforestation) + trial ~ scale(altitude) + scale(forest_edge) + scale(dist_rivers) + scale(dist_roads) + scale(dist_towns) + scale(slope) + C(pa)'

## Train GLM for each training period

In [22]:
import pickle

import pandas as pd
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from component.script.far_helpers import extract_variables


def train_glm_from_formula(
    formula: str,
    dataset_file: str,
    out_file: str = "glm_model.pickle",
    random_state: int = 42,
    solver: str = "lbfgs",
    max_iter: int = 1000,
):
    """
    Train a logistic regression model from a formula and a text file dataset.

    Preprocessing:
        - Reads CSV file
        - Drops rows with a missing value in any column
        - Adds 'trial' = 1
        - Validates that the columns used in the formula exist
        - Filters dataset to only include variables used in the formula

    Parameters:
        formula (str): Patsy-style formula (e.g., 'target ~ var1 + C(var2)')
        dataset_file (str): Path to input text file (CSV format)
        out_file (str): Path where the result dictionary is saved with pickle
        random_state (int): Random seed for reproducibility
        solver (str): Solver for LogisticRegression
        max_iter (int): Maximum iterations

    Returns:
        dict: Dictionary with the keys "model", "predictions", "deviance",
        "formula", "dataset_shape" and "samples_path" (the same dictionary
        that is written to `out_file`).

    Raises:
        ValueError: If the file cannot be read, the dataset is empty (before
            or after dropping missing values), or a required column is missing.
    """
    # Read the dataset from the text file
    print(f"📊 Loading data from {dataset_file}")
    try:
        dataset = pd.read_csv(dataset_file)
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise ValueError(f"Failed to read dataset file: {e}") from e

    if dataset.empty:
        raise ValueError("Dataset is empty after loading.")

    # Apply required preprocessing
    print("🧹 Preprocessing data: dropping missing values and adding 'trial' column...")
    dataset = dataset.dropna(axis=0)  # Drop any rows with NA
    if dataset.empty:
        # Fail here with a clear message rather than deep inside patsy/sklearn.
        raise ValueError("No rows left after dropping missing values.")
    dataset["trial"] = 1  # Add trial column as 1

    # Extract raw variable names used in the formula (ignoring I(), scale(), C())
    raw_variables = extract_variables(formula, "all")

    # `trial` appears on the left-hand side of the formula, so it must be kept
    required_vars = raw_variables | {"trial"}

    # Check which required variables are missing from dataset
    missing_vars = sorted(var for var in required_vars if var not in dataset.columns)

    if missing_vars:
        raise ValueError(f"Missing columns in dataset: {missing_vars}")

    # Keep only the relevant columns. Sorted so the column order (and the log
    # line below) is the same on every run instead of following set ordering.
    dataset = dataset[sorted(required_vars)]

    print(
        f"💾 Filtered dataset to {len(dataset.columns)} variables: {list(dataset.columns)}"
    )
    # Ensure consistent preprocessing
    print(formula, len(dataset))
    y, x = dmatrices(formula, data=dataset, NA_action="drop")
    # Debug: Confirm alignment
    if len(y) != len(x):
        raise ValueError(
            f"Inconsistent sample sizes after dmatrices: Y={len(y)}, X={len(x)}"
        )

    # Only the first response column is used as the target.
    Y = y[:, 0]
    X = x

    print(f"✅ Data aligned: {len(Y)} samples for training")

    # Fit GLM (Logistic Regression)
    model = LogisticRegression(
        solver=solver, max_iter=max_iter, random_state=random_state, n_jobs=-1
    )
    model.fit(X, Y)

    # Predictions
    pred_proba = model.predict_proba(X)[:, 1]

    # Compute deviance (twice the log loss)
    deviance = 2 * log_loss(Y, pred_proba, normalize=False)

    # Model plus metadata; "samples_path" lets later steps locate the samples
    model_data = {
        "model": model,
        "predictions": pred_proba,
        "deviance": deviance,
        "formula": formula,
        "dataset_shape": dataset.shape,
        "samples_path": dataset_file,
    }

    # Save model with pickle
    with open(out_file, "wb") as file:
        pickle.dump(model_data, file)

    print(f"✅ GLM trained and saved to: {out_file}")

    return model_data


In [23]:
def get_samples_for_period(period, sample_name: str = "sample.txt"):
    """
    Return the path of the samples file to use for `period`.

    The file is looked up under the folder of the period's *training* period
    (the "train_period" entry of `period_dictionaries[period]`), inside
    `sampling_folder`.
    """
    training_period = period_dictionaries[period]["train_period"]
    return sampling_folder / training_period / sample_name


In [24]:
def train_glm_period(formula, period, sample_path, random_seed, model_id):
    """
    Train the GLM for one period and return the path of the saved model.

    Parameters:
        formula: Patsy-style formula passed to the trainer.
        period: Period name; used as the output sub-folder of `glm_model_folder`.
        sample_path: Path of the samples file to train on.
        random_seed: Seed forwarded to the trainer.
        model_id: Identifier inserted in the pickle file name.

    Returns:
        Path: `<glm_model_folder>/<period>/glm_model_<model_id>.pickle`
    """
    # Create period folder
    period_output_folder = glm_model_folder / period
    period_output_folder.mkdir(parents=True, exist_ok=True)
    # Set output file
    model_output = period_output_folder / f"glm_model_{model_id}.pickle"
    # Train GLM. The returned result dictionary is not needed here: the
    # trainer already wrote it to `model_output`.
    train_glm_from_formula(formula, sample_path, model_output, random_seed)
    return model_output


In [25]:
# Train the GLM on the calibration period
period_t = "calibration"

# Samples file of the training period associated with `period_t`
samples = get_samples_for_period(period_t, "sample.txt")

# train_glm_period returns the path of the saved model pickle
glm_trined_calibration = train_glm_period(
    training_formula, period_t, samples, random_seed, model_identifier_name
)


📊 Loading data from /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/far_samples/calibration/sample.txt
🧹 Preprocessing data: dropping missing values and adding 'trial' column...
💾 Filtered dataset to 9 variables: ['dist_rivers', 'forest_edge', 'altitude', 'dist_roads', 'slope', 'trial', 'deforestation', 'pa', 'dist_towns']
I(1-deforestation) + trial ~ scale(altitude) + scale(forest_edge) + scale(dist_rivers) + scale(dist_roads) + scale(dist_towns) + scale(slope) + C(pa) 19980


✅ Data aligned: 19980 samples for training
✅ GLM trained and saved to: /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/far_glm/calibration/glm_model_v1.pickle


In [26]:
# Train the GLM on the historical period
period_t = "historical"

# Samples file of the training period associated with `period_t`
samples = get_samples_for_period(period_t, "sample.txt")

# train_glm_period returns the path of the saved model pickle
glm_trined_historical = train_glm_period(
    training_formula, period_t, samples, random_seed, model_identifier_name
)


📊 Loading data from /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/far_samples/historical/sample.txt
🧹 Preprocessing data: dropping missing values and adding 'trial' column...
💾 Filtered dataset to 9 variables: ['dist_rivers', 'forest_edge', 'altitude', 'dist_roads', 'slope', 'trial', 'deforestation', 'pa', 'dist_towns']
I(1-deforestation) + trial ~ scale(altitude) + scale(forest_edge) + scale(dist_rivers) + scale(dist_roads) + scale(dist_towns) + scale(slope) + C(pa) 19988
✅ Data aligned: 19988 samples for training
✅ GLM trained and saved to: /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/far_glm/historical/glm_model_v1.pickle


## Apply trained GLM model

In [27]:
# Standard library imports
import os
import sys
import uuid

# Third party imports
import numpy as np
from osgeo import gdal
import pandas as pd
from patsy.build import build_design_matrices

# Local application imports
from forestatrisk.misc import rescale, makeblock


# predict_raster
def predict_raster(
    model_pickle,
    _x_design_info,
    period_dict="data",
    output_file="predictions.tif",
    blk_rows=128,
    verbose=True,
):
    """Predict the spatial probability of deforestation from a
    statistical model.

    This function predicts the spatial probability of deforestation
    from a pickled statistical model. Computation are done by block and
    can be performed on large geographical areas.

    :param model_pickle: Path to a pickle file holding a dictionary
        with at least the keys ``"model"`` (object with a
        ``predict_proba()`` method) and ``"formula"``. Unpickling can
        execute arbitrary code: only load files produced by this
        workflow.
    :param _x_design_info: Design matrix information from patsy.
    :param period_dict: Dictionary mapping ``"forest"`` and every
        predictor name of the formula to a raster file. The string
        default is kept for signature compatibility only and is not
        usable; always pass a dictionary.
    :param output_file: Name of the output raster file for predictions.
    :param blk_rows: If > 0, number of rows for computation by block.
    :param verbose: Logical. Whether to print messages or not. Default
        to ``True``.

    The process exits (``sys.exit(1)``) when one of the input rasters
    has no usable NoData value.

    """
    # Read model and extract data
    model_pickle = pd.read_pickle(model_pickle)
    model = model_pickle.get("model")
    formula = model_pickle.get("formula")
    predictors_variable = sorted(extract_variables(formula, "predictors"))

    # Raster files in the same (sorted) order as the predictor names, so
    # that VRT band k corresponds to predictors_variable[k]
    raster_list = [period_dict[key] for key in predictors_variable]

    # Get forest layer from period dictionary
    input_forest_raster = period_dict["forest"]

    # Mask on forest
    if verbose:
        print(f"Using {input_forest_raster} file")
    fmaskR = gdal.Open(input_forest_raster)
    fmaskB = fmaskR.GetRasterBand(1)

    # Landscape variables from forest raster
    gt = fmaskR.GetGeoTransform()
    ncol = fmaskR.RasterXSize
    nrow = fmaskR.RasterYSize
    Xmin = gt[0]
    Xmax = gt[0] + gt[1] * ncol
    Ymin = gt[3] + gt[5] * nrow
    Ymax = gt[3]

    # Column names of the per-block table: predictors, then pixel-centre
    # coordinates and the forest mask
    var_names = [*predictors_variable, "X", "Y", "fmask"]

    # Make vrt with gdalbuildvrt
    if verbose:
        print("Make virtual raster with variables as raster bands")
    param = gdal.BuildVRTOptions(
        resolution="user",
        outputBounds=(Xmin, Ymin, Xmax, Ymax),
        xRes=gt[1],
        yRes=-gt[5],
        separate=True,
    )
    rand_uuid = uuid.uuid4()
    vrt_file = f"/vsimem/var_{rand_uuid}.vrt"
    cback = gdal.TermProgress_nocb if verbose else 0
    gdal.BuildVRT(vrt_file, raster_list, options=param, callback=cback)
    stack = gdal.Open(vrt_file)
    nband = stack.RasterCount
    proj = stack.GetProjection()

    # List of nodata values
    bandND = np.zeros(nband)
    for k in range(nband):
        # Check the value before storing it: once written into the float
        # array, a missing value can no longer be detected with `is None` /
        # `is np.nan`, so the previous check could never trigger.
        nodata = stack.GetRasterBand(k + 1).GetNoDataValue()
        if nodata is None or np.isnan(nodata):
            print(f"NoData value is not specified for input raster file {k}")
            sys.exit(1)
        bandND[k] = nodata
    bandND = bandND.astype(np.float32)

    # Make blocks
    blockinfo = makeblock(vrt_file, blk_rows=blk_rows)
    nblock = blockinfo[0]
    nblock_x = blockinfo[1]
    x = blockinfo[3]
    y = blockinfo[4]
    nx = blockinfo[5]
    ny = blockinfo[6]
    if verbose:
        print(f"Divide region in {nblock} blocks")

    # Raster of predictions
    if verbose:
        print("Create a raster file on disk for projections")
    driver = gdal.GetDriverByName("GTiff")
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass
    Pdrv = driver.Create(
        output_file,
        ncol,
        nrow,
        1,
        gdal.GDT_UInt16,
        ["COMPRESS=DEFLATE", "PREDICTOR=2", "BIGTIFF=YES"],
    )
    Pdrv.SetGeoTransform(gt)
    Pdrv.SetProjection(proj)
    Pband = Pdrv.GetRasterBand(1)
    Pband.SetNoDataValue(0)

    # Predict by block
    # Message
    if verbose:
        print("Predict deforestation probability by block")
    # Loop on blocks of data
    for b in range(nblock):
        # Position in 1D-arrays
        px = b % nblock_x
        py = b // nblock_x
        # Number of pixels
        npix = nx[px] * ny[py]
        # Data for one block of the stack (shape = (nband, nrow, ncol))
        data = stack.ReadAsArray(x[px], y[py], nx[px], ny[py])
        data = data.astype(float)  # From uint to float
        # Add a dimension if there is only one variable. This must happen
        # before the NoData replacement: on a 2D array `data[i]` would be a
        # pixel row instead of a band.
        if len(data.shape) == 2:
            data = data[np.newaxis, :, :]
        # Replace ND values with -9999
        for i in range(nband):
            data[i][np.nonzero(data[i] == bandND[i])] = -9999
        # Coordinates of the center of the pixels of the block
        X_col = (
            gt[0] + x[px] * gt[1] + (np.arange(nx[px]) + 0.5) * gt[1]
        )  # +0.5 for center of pixels
        X = np.repeat(X_col[np.newaxis, :], ny[py], axis=0)
        X = X[np.newaxis, :, :]
        Y_row = (
            gt[3] + y[py] * gt[5] + (np.arange(ny[py]) + 0.5) * gt[5]
        )  # +0.5 for center of pixels
        Y = np.repeat(Y_row[:, np.newaxis], nx[px], axis=1)
        Y = Y[np.newaxis, :, :]
        # Forest mask: every pixel whose value is not 1 is flagged as NA
        fmaskA = fmaskB.ReadAsArray(x[px], y[py], nx[px], ny[py])
        fmaskA = fmaskA.astype(float)  # From uint to float
        fmaskA[np.nonzero(fmaskA != 1)] = -9999
        fmaskA = fmaskA[np.newaxis, :, :]
        # Concatenate forest mask with stack
        data = np.concatenate((data, X, Y, fmaskA), axis=0)
        # Transpose and reshape to 2D array (one row per pixel)
        data = data.transpose(1, 2, 0)
        data = data.reshape(npix, nband + 3)
        # Observations without NA
        w = np.nonzero(~(data == -9999).any(axis=1))
        # Remove observations with NA
        data = data[w]
        # Transform into a pandas DataFrame
        df = pd.DataFrame(data)
        df.columns = var_names
        # Add fake cell column for _x_design_info
        df["cell"] = 0
        # Predict
        pred = np.zeros(npix)  # Initialize with nodata value (0)
        if len(w[0]) > 0:
            # Get X
            (x_new,) = build_design_matrices([_x_design_info], df)
            X_new = x_new
            # Get predictions into an array
            p = model.predict_proba(X_new)[:, 1]
            # Rescale (forestatrisk.misc.rescale) and store at the positions
            # of the valid pixels
            pred[w] = rescale(p)
        # Assign prediction to raster
        pred = pred.reshape(ny[py], nx[px])
        Pband.WriteArray(pred, x[px], y[py])

    # Compute statistics
    if verbose:
        print("Compute statistics")
    Pband.FlushCache()  # Write cache data to disk
    Pband.ComputeStatistics(False)

    # Dereference driver
    Pband = None
    del Pdrv

    # Release the input datasets, then delete the in-memory VRT so that
    # repeated calls do not accumulate files under /vsimem
    stack = None
    fmaskB = None
    fmaskR = None
    gdal.Unlink(vrt_file)


# End


In [28]:
import pickle
from patsy import dmatrices
import forestatrisk
from component.script.far_helpers import get_design_info


def apply_glm_period(
    period_dictionaries,
    period,
    model,
):
    """
    Apply a trained GLM to one period and compute deforestation rates.

    Writes, inside `<glm_model_folder>/<period>/`:
        - `glm_<period>_<model_identifier_name>.tif`: prediction raster
        - `defrate_cat_glm_<period>_<model_identifier_name>.csv`: table
          produced by `forestatrisk.defrate_per_cat`

    Parameters:
        period_dictionaries: Mapping of period name to period dictionary
            (must provide "time_interval", "period", "forest" and the
            predictor rasters for `period`).
        period: Name of the period to predict.
        model: Path to the model pickle; it must contain the keys
            "formula" and "samples_path".

    Returns:
        Path: Location of the prediction raster.
    """
    period_dictionary = period_dictionaries[period]
    period_output_folder = glm_model_folder / period
    period_output_folder.mkdir(parents=True, exist_ok=True)
    prediction_output = (
        period_output_folder / f"glm_{period}_{model_identifier_name}.tif"
    )

    # Load model metadata. Unpickling can execute arbitrary code: only load
    # model files produced by this workflow.
    model_f = pd.read_pickle(model)
    formula = model_f.get("formula")
    samples = model_f.get("samples_path")
    # Only the design info of the predictors is needed for prediction.
    _, x_design_info = get_design_info(formula, samples)
    time_interval = period_dictionary["time_interval"]

    predict_raster(
        model,
        x_design_info,
        period_dictionary,
        prediction_output,
        blk_rows=256,
        verbose=True,
    )

    # defrate_per_cat
    print("Calculate deforestation rate per cathegory")
    defrate_output = str(
        period_output_folder
        / f"defrate_cat_glm_{period_dictionary['period']}_{model_identifier_name}.csv"
    )
    forestatrisk.defrate_per_cat(
        forest_change_file,
        str(prediction_output),
        time_interval,
        period,
        defrate_output,
        256,
        False,
    )

    # Callers assign the result of this function, so hand back the raster path
    # instead of an implicit None.
    return prediction_output


In [29]:
def get_trained_model(period_dictionaries, period, model_name):
    """
    Return the path of the trained model file to use for `period`.

    The model is looked up in the sub-folder of `glm_model_folder` named
    after the period's "train_period" entry, not after `period` itself.
    """
    training_period = period_dictionaries[period]["train_period"]
    return glm_model_folder / training_period / model_name


In [30]:
# Predict over the calibration period

period_c = "calibration"


# Model file of the training period associated with `period_c`
model = get_trained_model(
    period_dictionaries, period_c, f"glm_model_{model_identifier_name}.pickle"
)

glm_predict_calibration = apply_glm_period(
    period_dictionaries,
    period_c,
    model,
)


Using /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/data/test_forest_gfc_10_2015_reprojected.tif file
Make virtual raster with variables as raster bands
Divide region in 32 blocks
Create a raster file on disk for projections
Predict deforestation probability by block
Compute statistics
Calculate deforestation rate per cathegory


In [31]:
# Predict over the validation period (uses the model trained on the
# calibration period, as set by "train_period" in the period dictionary)

period_c = "validation"


# Model file of the training period associated with `period_c`
model = get_trained_model(
    period_dictionaries, period_c, f"glm_model_{model_identifier_name}.pickle"
)

glm_predict_validation = apply_glm_period(
    period_dictionaries,
    period_c,
    model,
)


Using /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/data/test_forest_gfc_10_2020_reprojected.tif file
Make virtual raster with variables as raster bands
Divide region in 32 blocks
Create a raster file on disk for projections
Predict deforestation probability by block
Compute statistics
Calculate deforestation rate per cathegory


In [32]:
# Predict over the historical period

period_c = "historical"


# Model file of the training period associated with `period_c`
model = get_trained_model(
    period_dictionaries, period_c, f"glm_model_{model_identifier_name}.pickle"
)

glm_predict_historical = apply_glm_period(
    period_dictionaries,
    period_c,
    model,
)


Using /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/data/test_forest_gfc_10_2015_reprojected.tif file
Make virtual raster with variables as raster bands
Divide region in 32 blocks
Create a raster file on disk for projections
Predict deforestation probability by block
Compute statistics
Calculate deforestation rate per cathegory


In [33]:
# Predict over the forecast period (uses the model trained on the
# historical period, as set by "train_period" in the period dictionary)

period_c = "forecast"


# Model file of the training period associated with `period_c`
model = get_trained_model(
    period_dictionaries, period_c, f"glm_model_{model_identifier_name}.pickle"
)

glm_predict_forecast = apply_glm_period(
    period_dictionaries,
    period_c,
    model,
)


Using /home/jose/workspace/deforisk-jupyter-nb-v2/data/test/data/test_forest_gfc_10_2024_reprojected.tif file
Make virtual raster with variables as raster bands
Divide region in 32 blocks
Create a raster file on disk for projections
Predict deforestation probability by block
Compute statistics
Calculate deforestation rate per cathegory


In [34]:
print("Done!")


Done!
