In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
# %load_ext cudf.pandas
# import pandas as pd
# print(pd)


In [None]:
# import cuml
# cuml.accel.install()


In [None]:
# Optimizations
# GDAL optimizations
import multiprocessing as mp
import os
import sys

# Make the parent directory importable; the local `component` package is
# imported further down in this notebook.
sys.path.append("..")
cpu_count: int = mp.cpu_count()
# Leave two cores free for the rest of the system, but never ask GDAL for
# fewer than one thread (cpu_count - 2 is <= 0 on 1- and 2-core machines).
num_cores: int = max(1, cpu_count - 2)
os.environ["GDAL_NUM_THREADS"] = f"{num_cores}"
# NOTE(review): per the GDAL configuration docs a small value such as 1024 is
# interpreted as megabytes of block cache - confirm for the GDAL version in use.
os.environ["GDAL_CACHEMAX"] = "1024"


## Libraries

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import riskmapjnr as rmj
from tabulate import tabulate
from patsy import dmatrices


## Set user parameters

In [None]:
# Project identifier: used below as the name of the project subfolder created
# inside the data folder.
project_name = "test"


In [None]:
# Forest cover source; the value is matched against raster file names when
# selecting inputs. Values noted by the author: "gfc", "tmf".
forest_source = "gfc"
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
tree_cover_threshold = 10
# The three dates that delimit the modelling periods (see the period
# dictionaries below: years[0], years[1], years[2]).
years = [2015, 2020, 2024]
# Same years as strings, for matching against file names.
string_years = [str(num) for num in years]


In [None]:
# Seed handed to the model training calls below (forwarded as `random_state`).
random_seed = 1


## Connect folders

In [None]:
# The repository root is taken to be the parent of the current working
# directory, so the notebook must be started from a direct subfolder of it.
root_folder: Path = Path.cwd().parent
# All project data lives under <root>/data; create it on first run.
downloads_folder: Path = root_folder / "data"
downloads_folder.mkdir(parents=True, exist_ok=True)


In [None]:
# Project workspace: <data>/<project_name>/ with one subfolder per stage.
project_folder = downloads_folder / project_name
processed_data_folder = project_folder / "data"  # input rasters are listed from here
sampling_folder = project_folder / "far_samples"  # per-period sample tables
glm_model = project_folder / "far_glm"  # trained models and prediction outputs
# Create every folder up front; existing folders are left untouched.
for _folder in (project_folder, processed_data_folder, sampling_folder, glm_model):
    _folder.mkdir(parents=True, exist_ok=True)


## Helper functions

In [None]:
def list_files_by_extension(folder_path, file_extensions, recursive=False):
    """
    List the files in a folder whose names end with one of the given extensions.

    Parameters:
        folder_path (str | Path): Folder to search.
        file_extensions (list): Extensions to look for, with or without the
            leading dot (".tif" and "tif" are equivalent).
        recursive (bool, optional): If True, also search all subfolders.
            Defaults to False.
    Returns:
        list: `Path` objects of the matching files, grouped in the order of
            `file_extensions`. Anything located inside an `.ipynb_checkpoints`
            folder is skipped.
    """
    folder = Path(folder_path)
    search = folder.rglob if recursive else folder.glob

    files = []
    for ext in file_extensions:
        # Normalise once so that both search modes build the same pattern
        # (the recursive mode used to produce "*..tif" for ".tif").
        suffix = ext if ext.startswith(".") else f".{ext}"
        files.extend(search(f"*{suffix}"))

    return [f for f in files if ".ipynb_checkpoints" not in f.parts]

In [None]:
def filter_files(input_files, filter_words, exclude_words=None, include_all=True):
    """
    Filters a list of files based on include and exclude words.

    Matching is a case-insensitive substring test against the file name only
    (the parent folders are ignored).

    Parameters:
        input_files (list): List of file paths to be filtered.
        filter_words (list): Words that must be present in the filenames for inclusion.
        exclude_words (list, optional): Words that must not be present in the filenames.
            Empty strings are ignored (an empty word is a substring of every name
            and would otherwise exclude all files). Defaults to None.
        include_all (bool, optional): If True, all filter words must be present in the
            filename. If False, at least one of the filter words must be present.
            Defaults to True.
    Returns:
        list: Filtered list of files, in their original order.
    """
    # Ensure all words are lowercase for case-insensitive comparison
    wanted = [word.lower() for word in filter_words]
    unwanted = [word.lower() for word in (exclude_words or []) if word]
    # The two modes differ only in how the include words are combined.
    combine = all if include_all else any

    def is_selected(file):
        name = Path(file).name.lower()
        if any(word in name for word in unwanted):
            return False
        return combine(word in name for word in wanted)

    return [file for file in input_files if is_selected(file)]


## Select forest cover change file

In [None]:
# List all GeoTIFF rasters (.tiff / .tif) directly inside the processed data
# folder (subfolders are not searched) and display the list.
input_raster_files = list_files_by_extension(processed_data_folder, [".tiff", ".tif"])
input_raster_files

In [None]:
# Pick the forest cover change raster: its file name must contain "forest",
# "loss", the forest source and every year in `years`, and must not be a
# distance or edge raster.
forest_change_candidates = filter_files(
    input_raster_files,
    ["forest", "loss", forest_source] + string_years,
    ["distance", "edge"],
)
if not forest_change_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No forest loss raster for source '{forest_source}' and years "
        f"{string_years} found in {processed_data_folder}"
    )
forest_change_file = forest_change_candidates[0]
forest_change_file

## Periods dictionaries

In [None]:
# One dictionary per modelling period. Common keys:
#   period        - name of the period (also used for output folder/file names)
#   train_period  - period whose samples and trained model are used
#   initial_year / final_year - bounds of the period, taken from `years`
#   time_interval - length of the period in years
#   defor_value   - NOTE(review): not read anywhere in the visible notebook;
#                   presumably the forest-change raster value(s) that count as
#                   deforestation for the period - confirm.

# Calibration: years[0] -> years[1], trained on its own samples.
calibration_dict = {
    "period": "calibration",
    "train_period": "calibration",
    "initial_year": years[0],
    "final_year": years[1],
    "defor_value": 1,
    "time_interval": years[1] - years[0],
}
# Validation: years[1] -> years[2], predicted with the calibration model.
validation_dict = {
    "period": "validation",
    "train_period": "calibration",
    "initial_year": years[1],
    "final_year": years[2],
    "defor_value": 1,
    "time_interval": years[2] - years[1],
}
# Historical: the full span years[0] -> years[2], trained on its own samples.
historical_dict = {
    "period": "historical",
    "train_period": "historical",
    "initial_year": years[0],
    "final_year": years[2],
    "defor_value": [1, 2],
    "time_interval": years[2] - years[0],
}
# Forecast: same years as the historical period, predicted with the
# historical model.
forecast_dict = {
    "period": "forecast",
    "train_period": "historical",
    "initial_year": years[0],
    "final_year": years[2],
    "defor_value": [1, 2],
    "time_interval": years[2] - years[0],
}


In [None]:
# Build the main dictionary: period name -> period settings.
period_dictionaries = {
    calibration_dict["period"]: calibration_dict,
    validation_dict["period"]: validation_dict,
    historical_dict["period"]: historical_dict,
    forecast_dict["period"]: forecast_dict,
}


## Select training files based on period

In [None]:
def get_fcc_files(input_raster_files, period_dict, period):
    """
    Select the forest cover change ("forest loss") raster of a period.

    The file name must contain the forest source, both years of the period,
    "forest" and "loss"; it must not contain "edge" or any year of the global
    `years` list that does not belong to the period.

    Parameters:
        input_raster_files (list): Candidate raster paths.
        period_dict (dict): Mapping of period name to period settings
            (needs "period", "initial_year" and "final_year").
        period (str): Key of the period in `period_dict`.
    Returns:
        dict: {"period": <period name>, "fcc": <path of the first matching raster>}.
    Raises:
        FileNotFoundError: If no raster matches.
    """
    period_dictionary = period_dict[period]
    initial_year = period_dictionary["initial_year"]
    final_year = period_dictionary["final_year"]
    # One exclude word per unused year. Joining them into a single string would
    # give "" when nothing is excluded (which matches every file name) and
    # "2016, 2018" when several are (which matches none).
    exclude_years = sorted(str(year) for year in set(years) - {initial_year, final_year})
    forest_loss_files = filter_files(
        input_raster_files,
        [forest_source, str(initial_year), str(final_year), "forest", "loss"],
        [*exclude_years, "edge"],
        True,
    )
    if not forest_loss_files:
        raise FileNotFoundError(
            f"No forest loss raster found for period '{period}' "
            f"({initial_year}-{final_year}, source '{forest_source}')"
        )

    # Create a dictionary with variable types as keys and file paths as values
    return {
        "period": period_dictionary["period"],
        "fcc": forest_loss_files[0],
    }


In [None]:
def get_samples_for_period(period, sampling_folder):
    """
    Return the path of the sample table used for a period.

    The folder is chosen from the period's "train_period" entry in the global
    `period_dictionaries`, so a period that reuses another period's model also
    reuses that period's samples.

    Parameters:
        period (str): Key of the period in `period_dictionaries`.
        sampling_folder (Path): Folder holding one subfolder per training period.
    Returns:
        Path: <sampling_folder>/<train_period>/sample.txt
    """
    train_period = period_dictionaries[period]["train_period"]
    return sampling_folder / train_period / "sample.txt"


## Train glm based on period

In [None]:
# Right-hand-side terms of the model formula; they are joined with " + " when
# the formula is built. `scale(...)` and `C(...)` are patsy transforms
# (standardised continuous variable / categorical variable).
variables_for_training = [
    "scale(altitude)",
    "scale(dist_edge)",
    "scale(dist_river)",
    "scale(dist_road)",
    "scale(dist_town)",
    "scale(slope)",
    "C(pa)",
]


In [None]:
import pickle

import pandas as pd
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from component.script.far_helpers import extract_raw_variables


def train_glm_from_formula(
    formula: str,
    dataset_file: str,
    out_file: str = "glm_model.pickle",
    random_state: int = 42,
    solver: str = "lbfgs",
    max_iter: int = 1000,
):
    """
    Train a logistic regression model from a formula and a text file dataset.

    Preprocessing:
        - Reads CSV file
        - Drops rows with missing values (in any column of the file)
        - Adds 'trial' = 1
        - Validates that every variable used in the formula exists
        - Filters dataset to only include variables used in the formula

    Parameters:
        formula (str): Patsy-style formula (e.g., 'target ~ var1 + C(var2)')
        dataset_file (str): Path to input text file (CSV format)
        out_file (str): Path where the result dictionary is saved with pickle
        random_state (int): Random seed for reproducibility
        solver (str): Solver for LogisticRegression
        max_iter (int): Maximum iterations

    Returns:
        dict: Dictionary with keys "model", "predictions", "deviance",
            "formula" and "dataset_shape". The same dictionary is pickled
            to `out_file`.

    Raises:
        ValueError: If the file cannot be read, contains no usable rows,
            lacks a variable used in the formula, or the design matrices
            end up with different lengths.
    """
    # Read the dataset from the text file
    print(f"📊 Loading data from {dataset_file}...")
    try:
        dataset = pd.read_csv(dataset_file)
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise ValueError(f"Failed to read dataset file: {e}") from e

    if dataset.empty:
        raise ValueError("Dataset is empty after loading.")

    # Apply required preprocessing
    print("🧹 Preprocessing data: dropping missing values and adding 'trial' column...")
    dataset = dataset.dropna(axis=0)  # Drop any rows with NA
    if dataset.empty:
        # Without this check the failure would surface later as an obscure
        # patsy / scikit-learn error.
        raise ValueError("Dataset is empty after dropping rows with missing values.")
    dataset["trial"] = 1  # Add trial column as 1

    # Extract raw variable names used in the formula (ignoring I(), scale(), C())
    raw_variables = extract_raw_variables(formula)

    # `trial` appears on the left-hand side of the formula, so it is always required
    required_vars = set(raw_variables) | {"trial"}

    # Check which required variables are missing from dataset
    missing_vars = sorted(var for var in required_vars if var not in dataset.columns)
    if missing_vars:
        raise ValueError(f"Missing columns in dataset: {missing_vars}")

    # Keep only the relevant columns, in a deterministic order
    dataset = dataset[sorted(required_vars)]

    print(
        f"💾 Filtered dataset to {len(dataset.columns)} variables: {list(dataset.columns)}"
    )
    # Ensure consistent preprocessing
    print(formula, len(dataset))
    y, x = dmatrices(formula, data=dataset, NA_action="drop")
    # Confirm alignment of response and predictors
    if len(y) != len(x):
        raise ValueError(
            f"Inconsistent sample sizes after dmatrices: Y={len(y)}, X={len(x)}"
        )

    # Only the first response column is modelled; the second one is `trial`.
    Y = y[:, 0]
    X = x

    print(f"✅ Data aligned: {len(Y)} samples for training")

    # Fit GLM (Logistic Regression)
    # NOTE(review): scikit-learn's LogisticRegression is regularised by default,
    # so this is not an unpenalised binomial GLM - confirm this is intended.
    model = LogisticRegression(
        solver=solver, max_iter=max_iter, random_state=random_state, n_jobs=-1
    )
    model.fit(X, Y)

    # Predicted probability of the positive class on the training data
    pred_proba = model.predict_proba(X)[:, 1]

    # Compute deviance (twice the un-normalised log loss)
    deviance = 2 * log_loss(Y, pred_proba, normalize=False)

    model_data = {
        "model": model,
        "predictions": pred_proba,
        "deviance": deviance,
        "formula": formula,
        "dataset_shape": dataset.shape,
    }

    # Save model and metadata with pickle
    with open(out_file, "wb") as file:
        pickle.dump(model_data, file)

    print(f"✅ GLM trained and saved to: {out_file}")

    return model_data

In [None]:
def train_glm_period(variables, period, sampling_folder, model_folder, random_seed):
    """
    Train and save the GLM of one period.

    Parameters:
        variables (list): Right-hand-side formula terms, joined with " + ".
        period (str): Period name; selects the samples and names the output folder.
        sampling_folder (Path): Folder holding the per-period sample tables.
        model_folder (Path): Folder in which <period>/glm_model.pickle is written.
        random_seed (int): Seed forwarded to the training routine.
    Returns:
        dict: The model data returned by `train_glm_from_formula`.
    """
    # Response is 1 - fcc together with the `trial` column.
    # (The author's alternative was "I(fcc) + trial ~ ".)
    glm_formula = "I(1-fcc) + trial ~ " + " + ".join(variables)
    # Get samples
    samples_path = get_samples_for_period(period, sampling_folder)
    # Create period folder
    period_output_folder = model_folder / period
    period_output_folder.mkdir(parents=True, exist_ok=True)
    # Set output file
    model_output = period_output_folder / "glm_model.pickle"
    # Train GLM and hand the result back to the caller (it used to be dropped,
    # leaving the caller's variable set to None).
    return train_glm_from_formula(
        glm_formula, samples_path, model_output, random_seed
    )


In [None]:
# Train the GLM on the calibration samples; the model is written to
# <glm_model>/calibration/glm_model.pickle.
glm_trined_calibration = train_glm_period(
    variables_for_training, "calibration", sampling_folder, glm_model, random_seed
)


In [None]:
# Train the GLM on the historical samples; the model is written to
# <glm_model>/historical/glm_model.pickle.
glm_trined_historical = train_glm_period(
    variables_for_training, "historical", sampling_folder, glm_model, random_seed
)

## Select files for prediction based on period

In [None]:
def get_period_variable_files(input_raster_files, period_dict, period):
    """
    Map every explanatory variable of a period to its raster file.

    Files are selected by case-insensitive substring matching on their names
    (see `filter_files`); the first match is used for each variable.

    Parameters:
        input_raster_files (list): Candidate raster paths.
        period_dict (dict): Mapping of period name to period settings
            (needs "period", "initial_year" and "final_year").
        period (str): One of 'calibration', 'validation', 'historical', 'forecast'.
    Returns:
        dict: "period" plus one entry per variable: "altitude", "slope", "pa",
            "subj", "dist_river", "dist_road", "dist_town", "fcc", "dist_edge"
            and "forest".
    Raises:
        ValueError: If `period` is not one of the four supported names.
        FileNotFoundError: If any variable has no matching raster.
    """
    period_dictionary = period_dict[period]
    initial_year = str(period_dictionary["initial_year"])
    final_year = str(period_dictionary["final_year"])

    # Forest cover and distance to forest edge are taken at the start of the
    # period, except for the forecast, which starts from the final year.
    if period in ["calibration", "validation", "historical"]:
        forest_year = initial_year
    elif period == "forecast":
        forest_year = final_year
    else:
        raise ValueError(
            f"Invalid period: {period}. Must be 'calibration', 'validation', 'historical', or 'forecast'"
        )

    # One exclude word per year that does not belong to the period. Joining
    # them into a single string would give "" (matches every file name) or
    # "2016, 2018" (matches none).
    exclude_years = sorted(
        str(year)
        for year in set(years)
        - {period_dictionary["initial_year"], period_dictionary["final_year"]}
    )

    candidates = {
        # Period-independent variables
        "altitude": filter_files(input_raster_files, ["altitude"], None, False),
        "slope": filter_files(input_raster_files, ["slope"], None, False),
        # NOTE(review): "pa" is a very short substring and may match unrelated
        # file names - verify against the actual file naming.
        "pa": filter_files(input_raster_files, ["pa"], None, False),
        "subj": filter_files(input_raster_files, ["subj"], None, False),
        # Distance to rivers and roads
        "dist_river": filter_files(
            input_raster_files, ["rivers", "reprojected", "distance"], None, True
        ),
        "dist_road": filter_files(
            input_raster_files, ["roads", "reprojected", "distance"], None, True
        ),
        # Period-dependent variables
        # NOTE(review): the town distance always uses the initial year, even for
        # the forecast period - confirm this is intended.
        "dist_town": filter_files(
            input_raster_files,
            [initial_year, "town", "reprojected", "distance"],
            None,
            True,
        ),
        "fcc": filter_files(
            input_raster_files,
            [forest_source, initial_year, final_year, "forest", "loss"],
            [*exclude_years, "edge"],
            True,
        ),
        "dist_edge": filter_files(
            input_raster_files,
            [forest_source, forest_year, "forest", "reprojected", "edge"],
            None,
            True,
        ),
        "forest": filter_files(
            input_raster_files,
            [forest_source, forest_year, "forest", "reprojected"],
            ["edge"],
            True,
        ),
    }

    # Report every missing variable at once instead of a bare IndexError.
    missing = [name for name, files in candidates.items() if not files]
    if missing:
        raise FileNotFoundError(
            f"No raster file found for variable(s) {missing} (period '{period}')"
        )

    # Create a dictionary with variable types as keys and file paths as values
    variable_file_mapping = {"period": period_dictionary["period"]}
    variable_file_mapping.update(
        {name: files[0] for name, files in candidates.items()}
    )
    return variable_file_mapping


In [None]:
def get_trained_model(period_dictionaries, period, model_folder):
    """
    Locate the saved model file that serves a period.

    The folder searched is <model_folder>/<train_period>, where "train_period"
    comes from the period's settings.

    Parameters:
        period_dictionaries (dict): Mapping of period name to period settings.
        period (str): Key of the period in `period_dictionaries`.
        model_folder (Path): Folder holding one subfolder per training period.
    Returns:
        Path: First ".pickle" (else ".joblib") file found in that folder.
    Raises:
        FileNotFoundError: If the folder contains no model file.
    """
    period_name = period_dictionaries[period]["train_period"]
    model_period_folder = model_folder / period_name
    model_files = list_files_by_extension(model_period_folder, [".pickle", ".joblib"])
    if not model_files:
        raise FileNotFoundError(
            f"No trained model (.pickle/.joblib) found in {model_period_folder}; "
            f"train the '{period_name}' model first"
        )
    return model_files[0]


In [None]:
def get_design_info(formula_icar, dataset_file):
    """
    Rebuild the patsy design information of a formula from the sample table.

    The samples receive the same preparation as for training (rows with any
    missing value dropped, constant `trial` column added).

    Parameters:
        formula_icar (str): Patsy formula.
        dataset_file (str | Path): CSV file with the samples.
    Returns:
        tuple: (design info of the response, design info of the predictors).
    """
    samples = pd.read_csv(dataset_file).dropna(axis=0)
    samples["trial"] = 1
    response, predictors = dmatrices(formula_icar, samples, 0, "drop")
    return (response.design_info, predictors.design_info)


## Apply trained glm model

In [None]:
# Standard library imports
import os
import sys
import uuid

# Third party imports
import numpy as np
from osgeo import gdal
import pandas as pd
from patsy.build import build_design_matrices

# Local application imports
from forestatrisk.misc import rescale, makeblock


# predict_raster
def predict_raster(
    model,
    _x_design_info,
    period_dict_files="data",
    input_forest_raster="data/forest.tif",
    output_file="predictions.tif",
    blk_rows=128,
    verbose=True,
):
    """Predict the spatial probability of deforestation from a
    statistical model.

    This function predicts the spatial probability of deforestation
    from a statistical model. Computation are done by block and
    can be performed on large geographical areas.

    :param model: The model (glm, rf) to predict from. Must have a
        model.predict_proba() function.
    :param _x_design_info: Design matrix information from patsy.
    :param period_dict_files: Mapping of variable name to raster path.
        The keys ``"period"`` and ``"forest"`` are ignored; every other
        entry becomes one band of the variable stack, in sorted key
        order. A mapping must be passed: the string default is not usable.
    :param input_forest_raster: Path to forest raster (1 for forest).
    :param output_file: Name of the output raster file for predictions.
    :param blk_rows: If > 0, number of rows for computation by block.
    :param verbose: Logical. Whether to print messages or not. Default
        to ``True``.
    :raises FileNotFoundError: If the forest raster cannot be opened.
    :raises ValueError: If an input raster has no usable NoData value.

    """

    # Mask on forest
    if verbose:
        print(f"Using {input_forest_raster} file")
    # os.fspath() lets callers pass pathlib.Path objects regardless of the
    # GDAL version in use.
    fmaskR = gdal.Open(os.fspath(input_forest_raster))
    if fmaskR is None:
        raise FileNotFoundError(f"Cannot open forest raster: {input_forest_raster}")
    fmaskB = fmaskR.GetRasterBand(1)

    # Landscape variables from forest raster
    gt = fmaskR.GetGeoTransform()
    ncol = fmaskR.RasterXSize
    nrow = fmaskR.RasterYSize
    Xmin = gt[0]
    Xmax = gt[0] + gt[1] * ncol
    Ymin = gt[3] + gt[5] * nrow
    Ymax = gt[3]

    # Raster list
    # Extract keys excluding 'forest' and 'period' and sort them.
    # NOTE(review): 'fcc' is NOT excluded, so pixels where the fcc raster is
    # NoData are dropped from the prediction - confirm this is intended.
    sorted_keys = sorted(
        key for key in period_dict_files.keys() if key not in ["period", "forest"]
    )

    # Retrieve the corresponding file paths based on the sorted keys
    raster_list = [os.fspath(period_dict_files[key]) for key in sorted_keys]
    # Column names of the per-block table: one per band, then the pixel
    # coordinates and the forest mask appended further down.
    var_names = sorted_keys + ["X", "Y", "fmask"]

    # Make vrt with gdalbuildvrt
    if verbose:
        print("Make virtual raster with variables as raster bands")
    param = gdal.BuildVRTOptions(
        resolution="user",
        outputBounds=(Xmin, Ymin, Xmax, Ymax),
        xRes=gt[1],
        yRes=-gt[5],
        separate=True,
    )
    rand_uuid = uuid.uuid4()
    vrt_file = f"/vsimem/var_{rand_uuid}.vrt"
    cback = gdal.TermProgress_nocb if verbose else 0
    gdal.BuildVRT(vrt_file, raster_list, options=param, callback=cback)
    stack = gdal.Open(vrt_file)
    nband = stack.RasterCount
    proj = stack.GetProjection()

    # List of nodata values
    bandND = np.zeros(nband)
    for k in range(nband):
        band = stack.GetRasterBand(k + 1)
        # Check the value BEFORE storing it: assigning None into a float array
        # raises TypeError, and identity tests against None / np.nan on an
        # array element are never true.
        nodata = band.GetNoDataValue()
        if nodata is None or np.isnan(nodata):
            raise ValueError(
                f"NoData value is not specified for input raster file {k}"
            )
        bandND[k] = nodata
    bandND = bandND.astype(np.float32)

    # Make blocks
    blockinfo = makeblock(vrt_file, blk_rows=blk_rows)
    nblock = blockinfo[0]
    nblock_x = blockinfo[1]
    x = blockinfo[3]
    y = blockinfo[4]
    nx = blockinfo[5]
    ny = blockinfo[6]
    if verbose:
        print(f"Divide region in {nblock} blocks")

    # Raster of predictions
    if verbose:
        print("Create a raster file on disk for projections")
    driver = gdal.GetDriverByName("GTiff")
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass
    Pdrv = driver.Create(
        os.fspath(output_file),
        ncol,
        nrow,
        1,
        gdal.GDT_UInt16,
        ["COMPRESS=DEFLATE", "PREDICTOR=2", "BIGTIFF=YES"],
    )
    Pdrv.SetGeoTransform(gt)
    Pdrv.SetProjection(proj)
    Pband = Pdrv.GetRasterBand(1)
    Pband.SetNoDataValue(0)

    # Predict by block
    # Message
    if verbose:
        print("Predict deforestation probability by block")
    # Loop on blocks of data
    for b in range(nblock):
        # Position in 1D-arrays
        px = b % nblock_x
        py = b // nblock_x
        # Number of pixels
        npix = nx[px] * ny[py]
        # Data for one block of the stack (shape = (nband, nrow, ncol))
        data = stack.ReadAsArray(x[px], y[py], nx[px], ny[py])
        data = data.astype(float)  # From uint to float
        # Add a dimension if there is only one variable. This must happen
        # before the per-band loop, otherwise data[i] would address a row.
        if len(data.shape) == 2:
            data = data[np.newaxis, :, :]
        # Replace ND values with -9999
        for i in range(nband):
            data[i][np.nonzero(data[i] == bandND[i])] = -9999
        # Coordinates of the center of the pixels of the block
        X_col = (
            gt[0] + x[px] * gt[1] + (np.arange(nx[px]) + 0.5) * gt[1]
        )  # +0.5 for center of pixels
        X = np.repeat(X_col[np.newaxis, :], ny[py], axis=0)
        X = X[np.newaxis, :, :]
        Y_row = (
            gt[3] + y[py] * gt[5] + (np.arange(ny[py]) + 0.5) * gt[5]
        )  # +0.5 for center of pixels
        Y = np.repeat(Y_row[:, np.newaxis], nx[px], axis=1)
        Y = Y[np.newaxis, :, :]
        # Forest mask: everything that is not forest (value 1) counts as NA
        fmaskA = fmaskB.ReadAsArray(x[px], y[py], nx[px], ny[py])
        fmaskA = fmaskA.astype(float)  # From uint to float
        fmaskA[np.nonzero(fmaskA != 1)] = -9999
        fmaskA = fmaskA[np.newaxis, :, :]
        # Concatenate forest mask with stack
        data = np.concatenate((data, X, Y, fmaskA), axis=0)
        # Transpose and reshape to 2D array (one row per pixel)
        data = data.transpose(1, 2, 0)
        data = data.reshape(npix, nband + 3)
        # Observations without NA
        w = np.nonzero(~(data == -9999).any(axis=1))
        # Remove observations with NA
        data = data[w]
        # Transform into a pandas DataFrame
        df = pd.DataFrame(data)
        df.columns = var_names
        # Add fake cell column for _x_design_info
        df["cell"] = 0
        # Predict
        pred = np.zeros(npix)  # Initialize with nodata value (0)
        if len(w[0]) > 0:
            # Get X
            (x_new,) = build_design_matrices([_x_design_info], df)
            X_new = x_new
            # Get predictions into an array
            p = model.predict_proba(X_new)[:, 1]
            # Rescale and return to pred
            # NOTE(review): rescale() presumably maps the probabilities onto
            # the positive UInt16 range so that 0 stays free for NoData -
            # confirm against forestatrisk.misc.rescale.
            pred[w] = rescale(p)
        # Assign prediction to raster
        pred = pred.reshape(ny[py], nx[px])
        Pband.WriteArray(pred, x[px], y[py])

    # Compute statistics
    if verbose:
        print("Compute statistics")
    Pband.FlushCache()  # Write cache data to disk
    Pband.ComputeStatistics(False)

    # Dereference driver
    Pband = None
    del Pdrv

    # Release the input datasets and free the in-memory VRT, which would
    # otherwise stay in /vsimem for the lifetime of the kernel.
    stack = None
    fmaskB = None
    fmaskR = None
    gdal.Unlink(vrt_file)


# End


In [None]:
import pickle
from patsy import dmatrices
import forestatrisk


def apply_glm_period(
    period_dictionaries,
    period,
    model_folder,
    processed_data_folder,
    sampling_folder,
):
    """
    Predict the deforestation risk map of a period with its trained GLM and
    compute the deforestation rate per risk category.

    Outputs are written to <model_folder>/<period>/:
        - glm_<period>.tif: prediction raster
        - defrate_cat_glm_<period>.csv: deforestation rate per category

    The forest cover change raster used for the rates is the notebook-level
    `forest_change_file`.

    Parameters:
        period_dictionaries (dict): Mapping of period name to period settings.
        period (str): Period to predict.
        model_folder (Path): Folder with the trained models; also receives the outputs.
        processed_data_folder (Path): Folder with the input rasters.
        sampling_folder (Path): Folder with the per-period sample tables.
    Returns:
        dict: {"prediction": <path of the raster>, "defrate": <path of the CSV>},
            both as strings.
    """
    period_dictionary = period_dictionaries[period]
    period_output_folder = model_folder / period
    period_output_folder.mkdir(parents=True, exist_ok=True)
    prediction_output = str(
        period_output_folder / f"glm_{period_dictionary['period']}.tif"
    )

    # Load the model saved at training time, together with its formula.
    # SECURITY: unpickling executes arbitrary code - only load model files
    # produced by this project.
    model = get_trained_model(period_dictionaries, period, model_folder)
    model_f = pd.read_pickle(model)
    formula = model_f.get("formula")

    # Raster files of the explanatory variables for this period
    input_raster_files = list_files_by_extension(processed_data_folder, [".tiff", ".tif"])
    variable_files = get_period_variable_files(
        input_raster_files, period_dictionaries, period
    )
    forest_raster = variable_files["forest"]

    # Rebuild the design information from the training samples so that new
    # data is transformed the same way as the training data.
    samples = get_samples_for_period(period, sampling_folder)
    _, x_design_info = get_design_info(formula, samples)
    time_interval = period_dictionary["time_interval"]
    predict_raster(
        model_f.get("model"),
        x_design_info,
        variable_files,
        str(forest_raster),
        prediction_output,
        blk_rows=256,
        verbose=True,
    )

    # defrate_per_cat
    print("Calculate deforestation rate per cathegory")
    defrate_output = str(
        period_output_folder / f"defrate_cat_glm_{period_dictionary['period']}.csv"
    )
    forestatrisk.defrate_per_cat(
        str(forest_change_file),
        prediction_output,
        time_interval,
        period,
        defrate_output,
        256,
        False,
    )

    # Hand the output locations back so the caller's variable is meaningful.
    return {"prediction": prediction_output, "defrate": defrate_output}


In [None]:
# Calibration period, predicted with the model trained on the calibration samples.
glm_predict_calibration = apply_glm_period(
    period_dictionaries,
    "calibration",
    glm_model,
    processed_data_folder,
    sampling_folder,
)


In [None]:
# Validation period, predicted with the model trained on the calibration samples.
glm_predict_validation = apply_glm_period(
    period_dictionaries,
    "validation",
    glm_model,
    processed_data_folder,
    sampling_folder,
)


In [None]:
# Historical period, predicted with the model trained on the historical samples.
glm_predict_historical = apply_glm_period(
    period_dictionaries,
    "historical",
    glm_model,
    processed_data_folder,
    sampling_folder,
)


In [None]:
# Forecast, predicted with the model trained on the historical samples.
glm_predict_forecast = apply_glm_period(
    period_dictionaries,
    "forecast",
    glm_model,
    processed_data_folder,
    sampling_folder,
)


In [None]:
print("Done!")