## Probability grids

This notebook creates the time-dependent probability maps and writes them to file (`.nc` format). The notebook `01-create_classifiers.ipynb` must have been run previously.

In [1]:
# Input source toggle: when True the grid data is read from `extracted_data`;
# when False it is read from `prepared_data`, which is first passed to
# `check_prepared_data`.
use_extracted_data = True


In [2]:
import os
import time
from datetime import timedelta

import dask
import dask.dataframe as dd
from dask_ml.wrappers import ParallelPostFit
from joblib import load

from lib.check_files import check_prepared_data
from lib.pu import create_probability_grids


In [3]:
# Worker count comes from the N_JOBS environment variable (default: 8).
n_jobs = int(os.getenv("N_JOBS", 8))
# Use the same worker count for dask's global configuration.
dask.config.set(num_workers=n_jobs)


<dask.config.set at 0x154faf613740>

### Load input data from file

In [4]:
# Pick the input directory; only the prepared-data directory gets checked.
data_dir = "extracted_data" if use_extracted_data else "prepared_data"
if not use_extracted_data:
    check_prepared_data(data_dir, verbose=True)

# Lazily open the grid data with dask (nothing is loaded yet).
data_filename = os.path.join(data_dir, "grid_data.csv")
point_data = dd.read_csv(data_filename)

# Materialise just the coordinate and age columns; these are later copied
# into each model's output table.
output_columns = ["lon", "lat", "age (Ma)"]
df_out = point_data[output_columns].compute()


### Calculate probabilities

In [5]:
output_dir = os.path.join("outputs", "Americas")

# For each trained model: load it, predict a probability for every grid point
# and write the result, together with the point coordinates and ages, to CSV.
for algorithm in ("PU", "SVM"):
    print(
        f"Calculating probabilities for {algorithm} model... ",
        end="",
        flush=True,
    )
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments during a long prediction run
    t0 = time.perf_counter()

    subdir = os.path.join(output_dir, algorithm)
    model_filename = os.path.join(subdir, "classifier.joblib")
    probabilities_filename = os.path.join(subdir, "grid_probabilities.csv")
    model = load(model_filename)

    # Set model n_jobs if possible
    # (let dask handle parallelism at this stage)
    try:
        model[-1].set_params(n_jobs=1)
    except (TypeError, ValueError):
        # TypeError: the model does not support indexing (e.g. not a pipeline)
        # ValueError: the final step has no `n_jobs` parameter
        # Either way this is best-effort, so carry on with the model as loaded
        pass

    model_parallel = ParallelPostFit(model)

    point_x = point_data[model.feature_names_in_]
    # Column 1 of predict_proba is what gets saved
    # (presumably the positive class - confirm against the model's `classes_`)
    proba = model_parallel.predict_proba(point_x)[:, 1].ravel().compute()
    probabilities = df_out.copy()
    probabilities["probability"] = proba
    del proba
    probabilities.to_csv(probabilities_filename, index=False)
    # model_parallel wraps the model, so it must be dropped as well for the
    # model's memory to be released before the next iteration
    del probabilities, model, model_parallel
    duration = timedelta(seconds=time.perf_counter() - t0)
    print(f"Done! (duration: {duration})", flush=True)


Calculating probabilities for PU model... Done! (duration: 0:01:13.771394)
Calculating probabilities for SVM model... Done! (duration: 0:00:11.669991)


### Create probability maps

In [6]:
# Convert each model's saved probabilities CSV into probability grids,
# written to a `probability_grids` directory next to the CSV.
for algorithm in ("PU", "SVM"):
    print(
        f"Creating grids for {algorithm} model... ",
        end="",
        flush=True,
    )

    subdir = os.path.join(output_dir, algorithm)
    # Plain string literals: these names contain no placeholders
    probabilities_filename = os.path.join(subdir, "grid_probabilities.csv")
    grid_output_dir = os.path.join(subdir, "probability_grids")
    os.makedirs(grid_output_dir, exist_ok=True)

    create_probability_grids(
        data=probabilities_filename,
        output_dir=grid_output_dir,
        threads=n_jobs,
        # presumably (lon_min, lon_max, lat_min, lat_max) - confirm in lib.pu
        extent=(-180, 180, -90, 90),
    )
    print("Done!", flush=True)


Creating grids for PU model... Done!
Creating grids for SVM model... Done!


In [7]:
# Inspect the most recently assigned probabilities path. This relies on the
# loop variable leaking from the previous cell, so it shows the value from
# that loop's final iteration only.
probabilities_filename

'outputs/Americas/SVM/grid_probabilities.csv'