In [1]:
# change the working directory to the parent folder; the relative paths used
# later ("data", "results", the params file) are presumably resolved from there
# NOTE: re-running this cell without restarting the kernel moves up another level
%cd ..

c:\Users\thomas.jemmett\dev\nhp\nhp_model


# New Hospitals Model

This notebook runs the NHP model and produces the raw results.

Note, this can take a very long time to run and load the resulting data. If you find that you are running out of RAM (especially when loading data), consider reducing the number of model runs.

In [None]:
# folder that the model results are written to (and later read back from)
results_path = "results"
# folder containing the input data
data_path = "data"
# the params json file that describes the scenario to run
params_file = "sample_params.json"

## Setup

Load the required packages

In [None]:
import os
import tempfile

from datetime import datetime

from run_model import run_model

from model.aae import AaEModel
from model.inpatients import InpatientsModel
from model.outpatients import OutpatientsModel
from model.model_save import LocalSave
from model.helpers import load_params

We need to load in the params json file.

In [None]:
# load the params file
params = load_params(params_file)

# values from the params that are needed later in the notebook:
# the number of model runs, the dataset name and the scenario name
model_runs = params["model_runs"]
dataset = params["input_data"]
scenario = params["name"]

# stamp this run with the current time (YYYYmmdd_HHMMSS) and store the same
# value back in the params
create_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")
params["create_datetime"] = create_datetime

We will run the model in parallel. By default, use all available CPU cores. You can set this to a lower value to use fewer resources, but it will take longer to run the model.

In [None]:
# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to a single core so that a usable integer is always passed on
cpus = os.cpu_count() or 1
cpus

When we run the model in parallel it's slightly more efficient to run a batch of model runs. Batches of 4 or 8 seem to be most efficient. This value should be a power of 2.

In [None]:
# the batch size should be a power of two: shifting 1 left by 2 is 2 ** 2, i.e. 4
batch_size = 1 << 2
batch_size

## Run the model

First, we create the model runner. The `run_model()` function expects the object that saves the results (here a `LocalSave`, created from the params dictionary, the path where the results will be saved, and a temporary directory), which model run to start at, how many model runs to perform, the number of CPU cores to use, and the size of the batches to run.

The function returns a function, which takes an instance of either `AaEModel`, `InpatientsModel`, or `OutpatientsModel`, depending on what type of model we want to run.

Note, we start at model run -1 and add two to the number of model runs: as well as iterations 1 to `model_runs` of the model, this covers model run 0 (the "principal" model run) and model run -1.

In [None]:
# create a fresh temporary directory and hand it to LocalSave together with the
# params and the results path
temppath = tempfile.mkdtemp()
model_save = LocalSave(params, results_path, temppath, True)

# start at model run -1 and perform model_runs + 2 runs (i.e. runs -1 to
# model_runs), using the chosen number of cpus and batch size
runner = run_model(model_save, -1, model_runs + 2, cpus, batch_size)

Now the runner is set up, we can run each of the types of models.

In [None]:
# run each type of model in turn; the models read their input from the
# configured data_path rather than a hard-coded folder name
for model_class in [AaEModel, OutpatientsModel, InpatientsModel]:
    runner(model_class(params, data_path))

With all of the models run, we can now call the `post_runs()` method, which will save the params, run params, change factors and aggregated results.

In [None]:
# save the params, run params, change factors and aggregated results
model_save.post_runs()

## Load Results

We can now load in our results.

In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pyarrow.dataset as ds

In [None]:
def load_dataset(activity_type):
  """Load the model results for one activity type, joined to the input data.

  Reads `{data_path}/{dataset}/{activity_type}.parquet` and the model results
  stored under `results_path` for the current `dataset`, `scenario` and
  `create_datetime`, then merges the two.

  Relies on the notebook-level variables `data_path`, `results_path`,
  `dataset`, `scenario` and `create_datetime`.

  :param activity_type: the activity type to load, e.g. "aae", "ip" or "op"
  :return: a DataFrame containing the input data columns that are not in the
    results, merged with the results columns; the "rn" column is dropped
  """
  base_data = (pq
    .read_pandas(f"{data_path}/{dataset}/{activity_type}.parquet")
    .to_pandas()
  )
  results_dir = (
    f"{results_path}/model_results/activity_type={activity_type}/" +
    f"dataset={dataset}/scenario={scenario}/create_datetime={create_datetime}"
  )
  results_data = pq.ParquetDataset(results_dir).read_pandas().to_pandas()
  # when a column exists in both frames keep the version from the results;
  # build the set once rather than once per column
  results_cols = set(results_data.columns)
  base_data_cols = [col for col in base_data.columns if col not in results_cols]
  # fix category columns: model_run is converted to an integer
  results_data["model_run"] = results_data["model_run"].astype(int)
  # match the "rn" column of the input data to the index of the results, then
  # drop "rn" as it is only needed for the join
  return (base_data[base_data_cols]
    .merge(results_data, left_on = "rn", right_index = True)
    .drop("rn", axis="columns")
  )

In [None]:
# load the "aae" results merged with the input data, and display them
aae = load_dataset("aae")
aae

In [None]:
# load the "ip" results merged with the input data, and display them
ip = load_dataset("ip")
ip

The op data needs to be handled slightly differently: we need to add in the op conversion rows.

In [None]:
def load_op_data():
  """Load the op results, including the rows converted from ip to op.

  The "op_conversion" results are merged with the ip input data and aggregated
  to the grouping columns that are available. They are then combined with the
  op results (from `load_dataset("op")`) and aggregated again, so that each
  combination of grouping columns appears once.

  Relies on the notebook-level variables `data_path`, `results_path`,
  `dataset`, `scenario` and `create_datetime`.

  :return: a DataFrame with one row per combination of the grouping columns,
    containing the summed "attendances" and "tele_attendances"
  """
  grouping_cols = [
    "age",
    "sex",
    "tretspef",
    "is_gp_ref",
    "is_cons_cons_ref",
    "is_first",
    "has_procedures",
    "model_run"
  ]
  value_cols = ["attendances", "tele_attendances"]
  # load the ip->op conversion data
  base_data = (pq
    .read_pandas(f"{data_path}/{dataset}/ip.parquet")
    .to_pandas()
  )
  results_dir = (
    f"{results_path}/model_results/activity_type=op_conversion/" +
    f"dataset={dataset}/scenario={scenario}/create_datetime={create_datetime}"
  )
  results_data = pq.ParquetDataset(results_dir).read_pandas().to_pandas()
  # when a column exists in both frames keep the version from the results;
  # build the set once rather than once per column
  results_cols = set(results_data.columns)
  base_data_cols = [col for col in base_data.columns if col not in results_cols]
  # only group by the grouping columns that exist in the merged data
  available_cols = set(base_data_cols) | results_cols
  grouping_cols_subset = [col for col in grouping_cols if col in available_cols]
  # fix category columns: model_run is converted to an integer
  results_data["model_run"] = results_data["model_run"].astype(int)
  # merge results and aggregate; the op-specific flags are set to fixed values
  # for the converted rows
  merged = (base_data[base_data_cols]
    .merge(results_data, left_on = "rn", right_index = True)
    .groupby(grouping_cols_subset)
    .agg({col: "sum" for col in value_cols})
    .assign(is_gp_ref = False, is_cons_cons_ref = False, is_first = False, has_procedures = True)
    .reset_index()
  )
  # load the op data
  op = load_dataset("op")[grouping_cols + value_cols]
  # combine the data
  return (pd
    .concat([op, merged])
    .groupby(grouping_cols)
    .agg({col: "sum" for col in value_cols})
    .reset_index()
  )

In [None]:
# load the op results (including the op conversion rows) and display them
op = load_op_data()
op

We can load the change factors in like so. Note, the order of the rows is semi-important within each model_run:
the "baseline" change_factor row must always come first. The other rows are then in the order in which each change factor
was run within the model engine, but they do not strictly need to be shown in that order.


In [None]:
# the change factors csv is named after the dataset, scenario and create_datetime
change_factors_file = f"{results_path}/change_factors/{dataset}__{scenario}__{create_datetime}.csv"
change_factors = pd.read_csv(change_factors_file)
change_factors