In [1]:
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from pyarrow.dataset import HivePartitioning

from model.InpatientsModel import InpatientsModel

Select a particular dataset and model run, then create an instance of the Inpatients Model class and load the dataset.

In [2]:
# Which dataset to use, and which model run within it.
dataset = "data/synthetic"
model_run = "test/20220110_104353"

# The model is constructed from the results folder of the chosen run.
run_path = f"{dataset}/results/{model_run}"
m = InpatientsModel(run_path)

# Load the baseline inpatients data, leaving out the "hsagrp" column.
ip = pq.read_pandas(f"{dataset}/ip.parquet").to_pandas().drop(columns = ["hsagrp"])

We can now perform a model run. The `0` is the model run number. This alters the random seed used for the PRNG, meaning each time we run the model with this value we will get the same results.

In [3]:
# Run the model with model run number 0; `run` returns a pair which is
# unpacked into the selected variant and the results frame.
selected_variant, results = m.run(0)
# Show the results frame as the cell output.
results

Unnamed: 0,rn,speldur,classpat,admission_avoidance_strategy,los_reduction_strategy
0,112772,0.0,2,,improved_discharge_planning_elective
1,62197,0.0,2,,improved_discharge_planning_elective
2,63143,0.0,2,,
3,63143,0.0,2,,
4,89718,3.0,1,ambulatory_care_conditions_acute,improved_discharge_planning_non-elective
...,...,...,...,...,...
146506,15862,1.0,1,,
146507,15862,1.0,1,,
146508,64514,0.0,1,alcohol_partial_chronic,
146509,64514,0.0,1,alcohol_partial_chronic,


We can join the results back to the full dataset to get all columns.

In [4]:
# "classpat" and "speldur" come from the model results, so remove the baseline
# versions before joining the remaining baseline columns on "rn".
baseline_other_columns = ip.drop(columns = ["classpat", "speldur"])
merged_results = baseline_other_columns.merge(results, on = "rn")
merged_results.shape

(146511, 15)

We can now compare the base data to the modelled results.

In [5]:
# mean length of stay (speldur): baseline first, then the modelled results
baseline_mean_los = np.mean(ip["speldur"])
results_mean_los = np.mean(merged_results["speldur"])
(baseline_mean_los, results_mean_los)

(2.129893197030924, 1.733289650606439)

In [6]:
# row count of the baseline, followed by the row count of the results
(ip.shape[0], merged_results.shape[0])

(128929, 146511)

In [7]:
# Rows per patient classification, baseline next to results.
# Note: -1 indicates that a row has moved from the inpatients dataset to outpatients
baseline_counts = ip.value_counts("classpat").to_frame("baseline")
results_counts = merged_results.value_counts("classpat").to_frame("results")

# Outer join on the classpat index keeps classifications that appear on only
# one side; the missing counts are then filled with 0.
classpat_counts = pd.merge(
  baseline_counts,
  results_counts,
  how = "outer",
  left_index = True,
  right_index = True
)
classpat_counts.fillna(0)

Unnamed: 0_level_0,baseline,results
classpat,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,0.0,245
1,69743.0,76199
2,47533.0,56163
3,10596.0,12781
4,7.0,5
5,1050.0,1118


We need to filter out the `classpat == -1` cases; these will be merged into Outpatients.

In [8]:
# Boolean mask of the rows whose classpat is "-1", i.e. rows that have moved
# from the inpatients dataset to outpatients.
outpatients_rows = merged_results.classpat == "-1"

# Select the outpatient rows and the columns to carry over in a single .loc
# call (instead of chained indexing); the admission date is renamed to the
# appointment date.
outpatients_results = (
  merged_results
  .loc[outpatients_rows, ["rn", "age", "sex", "ethnos", "imd04_decile", "tretspef", "admidate"]]
  .rename(columns = { "admidate": "apptdate" })
)

# Keep the remaining inpatient rows as an explicit copy, so that later column
# assignments act on an independent frame rather than on a filtered slice
# (which makes pandas emit SettingWithCopyWarning).
merged_results = merged_results[~outpatients_rows].copy()
merged_results.shape

(146266, 15)

Now we need to do a bit of data cleansing: the modelling process updates the `speldur` (length of stay), but doesn't change the `disdate`.

In [9]:
# Recompute the discharge date as the admission date plus the modelled length
# of stay in days. `assign` returns a new frame, which avoids the
# SettingWithCopyWarning raised when setting a column on a filtered frame, and
# (unlike attribute-style assignment) can never set a plain Python attribute
# instead of the column.
merged_results = merged_results.assign(
  disdate = pd.to_datetime(merged_results.admidate) + pd.to_timedelta(merged_results.speldur, "d")
)

## Multiple Model Runs

We can run multiple iterations of the model using the `multi_model_runs()` function. This function runs slightly differently from `run()` in two specific ways:

1. It tries to run in parallel across multiple CPU cores, though this tends to work much better on Linux than on Windows (depending on how it is run on Windows, it may be stuck using a single CPU no matter how many you ask for)
2. Instead of returning the results, it immediately saves them to disk in the folder `results/[model_name]/[model_run_date]/results`

In [10]:
# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to 1 so the multiplication below cannot raise a TypeError.
cpu_count = os.cpu_count() or 1
# NOTE(review): the arguments are presumably (first model run, number of runs,
# number of CPUs) - confirm against InpatientsModel.multi_model_runs.
m.multi_model_runs(0, cpu_count * 3, cpu_count)

In [11]:
# The saved results are read with hive-style partitioning on a single int32
# "model_run" field.
model_run_schema = pa.schema([("model_run", pa.int32())])
partitioning = HivePartitioning(model_run_schema)

# Open the results folder of the chosen model run as one parquet dataset...
multi_results_dataset = pq.ParquetDataset(
  f"{dataset}/results/{model_run}/results",
  partitioning = partitioning,
  use_legacy_dataset = False
)
# ...and load all of it into a single pandas frame.
multi_results = multi_results_dataset.read_pandas().to_pandas()

In [12]:
# number of result rows produced by each model run
rows_by_run = multi_results.groupby(["model_run"])
mr_counts = rows_by_run.size()
mr_counts.head()

model_run
0    146511
1    153574
2    149417
3    151734
4    147052
dtype: int64

baseline counts vs. modelled counts

In [13]:
# summary statistics of the per-run row counts, cast to integers
mr_counts_summary = mr_counts.agg(["mean", "median", "min", "max"]).astype(int)
(ip.shape[0], mr_counts_summary)

(128929,
 mean      148334
 median    147311
 min       144780
 max       153574
 dtype: int32)

In [14]:
# mean length of stay (speldur) within each model run
speldur_by_run = multi_results.groupby(["model_run"])["speldur"]
mr_speldur = speldur_by_run.mean()
mr_speldur.head()

model_run
0    1.733290
1    1.656049
2    1.672306
3    1.658554
4    1.722683
Name: speldur, dtype: float64

baseline length of stay vs. modelled length of stay

In [15]:
# baseline mean length of stay next to summary statistics of the per-run means
baseline_speldur_mean = np.mean(ip["speldur"])
mr_speldur_summary = mr_speldur.agg(["mean", "median", "min", "max"])
(baseline_speldur_mean, mr_speldur_summary)

(2.129893197030924,
 mean      1.722195
 median    1.726336
 min       1.598376
 max       1.837079
 Name: speldur, dtype: float64)