In [1]:
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from pyarrow.dataset import HivePartitioning

from model.InpatientsModel import InpatientsModel

Select a particular dataset and model run, then create an instance of the Inpatients Model class and load the dataset.

In [2]:
# Location of the input data and the identifier of the model run to inspect.
dataset = "data/synthetic"
model_run = "test/20220110_104353"

# Model instance built from the results path of the chosen run.
m = InpatientsModel(f"{dataset}/results/{model_run}")

# Baseline inpatients data, without the `admigrp` column.
ip = (
  pq.read_pandas(f"{dataset}/ip.parquet")
  .to_pandas()
  .drop(columns=["admigrp"])
)

We can now perform a model run. The `0` is the model run number. This alters the random seed used for the PRNG, meaning each time we run the model with this value we will get the same results.

In [3]:
# Model run number 0. Per the notebook text this number alters the PRNG seed,
# so repeating the call with the same value reproduces the same results.
selected_variant, results = m.run(0)
# Bare expression so the notebook renders the results frame.
results

Unnamed: 0,rn,speldur,classpat,admission_avoidance_strategy,los_reduction_strategy
0,1,0.0,2,,bads_daycase
1,1,0.0,2,,bads_daycase
2,735,1.0,1,,improved_discharge_planning_non-elective
3,735,1.0,1,,improved_discharge_planning_non-elective
4,841,4.0,1,ambulatory_care_conditions_chronic,
...,...,...,...,...,...
148798,127945,0.0,2,,
148799,128227,0.0,2,,
148800,128227,0.0,2,,
148801,128833,13.0,1,ambulatory_care_conditions_chronic,


We can join the results back to the full dataset to get all columns.

In [4]:
# Remove the baseline `classpat` and `speldur` before joining on `rn`, so the
# merged frame carries those two columns from `results` only.
merged_results = (
  ip
  .drop(columns=["classpat", "speldur"])
  .merge(results, on="rn")
)
merged_results.shape

(148803, 15)

We can now compare the base data to the modelled results.

In [5]:
# Mean length of stay (`speldur`): baseline first, modelled results second.
(ip["speldur"].mean(), merged_results["speldur"].mean())

(2.1150341597326157, 1.7143538772739797)

In [6]:
# Row counts: baseline first, modelled results second.
(ip.shape[0], merged_results.shape[0])

(128953, 148803)

In [7]:
# Rows per patient classification, baseline alongside modelled results.
# Note: -1 indicates that a row has moved from the inpatients dataset to outpatients
(
  ip.value_counts("classpat").to_frame("baseline")
  .join(
    merged_results.value_counts("classpat").to_frame("results"),
    how="outer",
  )
  # A classification missing from one side becomes 0 rather than NaN.
  .fillna(0)
)

Unnamed: 0_level_0,baseline,results
classpat,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,0.0,323
1,69522.0,76188
2,47833.0,58117
3,10571.0,13073
4,8.0,7
5,1019.0,1095


We need to filter out the `classpat == -1` cases; these will be merged into Outpatients.

In [8]:
# Boolean mask of the rows whose `classpat` is the string "-1".
outpatients_rows = merged_results["classpat"].eq("-1")

# For the masked rows keep only the listed columns, renaming the admission
# age/date columns to their appointment equivalents.
outpatients_results = (
  merged_results
  .loc[
    outpatients_rows,
    ["rn", "admiage", "sex", "ethnos", "imd04_decile", "tretspef", "admidate"],
  ]
  .rename(columns={"admiage": "apptage", "admidate": "apptdate"})
)

# Everything not masked stays in the inpatients results.
merged_results = merged_results.loc[~outpatients_rows]
merged_results.shape

(148480, 15)

Now we need to do a bit of data cleansing: the modelling process updates the `speldur` (length of stay), but doesn't change the `disdate`.

In [9]:
# Recompute the discharge date as the admission date plus the modelled length
# of stay in days. `assign` always writes a real `disdate` column and returns a
# new frame, whereas attribute-style assignment (`df.disdate = ...`) only
# updates a column that already exists and otherwise just sets a Python
# attribute on the object.
merged_results = merged_results.assign(
  disdate=pd.to_datetime(merged_results["admidate"])
  + pd.to_timedelta(merged_results["speldur"], unit="D")
)

## Multiple Model Runs

We can run multiple iterations of the model using the `multi_model_runs()` function. This function runs slightly differently from `run()` in two specific ways:

1. It tries to run in parallel across multiple CPU cores, though this tends to work much better on Linux than on Windows (depending on how it is run on Windows, it may be stuck using a single CPU no matter how many you ask for)
2. Instead of returning the results, it immediately saves them to disk in the folder `results/[model_name]/[model_run_date]/results`

In [10]:
# `os.cpu_count()` returns None when the number of CPUs cannot be determined;
# fall back to 1 so the multiplication below cannot raise a TypeError.
cpu_count = os.cpu_count() or 1
# NOTE(review): the arguments look like (first run number, number of runs,
# number of CPUs to use) — confirm against InpatientsModel.multi_model_runs.
m.multi_model_runs(0, cpu_count * 3, cpu_count)

In [11]:
# Hive-style partitioning with a single key, `model_run`, typed as int32.
partitioning = HivePartitioning(pa.schema([("model_run", pa.int32())]))

# Read the parquet dataset under this run's `results` folder into one pandas
# frame, using the partitioning declared above.
# NOTE(review): presumably this is the folder `multi_model_runs` writes to — confirm.
multi_results = pq.ParquetDataset(
  f"{dataset}/results/{model_run}/results",
  partitioning = partitioning,
  use_legacy_dataset = False
).read_pandas().to_pandas()

In [12]:
# Number of result rows in each model run.
mr_counts = multi_results.groupby("model_run").size()
mr_counts.head()

model_run
0    148803
1    154974
2    148285
3    154109
4    148483
dtype: int64

baseline counts vs. modelled counts

In [13]:
# Baseline row count next to summary statistics of the per-run row counts,
# with the statistics cast to integers.
(
  ip.shape[0],
  mr_counts.agg(["mean", "median", "min", "max"]).astype(int),
)

(128953,
 mean      150024
 median    149218
 min       147388
 max       155886
 dtype: int32)

In [14]:
# Mean length of stay (`speldur`) within each model run.
mr_speldur = multi_results.groupby("model_run")["speldur"].mean()
mr_speldur.head()

model_run
0    1.714354
1    1.727716
2    1.681195
3    1.653940
4    1.723477
Name: speldur, dtype: float64

baseline length of stay vs. modelled length of stay

In [15]:
# Baseline mean length of stay next to summary statistics of the per-run means.
(
  ip["speldur"].mean(),
  mr_speldur.agg(["mean", "median", "min", "max"]),
)

(2.1150341597326157,
 mean      1.699347
 median    1.698407
 min       1.653940
 max       1.742855
 Name: speldur, dtype: float64)