In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as pads
import pyarrow.parquet as pq

## Setup

Choose the dataset, model name, and model run time below. Together they determine which files are loaded.

In [2]:
# identifiers for the run to post-process: dataset, model name and model run time
dataset = "RL4"
model_name = "test"
model_run = "20220110_104353"

# baseline parquet files are read from data_path; this run's model results are
# read from (and the aggregated output is written to) results_path
data_path = f"data/{dataset}"
results_path = f"{data_path}/results/{model_name}/{model_run}"

In [3]:
def age_groups(x):
  """Bin ages into labelled age bands and return the labels as strings.

  Args:
    x: array-like of numeric ages (e.g. a pandas Series).

  Returns:
    The result of `pd.cut` converted with `.astype(str)`. Bands are closed on
    the left and open on the right: [0, 5) -> "0-4", [5, 15) -> "5-14", ...,
    [85, 1000) -> "85+". Ages outside [0, 1000) fall in no band, so `pd.cut`
    leaves them missing before the string conversion.
  """
  band_edges = [0, 5, 15, 35, 50, 65, 85, 1000]
  band_labels = ["0-4", "5-14", "15-34", "35-49", "50-64", "65-84", "85+"]
  # right = False: each band includes its lower edge and excludes its upper edge
  banded = pd.cut(x, bins = band_edges, right = False, labels = band_labels)
  return banded.astype(str)

## Load Datasets

In [4]:
def load_dataset(type, *args):
  """Load one baseline dataset from `{data_path}/{type}.parquet`.

  The columns "rn", "age", "sex", "ethnos" and "imd04_decile" are always read.
  "rn" becomes the index and an "age_group" column is derived from "age".

  Args:
    type: file stem of the parquet file under `data_path` (e.g. "ip").
    *args: extra columns to read. Each argument may be a single column name or
      an iterable of column names, so `load_dataset("ip", ["a", "b"])` and
      `load_dataset("ip", "a", "b")` are equivalent. May be omitted entirely.

  Returns:
    A DataFrame indexed by "rn" with the requested columns plus "age_group".
  """
  # flatten the extra columns: previously only the first positional argument was
  # used, and calling with no extra columns raised an IndexError
  extra_cols = []
  for arg in args:
    if isinstance(arg, str):
      extra_cols.append(arg)
    else:
      extra_cols.extend(arg)
  cols = ["rn", "age", "sex", "ethnos", "imd04_decile"] + extra_cols

  df = (pq
    .read_pandas(f"{data_path}/{type}.parquet", columns = cols)
    .to_pandas()
    .set_index("rn")
  )
  df["age_group"] = age_groups(df["age"])
  return df

In [5]:
# inpatient columns needed on top of the common ones read by load_dataset
ip_columns = [
  "tretspef",
  "classpat",
  "admidate",
  "disdate",
  "speldur",
  "epitype",
  "admimeth",
  "dismeth",
]
ip = load_dataset("ip", ip_columns)

In [6]:
# outpatient columns needed on top of the common ones read by load_dataset
op_columns = [
  "tretspef",
  "is_first",
  "has_procedures",
  "attendances",
  "tele_attendances",
]
op = load_dataset("op", op_columns)

In [7]:
# A&E columns needed on top of the common ones read by load_dataset
aae_columns = ["aedepttype", "aearrivalmode", "arrivals"]
aae = load_dataset("aae", aae_columns)

## Load Model Results

In [8]:
# the results are read as a hive-partitioned dataset keyed on "model_run";
# declaring the schema makes the partition value come back as an int32.
# pyarrow.dataset is imported explicitly (as pads): `import pyarrow` alone does
# not guarantee that the `pa.dataset` submodule attribute exists.
partitioning = pads.HivePartitioning(pa.schema([("model_run", pa.int32())]))

def load_model_results(type):
  """Load the model results for one activity type as a DataFrame indexed by "rn".

  Args:
    type: sub-directory of `results_path` holding the partitioned parquet
      files (e.g. "ip", "op", "aae").

  Returns:
    A DataFrame of every partition under `{results_path}/{type}/`, including
    the "model_run" partition column, with "rn" set as the index.
  """
  results = pq.ParquetDataset(
    f"{results_path}/{type}/",
    partitioning = partitioning,
    use_legacy_dataset = False
  )
  return results.read_pandas().to_pandas().set_index("rn")

## Process IP data

Rows that have a classpat of -1 need to be moved into OP.

In [9]:
# the per-row strategy columns are not used by any of the aggregations below
strategy_columns = ["admission_avoidance_strategy", "los_reduction_strategy"]
ip_mr = load_model_results("ip").drop(columns = strategy_columns)

In [10]:
# model result rows flagged with classpat "-1" are the ones to move into OP
ip_op_row_ix = ip_mr["classpat"] == "-1"

# count the moved rows per demographic / specialty / model run and shape them
# like outpatient records
op_group_columns = ["age_group", "sex", "imd04_decile", "ethnos", "tretspef", "model_run"]
ip_op_rows = (ip
  .merge(ip_mr.loc[ip_op_row_ix, ["model_run"]], left_index = True, right_index = True)
  .value_counts(subset = op_group_columns)
  .to_frame("attendances")
  .reset_index()
  .assign(is_first = False, has_procedures = True, tele_attendances = 0)
)

In [11]:
# keep only the model rows that stay in IP, and attach the baseline attributes.
# classpat and speldur are dropped from the baseline side so that the values
# from the model results are the ones carried forward.
ip_mr = pd.merge(
  ip.drop(columns = ["classpat", "speldur"]),
  ip_mr.loc[~ip_op_row_ix],
  left_index = True,
  right_index = True
)

In [12]:
# join the results and the baseline data together
# tag each frame with its origin so the rows can be told apart once stacked
ip["type"] = "baseline"
ip_mr["type"] = "model"
# baseline rows are given model_run 0
ip["model_run"] = 0
# NOTE: this rebinds `ip` to baseline + model rows, so re-running this cell
# without re-running the cells above would stack the data again
ip = pd.concat([ip, ip_mr])

In [13]:
# free up memory: ip_mr's rows now live in `ip`, so drop this reference to the frame
ip_mr = None

In [14]:
# create an admission group column: admission methods starting with "1" are
# elective, everything else defaults to non-elective
ip["admission_group"] = "non-elective"
# na = False: a missing admimeth stays "non-elective" instead of producing an NA
# in the mask, which .loc refuses to index with
is_elective_method = ip["admimeth"].str.startswith("1", na = False)
ip.loc[is_elective_method, "admission_group"] = "elective"
# quick dq fix: convert any "non-elective" daycases to "elective"
is_daycase = ip["classpat"].isin(["2", "3"])
ip.loc[is_daycase, "admission_group"] = "elective"
# create a "pod" column, starting with the admission group and appending a
# suffix based on classpat; other classpat values keep the bare admission group
ip["pod"] = ip["admission_group"]
ip.loc[ip["classpat"].isin(["1", "4"]), "pod"] += "_admission"
ip.loc[is_daycase, "pod"] += "_daycase"
ip.loc[ip["classpat"] == "5", "pod"] += "_birth-episode"
# bed days are the spell duration plus one
ip["beddays"] = ip["speldur"] + 1

### IP aggregations

In [15]:
# one row per group: admissions is the number of rows in the group, beddays the
# total bed days. Named aggregation with string function names replaces passing
# `len` / `np.sum` callables, which pandas has deprecated for groupby.agg, and
# removes the need for a separate rename.
ip_agg = (ip
  .groupby(["age_group", "sex", "tretspef", "type", "model_run", "pod"], as_index = False)
  .agg(admissions = ("speldur", "size"), beddays = ("beddays", "sum"))
)

## Outpatients

In [16]:
# make sure to convert imd04_decile to a string
# (it is turned back into a categorical, using ip's categories, after the
# baseline and model rows have been combined)
op["imd04_decile"] = op["imd04_decile"].astype(str)

In [17]:
# attach the baseline attributes to each outpatient model result row; the "rn"
# index is only needed for the join, so it is discarded afterwards
op_attribute_columns = [
  "age_group", "sex", "imd04_decile", "ethnos", "tretspef", "is_first", "has_procedures"
]
op_mr = pd.merge(
  op[op_attribute_columns],
  load_model_results("op"),
  left_index = True,
  right_index = True
).reset_index(drop = True)

In [18]:
# add the rows that were moved out of the inpatient model results into outpatients
op_mr = pd.concat([op_mr, ip_op_rows])

In [19]:
# join the results and the baseline data together
op["type"] = "baseline"
op_mr["type"] = "model"
op["model_run"] = 0
op = (pd.concat([op[op_mr.columns.tolist()].reset_index(drop = True), op_mr])
  .groupby(["age_group", "sex", "imd04_decile", "ethnos", "tretspef", "is_first", "has_procedures", "type", "model_run"], as_index = False)
  # .sum() instead of .agg(sum): passing the builtin to agg is deprecated in
  # pandas and currently just dispatches to the groupby sum anyway
  .sum()
)
# derive the pod: first vs follow-up, with procedures overriding either
op.loc[ op["is_first"], "pod"] = "op_first"
op.loc[~op["is_first"], "pod"] = "op_follow-up"
op.loc[op["has_procedures"], "pod"] = "op_procedure"
# repromote imd04_decile to categorical, make sure to use ip's categories
op["imd04_decile"] = pd.Categorical(
  op["imd04_decile"].astype("category"),
  ip["imd04_decile"].cat.categories
)

### OP Aggregations

In [20]:
# total attendances and tele-attendances per group. The string "sum" replaces
# the np.sum callable, which pandas has deprecated as a groupby.agg argument.
op_agg = (op
  .groupby(["age_group", "sex", "tretspef", "pod", "type", "model_run"], as_index = False)
  .agg({ "attendances": "sum", "tele_attendances": "sum" })
)

## A&E Data

In [21]:
# the A&E pod is the department type prefixed with "type-"
aae["pod"] = "type-" + aae["aedepttype"]

# attach the baseline attributes to each A&E model result row; the "rn" index is
# only needed for the join, so it is discarded afterwards
aae_attribute_columns = [
  "age_group", "sex", "imd04_decile", "ethnos", "aedepttype", "aearrivalmode", "pod"
]
aae_mr = pd.merge(
  aae[aae_attribute_columns],
  load_model_results("aae"),
  left_index = True,
  right_index = True
).reset_index(drop = True)

In [22]:
# join the results and the baseline data together
# tag each frame with its origin so the rows can be told apart once stacked
aae["type"] = "baseline"
aae_mr["type"] = "model"

# baseline rows are given model_run 0
aae["model_run"] = 0

In [23]:
# stack baseline and model rows: the baseline is first restricted to the model
# result columns (in the same order) and has its "rn" index dropped
shared_columns = aae_mr.columns.tolist()
aae_baseline = aae[shared_columns].reset_index(drop = True)
aae = pd.concat([aae_baseline, aae_mr])

In [24]:
# arrival mode "1" is counted as an ambulance arrival; anything else is a walk-in
aae["measure"] = np.where(aae["aearrivalmode"] == "1", "ambulance", "walk-in")

In [25]:
# total arrivals per group, already in the long "measure"/"value" layout used
# for the combined output. A&E has no treatment specialty, so a fixed "X01" is
# assigned to tretspef. The string "sum" replaces the np.sum callable, which
# pandas has deprecated as a groupby.agg argument.
aae_agg = (aae
  .groupby(["age_group", "sex", "pod", "type", "model_run", "measure"], as_index = False)
  .agg({ "arrivals": "sum" })
  .rename({"arrivals": "value"}, axis = "columns")
  .assign(dataset = "aae", tretspef = "X01")
)

# Write aggregated data

In [26]:
# identifier columns kept as-is when reshaping from wide to long
melt_id_columns = ["age_group", "sex", "tretspef", "dataset", "pod", "type", "model_run"]

# outpatients: one row per (group, measure) for attendances / tele_attendances
op_long = op_agg.assign(dataset = "op").melt(
  id_vars = melt_id_columns,
  value_vars = ["attendances", "tele_attendances"],
  var_name = "measure"
)

# inpatients: one row per (group, measure) for admissions / beddays
ip_long = ip_agg.assign(dataset = "ip").melt(
  id_vars = melt_id_columns,
  value_vars = ["admissions", "beddays"],
  var_name = "measure"
)

# aae_agg already has "measure" and "value" columns, so it is used unchanged
melted_data = [op_long, ip_long, aae_agg]


In [27]:

# stack the three long-format frames and put them in a stable order
sort_columns = ["sex", "dataset", "pod", "measure", "tretspef", "age_group", "type", "model_run"]
all_agg = pd.concat(melted_data).sort_values(by = sort_columns)
# update the type of the principal runs: model rows with model_run 0
is_principal = all_agg["model_run"].eq(0) & all_agg["type"].eq("model")
all_agg.loc[is_principal, "type"] = "principal"
# save the results
all_agg.to_parquet(f"{results_path}/model_results.parquet", index = False)
# show the results
all_agg

Unnamed: 0,age_group,sex,tretspef,dataset,pod,type,model_run,measure,value
0,0-4,1,X01,aae,type-01,baseline,0,ambulance,1158
2,0-4,1,X01,aae,type-01,principal,0,ambulance,1116
4,0-4,1,X01,aae,type-01,model,1,ambulance,1232
6,0-4,1,X01,aae,type-01,model,2,ambulance,1215
8,0-4,1,X01,aae,type-01,model,3,ambulance,1213
...,...,...,...,...,...,...,...,...,...
253295,85+,2,X01,op,op_procedure,model,252,tele_attendances,34
253296,85+,2,X01,op,op_procedure,model,253,tele_attendances,11
253297,85+,2,X01,op,op_procedure,model,254,tele_attendances,34
253298,85+,2,X01,op,op_procedure,model,255,tele_attendances,25
