In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## Setup

Choose the dataset, model name, and model run time below - this will locate what files to load

In [4]:
dataset = "synthetic"
model_name = "test"
model_run = "20220110_104353"

data_path = f"data/{dataset}"
results_path = f"{data_path}/results/{model_name}/{model_run}"

## Load Datasets

In [51]:
def load_dataset(type, *args):
  cols = ["rn", "age", "sex", "ethnos", "imd04_decile", "tretspef"]
  
  return (pq
    .read_pandas(f"{data_path}/{type}.parquet", *args)
    .to_pandas()
    .set_index("rn")
  )

In [53]:
ip = load_dataset("ip", [
  "rn", "age", "sex", "ethnos", "imd04_decile", "tretspef",
  "admidate", "disdate", "speldur", "epitype", "admimeth", "dismeth"
])
ip

Unnamed: 0_level_0,age,sex,ethnos,imd04_decile,tretspef,admidate,disdate,speldur,epitype,admimeth,dismeth
rn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5512,1,1,99,Least deprived 10%,100,2018-08-14,2018-08-14,0.0,1,11,1
20201,17,1,99,Less deprived 30-40%,100,2018-12-07,2018-12-07,0.0,1,13,1
113524,28,1,99,More deprived 10-20%,100,2018-10-13,2018-10-13,0.0,1,13,1
104776,30,1,99,Less deprived 20-30%,100,2018-07-14,2018-07-14,0.0,1,12,1
52848,37,1,99,Less deprived 10-20%,100,2018-07-18,2018-07-18,0.0,1,11,1
...,...,...,...,...,...,...,...,...,...,...,...
100613,76,2,Z,Less deprived 30-40%,X01,2018-06-12,2018-06-12,0.0,1,81,1
79868,76,2,Z,Least deprived 10%,X01,2018-12-19,2018-12-28,9.0,1,81,1
42145,78,2,Z,Less deprived 20-30%,X01,2018-11-30,2018-12-04,4.0,1,81,1
116143,87,2,Z,Least deprived 10%,X01,2018-03-30,2018-03-30,0.0,1,81,1


In [47]:
op = load_dataset("op", [
  "rn", "age", "sex", "ethnos", "imd04_decile", "tretspef",
  "is_surgical_specialty", "is_first", "has_procedures", "attendances", "tele_attendances"
])
op

KeyError: "['hsagrp'] not found in axis"

In [26]:
aae = load_dataset("aae")

## Load Model Results

In [37]:
partitioning = pa.dataset.HivePartitioning(pa.schema([("model_run", pa.int32())]))
def load_model_results(type):
  ds = pq.ParquetDataset(
    f"{results_path}/{type}/",
    partitioning = partitioning,
    use_legacy_dataset = False
  )
  df = ds.read_pandas().to_pandas()
  return df.set_index("rn")

In [19]:
ip_mr = load_model_results("ip")

In [20]:
op_mr = load_model_results("op")

In [21]:
aae_mr = load_model_results("aae")

## Process IP data

Rows that have a classpat of -1 need to be moved into OP.

In [32]:
ip_op_row_ix = ip_mr["classpat"] == "-1"

ip_op_rows = ip_mr[ip_op_row_ix]
ip_mr = ip_mr[~ip_op_row_ix]

In [35]:
(ip
  .merge(ip_op_rows[["model_run"]], left_index = True, right_index = True)
  .groupby(["age", "sex", "ethnos", "tretspef"]
]


Unnamed: 0_level_0,age,sex,ethnos,imd04_decile,classpat,tretspef,admidate,disdate,speldur,epitype,admimeth,dismeth,hsagrp,model_run
rn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
188,28,2,A,Less deprived 40-50%,2,X01,2018-07-18,2018-07-18,0.0,1,11,1,daycase,1
188,28,2,A,Less deprived 40-50%,2,X01,2018-07-18,2018-07-18,0.0,1,11,1,daycase,102
188,28,2,A,Less deprived 40-50%,2,X01,2018-07-18,2018-07-18,0.0,1,11,1,daycase,128
188,28,2,A,Less deprived 40-50%,2,X01,2018-07-18,2018-07-18,0.0,1,11,1,daycase,134
188,28,2,A,Less deprived 40-50%,2,X01,2018-07-18,2018-07-18,0.0,1,11,1,daycase,134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129010,75,2,A,More deprived 30-40%,2,X01,2019-03-13,2019-03-13,0.0,1,11,1,daycase,93
129010,75,2,A,More deprived 30-40%,2,X01,2019-03-13,2019-03-13,0.0,1,11,1,daycase,94
129010,75,2,A,More deprived 30-40%,2,X01,2019-03-13,2019-03-13,0.0,1,11,1,daycase,95
129010,75,2,A,More deprived 30-40%,2,X01,2019-03-13,2019-03-13,0.0,1,11,1,daycase,95


In [36]:
op

Unnamed: 0_level_0,age,sex,ethnos,imd04_decile,tretspef,refsourc,is_surgical_specialty,is_adult,is_gp_ref,is_cons_cons_ref,is_first,has_procedures,type,hsagrp,attendances,tele_attendances
rn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
298081,81,1,99,More deprived 40-50%,100,01,False,True,False,False,True,False,adult_non-surgical,op_adult_non-surgical_first,1,0
190482,59,1,A,More deprived 10-20%,100,01,False,True,False,False,True,False,adult_non-surgical,op_adult_non-surgical_first,1,0
264729,74,1,A,Most deprived 10%,100,01,False,True,False,False,True,False,adult_non-surgical,op_adult_non-surgical_first,1,0
265608,74,1,A,Less deprived 20-30%,100,01,False,True,False,False,True,False,adult_non-surgical,op_adult_non-surgical_first,1,0
270331,75,1,A,More deprived 40-50%,100,01,False,True,False,False,True,False,adult_non-surgical,op_adult_non-surgical_first,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3341,0,2,Z,Less deprived 10-20%,X01,97,True,False,False,False,True,True,child_surgical,op_child_surgical_procedure,1,0
2009,0,2,A,Most deprived 10%,X01,99,True,False,False,False,True,True,child_surgical,op_child_surgical_procedure,1,0
6635,2,2,A,Less deprived 20-30%,X01,99,True,False,False,False,True,True,child_surgical,op_child_surgical_procedure,1,0
25853,11,2,A,More deprived 20-30%,X01,99,True,False,False,False,True,True,child_surgical,op_child_surgical_procedure,1,0
