## Detailed results

This notebook is for producing detailed results for model v4.0.0.

Assumes you have already authenticated via the Azure CLI ([instructions here](https://github.com/The-Strategy-Unit/data_science/blob/fa37cbc01513127626364049124d71f06a35183a/blogs/posts/2024-05-22_storing-data-safely/azure_python.ipynb#L43-L47)). The notebook writes the detailed aggregations of the IP, OP, and AAE model results to a `data/` folder, in both CSV and Parquet formats.

Also assumes the scenario has already been run with `full_model_results = True`.

You can check whether this has happened using `nhpy.check_full_results`; if it has not, produce the full model results using `nhpy.run_full_results`.

In [None]:
# ⚠️ Set this to the path where the aggregated model results are saved.
# The value below is a placeholder: replace it before running the notebook.
# It is passed to az.load_agg_params and az.load_agg_results further down.

agg_results_folder = "aggregated-model-results/vX.X/RXX/scenarioname/datetime/"

In [None]:
# We want to be in the nhp_products root folder so that we can load nhpy.az
%cd ../..

import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from nhpy import az, process_data, process_results

%load_ext autoreload
%autoreload 2

In [None]:
# Setup: read the storage settings from the environment (after loading .env),
# connect to the results and data containers, and load the params stored with
# the aggregated results.

load_dotenv()
account_url = os.getenv("AZ_STORAGE_EP")
results_container = os.getenv("AZ_STORAGE_RESULTS")
data_container = os.getenv("AZ_STORAGE_DATA")
# API_KEY is read here but not used by anything in this cell, so it is not
# validated below.
api_key = os.getenv("API_KEY")

# os.getenv returns None for unset variables. Fail here with a clear message
# instead of passing None on to the connection calls below.
missing_settings = [
    name
    for name, value in {
        "AZ_STORAGE_EP": account_url,
        "AZ_STORAGE_RESULTS": results_container,
        "AZ_STORAGE_DATA": data_container,
    }.items()
    if not value
]
if missing_settings:
    raise EnvironmentError(
        "Missing environment variable(s): " + ", ".join(missing_settings)
    )

results_connection = az.connect_to_container(account_url, results_container)
data_connection = az.connect_to_container(account_url, data_container)
params = az.load_agg_params(results_connection, agg_results_folder)

## Setup

In [None]:
# Pull the scenario metadata out of the params loaded with the aggregated results

scenario_name, trust, model_version, baseline_year, run_id = (
    params[key]
    for key in ("scenario", "dataset", "app_version", "start_year", "create_datetime")
)

# Resolve the full data version to load from.
# The results folder name is truncated (e.g. v3.0, without the patch version),
# whereas the data is stored under the full version (e.g. v3.0.1).
model_version_data = az.find_latest_version(data_connection, model_version)
print(f"Using data: {model_version_data}")

In [None]:
# Make sure the output folder exists before any results are saved.
# exist_ok=True makes this a no-op when the folder is already there, without a
# separate existence check that could race with another process creating it.

output_dir = "notebooks/PRODUCT_detailed_results/data/"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the aggregated ("vanilla") model results and convert their format.
# Per the original note this is where the principal is added to the results;
# that happens inside nhpy, so confirm there if it matters.
# The converted frame is used further down to sanity-check the detailed totals.
actual_results_df = process_results.convert_results_format(
    az.load_agg_results(results_connection, agg_results_folder)
)

## Inpatients

In [None]:
# Load the inpatient ("ip") data file for `trust` and `baseline_year`, from the
# data version held in `model_version_data`.
original_df = az.load_data_file(
    data_connection, model_version_data, trust, "ip", baseline_year
)

In [None]:
# Collect the detailed inpatient results of every model run.
# `model_runs` maps each key of the processed results' "value" entry to a list
# holding that key's value for each run, in run order.
model_runs = {}

# We want to use the speldur and classpat from the results, not from the original df.
# Dropping them does not depend on the run, so it is done once here instead of
# copying the whole original frame on every iteration.
ip_baseline_df = original_df.drop(columns=["speldur", "classpat"])

n_runs = 256  # the loop covers model runs numbered 1 to 256
for run in range(1, n_runs + 1):
    df = az.load_model_run_results_file(
        results_connection, model_version, trust, scenario_name, run_id, "ip", run
    )
    # merge returns a new frame, so the shared baseline frame is not modified
    merged = ip_baseline_df.merge(df, on="rn", how="inner")
    results = process_data.process_ip_detailed_results(merged)
    for k, v in results.to_dict()["value"].items():
        model_runs.setdefault(k, []).append(v)

In [None]:
# Column names handed to process_model_runs_dict for the inpatient results.
# The beddays check below slices the resulting frame with eight index levels,
# the last being the measure.
ip_result_columns = [
    "sitetret",
    "age_group",
    "sex",
    "pod",
    "tretspef",
    "los_group",
    "maternity_delivery_in_spell",
    "measure",
]
model_runs_df = process_data.process_model_runs_dict(
    model_runs, columns=ip_result_columns
)

In [None]:
# Useful for checking if "main" model results from Azure line up with aggregated model results
# Not always the same because of rounding, so a difference of up to 1 is tolerated

# Total of the "mean" column over the beddays rows of the aggregated results
default_beddays_principal = (
    actual_results_df.loc[actual_results_df["measure"] == "beddays", "mean"]
    .sum()
    .astype(int)
)

# Same total from the detailed results: select every row whose last (eighth)
# index level is "beddays", whatever the other seven levels are.
beddays_rows = (slice(None),) * 7 + ("beddays",)
detailed_beddays_principal = (
    model_runs_df.loc[beddays_rows, :].sum().loc["mean"].astype(int)
)

# Explicit comparison instead of try/assert/bare-except: a bare except also
# swallows unrelated errors (and KeyboardInterrupt), and asserts are removed
# when Python runs with -O. On a mismatch both totals are printed.
beddays_match = abs(default_beddays_principal - detailed_beddays_principal) <= 1
if not beddays_match:
    print(default_beddays_principal)
    print(detailed_beddays_principal)

In [None]:
# Save the detailed inpatient results as both CSV and Parquet

ip_csv_path = (
    f"notebooks/PRODUCT_detailed_results/data/{scenario_name}_detailed_ip_results.csv"
)
ip_parquet_path = (
    f"notebooks/PRODUCT_detailed_results/data/{scenario_name}_detailed_ip_results.parquet"
)
model_runs_df.to_csv(ip_csv_path)
model_runs_df.to_parquet(ip_parquet_path)

## Outpatients

In [None]:
# Load the outpatient ("op") data, rename its "index" column to "rn" (the key
# the model-run results are merged on below) and replace missing values with
# "unknown".
original_df = (
    az.load_data_file(data_connection, model_version_data, trust, "op", baseline_year)
    .rename(columns={"index": "rn"})
    .fillna("unknown")
)

In [None]:
# Collect the detailed outpatient results of every model run.
# `op_model_runs` maps each key of the combined results' "value" entry to a
# list holding that key's value for each run, in run order.
op_model_runs = {}

# The attendance columns are dropped from the original data before merging,
# presumably so the values in each run's results are used (TODO confirm).
# The drop does not depend on the run, so it is done once here instead of
# copying the whole original frame on every iteration.
op_baseline_df = original_df.drop(columns=["attendances", "tele_attendances"])

n_runs = 256  # the loop covers model runs numbered 1 to 256
for run in range(1, n_runs + 1):
    df = az.load_model_run_results_file(
        results_connection, model_version, trust, scenario_name, run_id, "op", run
    )
    # Each run's results must have the same number of rows as the original data
    assert df.shape[0] == original_df.shape[0], (
        f"OP run {run}: {df.shape[0]} result rows, "
        f"expected {original_df.shape[0]}"
    )
    # merge returns a new frame, so the shared baseline frame is not modified
    merged = op_baseline_df.merge(df, on="rn", how="inner")
    results = process_data.process_op_detailed_results(merged)
    # Handle activity converted from IP to OP
    df_conv = az.load_model_run_results_file(
        results_connection,
        model_version,
        trust,
        scenario_name,
        run_id,
        "op_conversion",
        run,
    )
    df_conv = process_data.process_op_converted_from_ip(df_conv)
    results = process_data.combine_converted_with_main_results(df_conv, results)
    for k, v in results.to_dict()["value"].items():
        op_model_runs.setdefault(k, []).append(v)

In [None]:
# Column names handed to process_model_runs_dict for the outpatient results.
# The attendances check below slices the resulting frame with five index
# levels, the last being the measure.
op_result_columns = ["sitetret", "pod", "age_group", "tretspef", "measure"]
op_model_runs_df = process_data.process_model_runs_dict(
    op_model_runs, columns=op_result_columns
)
# Preview the first rows
op_model_runs_df.head()

In [None]:
# Useful for checking if "main" model results from Azure line up with aggregated model results using "full model results"

# Total from the detailed results: round to 1 decimal place, select every row
# whose last (fifth) index level is "attendances", sum, and take the "mean" entry.
attendances_rows = (slice(None),) * 4 + ("attendances",)
detailed_attendances_principal = (
    op_model_runs_df.round(1)
    .loc[attendances_rows, :]
    .sum()
    .astype(int)
    .loc["mean"]
)

# Total of the "mean" column over the attendances rows of the aggregated results
default_attendances_principal = (
    actual_results_df.loc[actual_results_df["measure"] == "attendances", "mean"]
    .sum()
    .astype(int)
)

# They're not always exactly the same because of rounding, so a difference of
# up to 1 is tolerated.
# Explicit comparison instead of try/assert/bare-except: a bare except also
# swallows unrelated errors (and KeyboardInterrupt), and asserts are removed
# when Python runs with -O. On a mismatch both totals are printed.
attendances_match = (
    abs(default_attendances_principal - detailed_attendances_principal) <= 1
)
if not attendances_match:
    print(default_attendances_principal)
    print(detailed_attendances_principal)

In [None]:
# Save the detailed outpatient results as both CSV and Parquet
op_csv_path = (
    f"notebooks/PRODUCT_detailed_results/data/{scenario_name}_detailed_op_results.csv"
)
op_parquet_path = (
    f"notebooks/PRODUCT_detailed_results/data/{scenario_name}_detailed_op_results.parquet"
)
op_model_runs_df.to_csv(op_csv_path)
op_model_runs_df.to_parquet(op_parquet_path)

## AAE

In [None]:
# Load the A&E ("aae") data, rename its "index" column to "rn" (the key the
# model-run results are merged on below) and replace missing values with
# "unknown".
original_df = (
    az.load_data_file(data_connection, model_version_data, trust, "aae", baseline_year)
    .rename(columns={"index": "rn"})
    .fillna("unknown")
)

In [None]:
# Collect the detailed A&E results of every model run.
# `ae_model_runs` maps each key of the combined results' "arrivals" entry to a
# list holding that key's value for each run, in run order.
ae_model_runs = {}

# The arrivals column is dropped from the original data before merging,
# presumably so the values in each run's results are used (TODO confirm).
# The drop does not depend on the run, so it is done once here rather than on
# every iteration.
ae_baseline_df = original_df.drop(columns=["arrivals"])

n_runs = 256  # the loop covers model runs numbered 1 to 256
for run in range(1, n_runs + 1):
    df = az.load_model_run_results_file(
        results_connection, model_version, trust, scenario_name, run_id, "aae", run
    )
    # Each run's results must have the same number of rows as the original data
    assert len(df) == len(original_df), (
        f"AAE run {run}: {len(df)} result rows, expected {len(original_df)}"
    )
    # merge returns a new frame, so the shared baseline frame is not modified
    merged = ae_baseline_df.merge(df, on="rn", how="inner")
    results = process_data.process_aae_results(merged)
    # Handle activity converted from IP to AAE (the "sdec_conversion" results)
    df_conv = az.load_model_run_results_file(
        results_connection,
        model_version,
        trust,
        scenario_name,
        run_id,
        "sdec_conversion",
        run,
    )
    df_conv = process_data.process_aae_converted_from_ip(df_conv)
    results = process_data.combine_converted_with_main_results(df_conv, results)
    for k, v in results.to_dict()["arrivals"].items():
        ae_model_runs.setdefault(k, []).append(v)

In [None]:
# Column names handed to process_model_runs_dict for the A&E results.
# The ambulance and walk-in checks below slice the resulting frame with seven
# index levels, the last being the measure.
ae_result_columns = [
    "sitetret",
    "pod",
    "age_group",
    "attendance_category",
    "aedepttype",
    "acuity",
    "measure",
]
ae_model_runs_df = process_data.process_model_runs_dict(
    ae_model_runs, columns=ae_result_columns
)


In [None]:
# Useful for checking if "main" model results from Azure line up with aggregated model results using full model results

# Total from the detailed results: select every row whose last (seventh) index
# level is "ambulance", sum, take the "mean" entry and round it.
ambulance_rows = (slice(None),) * 6 + ("ambulance",)
detailed_ambulance_principal = (
    ae_model_runs_df.loc[ambulance_rows, :].sum().loc["mean"].round(0)
)

# Total of the "mean" column over the ambulance rows of the aggregated results
default_ambulance_principal = (
    actual_results_df.loc[actual_results_df["measure"] == "ambulance", "mean"]
    .sum()
    .round(0)
)

# They're not always exactly the same because of rounding, so a difference of
# up to 1 is tolerated.
# Explicit comparison instead of try/assert/bare-except: a bare except also
# swallows unrelated errors (and KeyboardInterrupt), and asserts are removed
# when Python runs with -O. `not (... <= 1)` also reports a NaN difference,
# as the original assertion did.
ambulance_match = abs(default_ambulance_principal - detailed_ambulance_principal) <= 1
if not ambulance_match:
    print("OH NO!!")
    print(default_ambulance_principal)
    print(detailed_ambulance_principal)

In [None]:
# Useful for checking if "main" model results from Azure line up with aggregated model results using full model results

# Total from the detailed results: select every row whose last (seventh) index
# level is "walk-in", sum, take the "mean" entry and round it.
walkin_rows = (slice(None),) * 6 + ("walk-in",)
detailed_walkins_principal = (
    ae_model_runs_df.loc[walkin_rows, :].sum().loc["mean"].round(0)
)

# Total of the "mean" column over the walk-in rows of the aggregated results
default_walkins_principal = (
    actual_results_df.loc[actual_results_df["measure"] == "walk-in", "mean"]
    .sum()
    .round(0)
)

# They're not always exactly the same because of rounding, so a difference of
# up to 1 is tolerated.
# Explicit comparison instead of try/assert/bare-except: a bare except also
# swallows unrelated errors (and KeyboardInterrupt), and asserts are removed
# when Python runs with -O. `not (... <= 1)` also reports a NaN difference,
# as the original assertion did.
walkins_match = abs(default_walkins_principal - detailed_walkins_principal) <= 1
if not walkins_match:
    print("OH NO!!")
    print(default_walkins_principal)
    print(detailed_walkins_principal)

In [None]:
# Save the detailed A&E results as both CSV and Parquet
ae_csv_path = (
    f"notebooks/PRODUCT_detailed_results/data/{scenario_name}_detailed_ae_results.csv"
)
ae_parquet_path = (
    f"notebooks/PRODUCT_detailed_results/data/{scenario_name}_detailed_ae_results.parquet"
)
ae_model_runs_df.to_csv(ae_csv_path)
ae_model_runs_df.to_parquet(ae_parquet_path)