# Detailed baseline including mitigators

This is an updated version of the site-level mitigators file. It produces 7 CSVs for each trust:
- detailed ip baseline
- detailed op baseline
- detailed aae baseline
- ip_activity_avoidance
- ip_efficiencies
- op_activity_avoidance
- aae_activity_avoidance

The only thing you need to set is `model_version` in the first code cell. The notebook reads the data directly from Azure and loops through all the trusts currently in the NHP programme, which are listed in the `trusts` variable. The baseline year is the same for every trust: it is set as `y` (currently 2023) in the Inpatients loop cell and reused by the later cells.

⚠️ this notebook only works for v3.7 and later.

## Prepare workspace

In [None]:
# Model version whose data is read from Azure; it also names the local output
# directory and the CSV files. Use the full 'vX.Y.Z' form, or 'dev'.
model_version = 'v4.0.0'

In [None]:
import os
import glob
import pandas as pd
import io
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.storage.blob import ContainerClient

# Read the Azure connection settings from the environment (a .env file is honoured)
load_dotenv()
account_url = os.environ.get('AZ_STORAGE_EP')
container_name_data = os.environ.get('AZ_STORAGE_CONTAINER')
container_name_inputs = os.environ.get('AZ_STORAGE_INPUTS')

# A single credential is shared by both container clients
default_credential = DefaultAzureCredential()

# Data container: source of the parquet files read below
container_client_data = ContainerClient(
    account_url=account_url,
    container_name=container_name_data,
    credential=default_credential,
)
# Inputs container: destination of the CSVs uploaded at the end
container_client_inputs = ContainerClient(
    account_url=account_url,
    container_name=container_name_inputs,
    credential=default_credential,
)

In [None]:
# Trust codes to process; every loop below iterates over this list.
# NOTE(review): "R0A" was listed twice, which made that trust be processed and
# uploaded twice. The duplicate has been dropped - confirm it was not a typo
# for a different trust.
trusts = [
    "RCF",
    "RDU",
    "RGN",
    "RGP",
    "RCX",
    "RBT",
    "RN5",
    "RAS",
    "RQW",
    "RWG",
    "R1H",
    "RWE",
    "RVR",
    "RNQ",
    "RH5",
    "RA9",
    "R0A",
    "RXC",
    "RTX",
    "RH8",
    "RHW",
    "RXN",
    "RYJ",
    "RX1",
    "RGR",
    "RD8",
    "REF",
]

In [None]:
def simplify_version(model):
    """Return the model version without its patch component.

    'dev' is returned as-is. Any other value is split on '.' and its first
    two parts are re-joined, e.g. 'v3.0.0' becomes 'v3.0'.
    """
    if model == "dev":
        return 'dev'
    parts = model.split(".")
    return f'{parts[0]}.{parts[1]}'

def enstring_dir(model, trust, year):
    """Build the relative write directory for a model version, trust and year.

    The result has the form '<vX.Y>/<trust>/baseline_data_detailed/<year>',
    which mimics the Azure path.
    """
    version_dir = simplify_version(model)
    return f'{version_dir}/{trust}/baseline_data_detailed/{year}'

def create_dir(model, trust, year):
    """Create the local write directory for a model version, trust and year.

    Intermediate directories are created as needed; nothing happens if the
    directory already exists.
    """
    target_dir = enstring_dir(model, trust, year)
    # exist_ok replaces the race-prone "check os.path.exists, then create" pair
    os.makedirs(target_dir, exist_ok=True)

def create_path_stub(model, trust, year):
    """Return the partial output filepath shared by a trust's CSVs.

    The stub is '<write dir>/<trust>_<model>_<year>'; callers append a
    suffix such as '_detailed_ip_baseline.csv'.
    """
    write_dir = enstring_dir(model, trust, year)
    return f'{write_dir}/{trust}_{model}_{year}'

## Generate outputs

### Inpatients

In [None]:
def add_pod_to_data_v3(model_results):
    """Add 'beddays' and 'pod' columns to the inpatient records, in place.

    'beddays' is 'speldur' + 1. 'pod' defaults to 'ip_<group>_admission' and
    is overridden for 'classpat' values '2', '3' and '4' (this follows the
    _add_pod_to_data() function). The same DataFrame is returned.
    """
    model_results['beddays'] = model_results['speldur'] + 1

    # Default pod comes from the admission group
    model_results["pod"] = "ip_" + model_results["group"] + "_admission"

    # classpat-specific pods replace the default
    pod_by_classpat = {
        "2": "ip_elective_daycase",
        "3": "ip_regular_day_attender",
        "4": "ip_regular_night_attender",
    }
    for classpat_code, pod_name in pod_by_classpat.items():
        matches = model_results["classpat"] == classpat_code
        model_results.loc[matches, "pod"] = pod_name

    return model_results

def get_data_v3(version, trust, activity_type, year=2023):
    """Download one trust's parquet file from the data container.

    The first blob whose name ends in '.parquet' under the prefix
    '<version>/<activity_type>/fyear=<year>/dataset=<trust>' is downloaded
    and parsed with pandas.

    Args:
        version: model version segment of the blob path.
        trust: trust code used in the 'dataset=' path segment.
        activity_type: dataset name, e.g. 'ip', 'op' or 'aae'.
        year: value of the 'fyear=' path segment.

    Returns:
        The parquet file's contents as a DataFrame.

    Raises:
        FileNotFoundError: if no parquet blob exists under the prefix
            (this used to surface as a bare IndexError).
    """
    prefix = f'{version}/{activity_type}/fyear={year}/dataset={trust}'
    blob_names = container_client_data.list_blob_names(name_starts_with=prefix)
    # Stop at the first match instead of listing every blob under the prefix
    blob_name = next((b for b in blob_names if b.endswith('.parquet')), None)
    if blob_name is None:
        raise FileNotFoundError(f'No parquet blob found under {prefix!r}')

    blob_client = container_client_data.get_blob_client(blob_name)
    payload = blob_client.download_blob().readall()
    return pd.read_parquet(io.BytesIO(payload))

def aggregate_data(data):
    """Summarise inpatient records by site, group, pod and specialty.

    Returns a DataFrame indexed by ('sitetret', 'group', 'pod', 'tretspef')
    with the number of 'rn' values as 'admissions' and the sums of
    'beddays' and 'speldur'.
    """
    grouped = data.groupby(['sitetret', 'group', 'pod', 'tretspef'])
    return grouped.agg(
        admissions=('rn', 'count'),
        beddays=('beddays', 'sum'),
        speldur=('speldur', 'sum'),
    )


In [None]:
# Baseline year, identical for every trust. It is assigned once, outside the
# loop, and stays a notebook-level name because the outpatients, AAE and Azure
# upload cells reuse `y`.
y = 2023

for t in trusts:
    create_dir(model_version, t, y)  # prep write directory
    path_stub = create_path_stub(model_version, t, y)  # prep partial filepath

    # get main ip data file; add_pod_to_data_v3 adds both 'pod' and 'beddays',
    # so 'beddays' does not need to be recomputed here
    data = get_data_v3(model_version, t, 'ip', y)
    data = add_pod_to_data_v3(data)

    # efficiencies: one row per (strategy, site) with totals over matched spells
    ip_eff = get_data_v3(model_version, t, 'ip_efficiencies_strategies', y)
    ip_eff = ip_eff.merge(data, how='inner', on='rn')
    ip_eff = (
        ip_eff.groupby(['strategy', 'sitetret'])
        .agg({'speldur': 'sum', 'beddays': 'sum', 'rn': 'count'})
        .round(2)
        .rename(columns={'rn': 'admissions'})
    )
    ip_eff.to_csv(f'{path_stub}_ip_efficiencies.csv')

    # activity avoidance: as above, plus the summed sample rate
    ip_aa = get_data_v3(model_version, t, 'ip_activity_avoidance_strategies', y)
    ip_aa = ip_aa.merge(data, how='inner', on='rn')
    ip_aa = (
        ip_aa.groupby(['strategy', 'sitetret'])
        .agg({'rn': 'count', 'sample_rate': 'sum', 'beddays': 'sum'})
        .rename(columns={'sample_rate': 'sample_rate_sum', 'rn': 'admissions'})
    )
    ip_aa.to_csv(f'{path_stub}_ip_activity_avoidance.csv')

    # aggregate and save main ip data file
    data = aggregate_data(data)
    data.to_csv(f'{path_stub}_detailed_ip_baseline.csv')


### Outpatients

In [None]:
def get_cons_cons(df):
    """Sum attendances for consultant-to-consultant referrals.

    Keeps the rows where 'is_cons_cons_ref' is true and returns the summed
    'attendances' and 'tele_attendances' indexed by ('sitetret', 'type').
    """
    wanted_cols = ['sitetret', 'type', 'attendances', 'tele_attendances']
    referrals = df.loc[df['is_cons_cons_ref'], wanted_cols]
    return referrals.groupby(['sitetret', 'type']).sum()

## Followup reduction
def get_followup_df(df):
    """Sum follow-up attendances: not a first attendance and no procedures.

    Returns the summed 'attendances' and 'tele_attendances' indexed by
    ('sitetret', 'type') for rows where both 'has_procedures' and
    'is_first' are false.
    """
    # "not A and not B" written as "not (A or B)"
    excluded = df['has_procedures'] | df['is_first']
    wanted_cols = ['sitetret', 'type', 'attendances', 'tele_attendances']
    followups = df.loc[~excluded, wanted_cols]
    return followups.groupby(['sitetret', 'type']).sum()

## GP referred first attendance
def get_gp_df(df):
    """Sum first attendances that were GP referred.

    Returns the summed 'attendances' and 'tele_attendances' indexed by
    ('sitetret', 'type') for rows where both 'is_gp_ref' and 'is_first'
    are true.
    """
    gp_first = df['is_gp_ref'] & df['is_first']
    wanted_cols = ['sitetret', 'type', 'attendances', 'tele_attendances']
    selected = df.loc[gp_first, wanted_cols]
    return selected.groupby(['sitetret', 'type']).sum()

In [None]:
for t in trusts:
    # NOTE: `y` (baseline year) is the notebook-level value set in the
    # Inpatients cell, so that cell must have been run first.
    op_data = get_data_v3(model_version, t, 'op', y)

    # Convert-to-tele: every attendance without procedures, by site and type
    without_procedures = op_data[~op_data['has_procedures']]
    op_aa = (
        without_procedures
        .groupby(['sitetret', 'type'])[['attendances', 'tele_attendances']]
        .sum()
        .rename(columns={
            'attendances': 'convert_to_tele: attendances',
            'tele_attendances': 'convert_to_tele: tele_attendances',
        })
    )

    # Remaining mitigators: (aggregation function, output column prefix)
    mitigators = [
        (get_cons_cons, 'consultant_to_consultant_referrals'),
        (get_followup_df, 'followup_reduction'),
        (get_gp_df, 'gp_referred'),
    ]
    for aggregate, prefix in mitigators:
        counts = aggregate(op_data).rename(columns={
            'attendances': f'{prefix}_attendances',
            'tele_attendances': f'{prefix}_tele_attendances',
        })
        op_aa = op_aa.merge(counts, left_index=True, right_index=True, how='outer')

    # Site/type pairs missing from a mitigator count as zero
    op_aa = op_aa.fillna(0).astype(int)

    create_dir(model_version, t, y)  # prep write directory
    path_stub = create_path_stub(model_version, t, y)  # prep partial filepath
    op_aa.to_csv(f'{path_stub}_op_activity_avoidance.csv')

    op_baseline = op_data.groupby(['sitetret', 'group', 'tretspef', 'type']).agg(
        {'attendances': 'sum', 'tele_attendances': 'sum'}
    )
    op_baseline.to_csv(f'{path_stub}_detailed_op_baseline.csv')

### AAE

In [None]:
def get_ae_aggregation(df, col_name):
    """Sum arrivals for rows flagged by a boolean column.

    Rows where `col_name` is true are grouped by ('sitetret', 'hsagrp') and
    their 'arrivals' summed; the single result column is named `col_name`.
    """
    flagged = df.loc[df[col_name]]
    arrivals = flagged.groupby(['sitetret', 'hsagrp'])['arrivals'].sum()
    return arrivals.to_frame(col_name)

In [None]:
for t in trusts:
    # NOTE: `y` (baseline year) is the notebook-level value set in the
    # Inpatients cell, so that cell must have been run first.
    ae_data = get_data_v3(model_version, t, 'aae', y)

    # Start from a column-less frame holding every (sitetret, hsagrp) pair
    site_groups = ae_data.groupby(['sitetret', 'hsagrp']).count().index
    ae_df = pd.DataFrame(index=site_groups)

    # One output column per mitigator flag; pairs with no flagged rows get 0
    mitigator_flags = [
        'is_discharged_no_treatment',
        'is_frequent_attender',
        'is_left_before_treatment',
        'is_low_cost_referred_or_discharged',
    ]
    for flag in mitigator_flags:
        flagged_arrivals = get_ae_aggregation(ae_data, flag)
        ae_df = ae_df.merge(
            flagged_arrivals, left_index=True, right_index=True, how="outer"
        ).fillna(0)

    create_dir(model_version, t, y)  # prep write directory
    path_stub = create_path_stub(model_version, t, y)  # prep partial filepath

    ae_df.to_csv(f'{path_stub}_aae_activity_avoidance.csv')

    baseline_keys = ['sitetret', 'hsagrp', 'aedepttype', 'attendance_category']
    ae_baseline = ae_data.groupby(baseline_keys)[['arrivals']].sum()
    ae_baseline.to_csv(f'{path_stub}_detailed_aae_baseline.csv')


## Copy data to Azure

CSVs were generated above in the local `vX.Y/XYZ/baseline_data_detailed/YYYY/` directory structure. They'll be copied to the same directory structure on Azure.

In [None]:
for t in trusts:
    # NOTE: `y` (baseline year) is the notebook-level value set in the
    # Inpatients cell, so that cell must have been run first.
    path = enstring_dir(model_version, t, y)

    # The pattern has no '**', so a plain (non-recursive) glob is sufficient
    for filepath in glob.glob(f'{path}/*.csv'):
        # glob joins with the OS separator; blob names should always use '/'
        blob_name = filepath.replace(os.sep, '/')
        with open(filepath, mode="rb") as csv_file:
            try:
                container_client_inputs.upload_blob(
                    name=blob_name, data=csv_file, overwrite=True
                )
            except Exception as err:
                # Best effort: report the failure and carry on with the other
                # files. Unlike a bare `except:`, this lets KeyboardInterrupt
                # through and shows why the upload failed.
                print(f'{filepath}: upload failed: {err!r}')