# Import Packages

In [17]:
# declare imports
# import glob
# import pandas as pd
# import dask.dataframe as dd
# from pandas.api.types import is_numeric_dtype
# import matplotlib.pyplot as plt
# import matplotlib as mpl
# from pathlib import Path
# import numpy as np
# import seaborn as sns
# import colorcet as cc
# import math
# from pearllib import group_title_dict, NA_ACCORD_group_title_dict
# import yaml

In [18]:
import argparse
from datetime import datetime
from pathlib import Path
import os
import sys
import dask.dataframe as dd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
import colorcet as cc

path_to_functions = os.path.abspath("../src")

# Add this folder to the system path
sys.path.append(path_to_functions)

from pearl.post_processing.bmi import (
    add_summary,
    calc_overall_risk,
    calc_percentage,
    calc_percentage_and_add_summary,
    calc_risk_by_group,
    clean_control,
    create_summary_table,
    group_order,
    group_order_with_sub_total,
    group_title_dict,
    palette,
    rearrange_group_order,
    round_thousand,
    calc_dm_prop,
    add_sub_total,
    calc_overall_bmi_risk,
)

start_time = datetime.now()
df_summary_dict = {}

# Define the argument parser

baseline_dir = Path('../out/S0_10/combined')
variable_dir = Path('../out/S3_10/combined')
out_dir = Path('../results')

start_year = 2013
end_year = 2017

# Define Group Order

In [19]:
group_order = [
    "Black HET Women",
    "White HET Women",
    "Hispanic HET Women",
    "Black HET Men",
    "White HET Men",
    "Hispanic HET Men",
    "Black WWID",
    "White WWID",
    "Hispanic WWID",
    "Black MWID",
    "White MWID",
    "Hispanic MWID",
    "Black MSM",
    "White MSM",
    "Hispanic MSM",
    "Overall"
]

# Plot Color Setting

In [20]:
#define color pallete
palette = sns.color_palette(cc.glasbey_light, n_colors=16)

## calc_percentage

In [21]:
round_thousand = lambda x: int(math.ceil(x / 100.0)) * 100 if  x > 1000 else x

# Old Risk Calculation

In [22]:
def calc_risk_by_group_old(df, years_follow_up):

    # filter for only x-year follow up with dm
    df_follow_up = df.loc[(df['years_after_h1yy'] > 0) & (df['years_after_h1yy'] <= years_follow_up)]

    # group by replication and group and sum
    df_follow_up_sum = df_follow_up.groupby(['group', 'replication'])['n'].sum().reset_index()
    df_follow_up_sum = df_follow_up_sum.rename(columns={'n': 'dm_num'})

    # group by replication and group and sum
    df_all_sum = df.groupby(['group', 'replication'])['n'].sum().reset_index()
    df_all_sum = df_all_sum.rename(columns={'n': 'num'})

    # merge dataframes
    group_dm_risk_table = dd.merge(df_follow_up_sum, df_all_sum, how='left')

    # calculate risk
    group_dm_risk_table['risk'] = group_dm_risk_table['dm_num'] / group_dm_risk_table['num']

    return group_dm_risk_table

# Define Data Path

In [23]:
config_type = 'baseline'

if config_type != 'baseline':
    config_file_path = f'../config/S0_{config_type}.yaml' #gitignore

    # Load the YAML file
    with open(config_file_path, 'r') as file:
        config_data = yaml.safe_load(file)
    
    coverage_rate = config_data['bmi_intervention_coverage']
    effectiveness = config_data['bmi_intervention_effectiveness']
    
    # Set path to input folder
    s0_data_dir = Path(f"../out/S0_{config_type}/combined/") #gitignore
    s1_data_dir = Path(f"../out/S3_{config_type}/combined/") #gitignore
else:
    # For Baseline running
    s0_data_dir = Path(f"../out/S0_10/combined/") #gitignore
    s1_data_dir = Path(f"../out/S3_10/combined/") #gitignore

    coverage_rate = 1
    effectiveness = 1

# Create Summary Table

In [24]:
SA_summary_df = pd.DataFrame()
SA_summary_df['group'] = [
        "Black HET Women",
        "White HET Women",
        "Hispanic HET Women",
        "Black HET Men",
        "White HET Men",
        "Hispanic HET Men",
        "Black WWID",
        "White WWID",
        "Hispanic WWID",
        "Black MWID",
        "White MWID",
        "Hispanic MWID",
        "Black MSM",
        "White MSM",
        "Hispanic MSM",
        "Overall"
    ]

# Control Arm

## Number of People received BMI intervention

In [25]:
# we will look at the "bmi_int_dm_prev.h5" for S0
bmi_int_dm_prev = dd.read_parquet(s0_data_dir /'dm_final_output.parquet').reset_index()

# Add Overall
all_but_group = list(bmi_int_dm_prev.columns[1:])
bmi_int_dm_prev_overall = bmi_int_dm_prev.groupby(all_but_group).sum().reset_index()
bmi_int_dm_prev_overall['group'] = 'overall'
bmi_int_dm_prev = dd.concat([bmi_int_dm_prev, bmi_int_dm_prev_overall], ignore_index=True)

# type the dataframe for space efficiency
bmi_int_dm_prev = bmi_int_dm_prev.astype({'group':'str', 'replication':'int16', 'bmiInt_scenario':np.int8, 'h1yy': np.int16, 'bmiInt_impacted':bool, 'dm': bool, 't_dm': np.int16, 'n': np.int16})

# clean to control specifications
control_bmi_int_dm_prev = clean_control(bmi_int_dm_prev, only_eligible=True, only_received=True)

# sum across replications, group, and years_after_h1yy
control_bmi_int_dm_prev_agg = control_bmi_int_dm_prev.groupby(['group', 'years_after_h1yy','replication'])['n'].sum().reset_index().compute()

# display table
control_bmi_int_dm_prev_agg

Unnamed: 0,group,years_after_h1yy,replication,n
0,het_black_female,-2017,0,905
1,het_black_female,-2017,1,890
2,het_black_female,-2017,2,915
3,het_black_female,-2017,3,981
4,het_black_female,-2017,4,858
...,...,...,...,...
4314,overall,22,5,457
4315,overall,22,6,475
4316,overall,22,7,455
4317,overall,22,8,437


In [26]:
df = control_bmi_int_dm_prev_agg.groupby(['group', 'replication'])[['n']].sum().reset_index()
df['group'] = df['group'].map(group_title_dict)
df = df.groupby('group')[['n']].apply(lambda x: x.quantile([0.025,0.5,0.975])).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.0f} [{:.0f} - {:.0f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['Control|Number receiving intervention'] = df['formatted']

In [27]:
# Now we will work on the remaining percentage columns
bmi_int_cascade = dd.read_parquet(s0_data_dir / 'bmi_int_cascade.parquet').reset_index()
# filter for only starting h1yy after 2013 and before 2017
s0_bmi_int_cascade = bmi_int_cascade.loc[(bmi_int_cascade['h1yy'] >= start_year) & (bmi_int_cascade['h1yy'] <= 2017)]


# we will look at the "bmi_int_dm_prev.h5" for S0
bmi_int_dm_prev = dd.read_parquet(s0_data_dir /'dm_final_output.parquet').reset_index()

# Add Overall
all_but_group = list(bmi_int_dm_prev.columns[1:])
bmi_int_dm_prev_overall = bmi_int_dm_prev.groupby(all_but_group).sum().reset_index()
bmi_int_dm_prev_overall['group'] = 'overall'
bmi_int_dm_prev = dd.concat([bmi_int_dm_prev, bmi_int_dm_prev_overall], ignore_index=True)

# type the dataframe for space efficiency
bmi_int_dm_prev = bmi_int_dm_prev.astype({'group':'str', 'replication':'int16', 'bmiInt_scenario':np.int8, 'h1yy': np.int16, 'bmiInt_impacted':bool, 'dm': bool, 't_dm': np.int16, 'n': np.int16})

# clean to control specifications
control_bmi_int_dm_prev = clean_control(bmi_int_dm_prev, only_eligible=False)

# sum across replications, group, and years_after_h1yy
control_bmi_int_dm_prev_agg = control_bmi_int_dm_prev.groupby(['group', 'years_after_h1yy','replication',"time_exposure_to_risk"])['n'].sum().reset_index().compute()

# display table
control_bmi_int_dm_prev_agg

Unnamed: 0,group,years_after_h1yy,replication,time_exposure_to_risk,n
0,het_black_female,-2017,0,-2017,1211
1,het_black_female,-2017,0,0,21
2,het_black_female,-2017,0,1,24
3,het_black_female,-2017,0,2,18
4,het_black_female,-2017,0,3,21
...,...,...,...,...,...
66008,overall,22,8,-2013,549
66009,overall,22,8,22,16
66010,overall,22,9,-2013,563
66011,overall,22,9,22,18


## number of new dm & risk events during 7 year follow up

In [28]:
group_dm_risk_table = calc_risk_by_group(control_bmi_int_dm_prev_agg, 7).compute()

group_dm_risk_table['group'] = group_dm_risk_table['group'].map(group_title_dict)

# New DM
df = group_dm_risk_table.groupby('group')[['dm_num']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.1f} [{:.1f} - {:.1f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['Control|Number of new dm events during 7 year follow up'] = df['formatted']

# DM Risk
df = group_dm_risk_table.groupby('group')[['risk']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.3f} [{:.3f} - {:.3f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['Control|7 year risk of dm'] = df['formatted']

## Number of dm per 1000 receiving the intervention

In [29]:
# we will look at the "bmi_int_dm_prev.h5" for S0
bmi_int_dm_prev = dd.read_parquet(s0_data_dir /'dm_final_output.parquet').reset_index()

# Add Overall
all_but_group = list(bmi_int_dm_prev.columns[1:])
bmi_int_dm_prev_overall = bmi_int_dm_prev.groupby(all_but_group).sum().reset_index()
bmi_int_dm_prev_overall['group'] = 'overall'
bmi_int_dm_prev = dd.concat([bmi_int_dm_prev, bmi_int_dm_prev_overall], ignore_index=True)

# type the dataframe for space efficiency
bmi_int_dm_prev = bmi_int_dm_prev.astype({'group':'str', 'replication':'int16', 'bmiInt_scenario':np.int8, 'h1yy': np.int16, 'bmiInt_impacted':bool, 'dm': bool, 't_dm': np.int16, 'n': np.int16})

# clean to control specifications
control_bmi_int_dm_prev = clean_control(bmi_int_dm_prev, only_eligible=True, only_received=True)

# sum across replications, group, and years_after_h1yy
control_bmi_int_dm_prev_agg = control_bmi_int_dm_prev.groupby(['group', 'years_after_h1yy','replication', "time_exposure_to_risk"])['n'].sum().reset_index().compute()

group_dm_risk_table = calc_risk_by_group_old(control_bmi_int_dm_prev_agg, 7).compute()

group_dm_risk_table['group'] = group_dm_risk_table['group'].map(group_title_dict)

group_dm_risk_table['dm_per_1000'] = np.round(group_dm_risk_table['risk']*1000,0)

# New DM
df = group_dm_risk_table.groupby('group')[['dm_per_1000']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.1f} [{:.1f} - {:.1f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['Control|nubmer of dm event per 1000 people receiving intervention'] = df['formatted']

# Intervention Arm

## number of people received intervention

In [30]:
# we will look at the "bmi_int_dm_prev.h5" for S1
bmi_int_dm_prev = dd.read_parquet(s1_data_dir /'dm_final_output.parquet').reset_index()

# Add Overall
all_but_group = list(bmi_int_dm_prev.columns[1:])
bmi_int_dm_prev_overall = bmi_int_dm_prev.groupby(all_but_group).sum().reset_index()
bmi_int_dm_prev_overall['group'] = 'overall'
bmi_int_dm_prev = dd.concat([bmi_int_dm_prev, bmi_int_dm_prev_overall], ignore_index=True)

# type the dataframe for space efficiency
bmi_int_dm_prev = bmi_int_dm_prev.astype({'group':'str', 'replication':'int16', 'bmiInt_scenario':np.int8, 'h1yy': np.int16, 'bmiInt_impacted':bool, 'dm': bool, 't_dm': np.int16, 'n': np.int16})

# clean to control specifications
control_bmi_int_dm_prev = clean_control(bmi_int_dm_prev, only_eligible=True, only_received=True)

# sum across replications, group, and years_after_h1yy
control_bmi_int_dm_prev_agg = control_bmi_int_dm_prev.groupby(['group', 'years_after_h1yy','replication', "time_exposure_to_risk"])['n'].sum().reset_index().compute()

# display table
control_bmi_int_dm_prev_agg

df = control_bmi_int_dm_prev_agg.groupby(['group', 'replication'])[['n']].sum().reset_index()
df['group'] = df['group'].map(group_title_dict)
df = df.groupby('group')[['n']].apply(lambda x: x.quantile([0.025,0.5,0.975])).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.0f} [{:.0f} - {:.0f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['Intervention|Number receiving intervention'] = df['formatted']

In [31]:
# we will look at the "bmi_int_dm_prev.h5" for S1
bmi_int_dm_prev = dd.read_parquet(s1_data_dir /'dm_final_output.parquet').reset_index()

# Add Overall
all_but_group = list(bmi_int_dm_prev.columns[1:])
bmi_int_dm_prev_overall = bmi_int_dm_prev.groupby(all_but_group).sum().reset_index()
bmi_int_dm_prev_overall['group'] = 'overall'
bmi_int_dm_prev = dd.concat([bmi_int_dm_prev, bmi_int_dm_prev_overall], ignore_index=True)

# type the dataframe for space efficiency
bmi_int_dm_prev = bmi_int_dm_prev.astype({'group':'str', 'replication':'int16', 'bmiInt_scenario':np.int8, 'h1yy': np.int16, 'bmiInt_impacted':bool, 'dm': bool, 't_dm': np.int16, 'n': np.int16})

# clean to control specifications
control_bmi_int_dm_prev = clean_control(bmi_int_dm_prev, only_eligible=True, only_received=True)

# sum across replications, group, and years_after_h1yy
control_bmi_int_dm_prev_agg = control_bmi_int_dm_prev.groupby(['group', 'years_after_h1yy','replication', "time_exposure_to_risk"])['n'].sum().reset_index().compute()

# display table
control_bmi_int_dm_prev_agg

df = control_bmi_int_dm_prev_agg.groupby(['group', 'replication'])[['n']].sum().reset_index()
df['group'] = df['group'].map(group_title_dict)
df = df.groupby('group')[['n']].apply(lambda x: x.quantile([0.025,0.5,0.975])).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.0f} [{:.0f} - {:.0f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['Intervention|Number receiving intervention'] = df['formatted']

In [32]:
# Now we will work on the remaining percentage columns
bmi_int_cascade = dd.read_parquet(s1_data_dir / 'bmi_int_cascade.parquet').reset_index()
# filter for only starting h1yy after 2013 and before 2017
s1_bmi_int_cascade = bmi_int_cascade.loc[(bmi_int_cascade['h1yy'] >= start_year) & (bmi_int_cascade['h1yy'] <= 2017)]

# we will look at the "bmi_int_dm_prev.h5" for S1
bmi_int_dm_prev = dd.read_parquet(s1_data_dir /'dm_final_output.parquet').reset_index()

# Add Overall
all_but_group = list(bmi_int_dm_prev.columns[1:])
bmi_int_dm_prev_overall = bmi_int_dm_prev.groupby(all_but_group).sum().reset_index()
bmi_int_dm_prev_overall['group'] = 'overall'
bmi_int_dm_prev = dd.concat([bmi_int_dm_prev, bmi_int_dm_prev_overall], ignore_index=True)

# type the dataframe for space efficiency
bmi_int_dm_prev = bmi_int_dm_prev.astype({'group':'str', 'replication':'int16', 'bmiInt_scenario':np.int8, 'h1yy': np.int16, 'bmiInt_impacted':bool, 'dm': bool, 't_dm': np.int16, 'n': np.int16})

# clean to control specifications
s1_bmi_int_dm_prev = clean_control(bmi_int_dm_prev, only_eligible=False)

# sum across replications, group, and years_after_h1yy
s1_bmi_int_dm_prev_agg = s1_bmi_int_dm_prev.groupby(['group', 'years_after_h1yy','replication', "time_exposure_to_risk"])['n'].sum().reset_index().compute()

# display table
s1_bmi_int_dm_prev_agg

Unnamed: 0,group,years_after_h1yy,replication,time_exposure_to_risk,n
0,het_black_female,-2017,0,-2017,1305
1,het_black_female,-2017,0,0,17
2,het_black_female,-2017,0,1,18
3,het_black_female,-2017,0,2,20
4,het_black_female,-2017,0,3,21
...,...,...,...,...,...
64942,overall,22,9,-2013,523
64943,overall,22,9,22,7
64944,overall,1,6,22,4
64945,overall,2,3,21,9


## number of new dm & risk events during 7 year follow up

In [33]:
group_dm_risk_table = calc_risk_by_group(s1_bmi_int_dm_prev_agg, 7).compute()

group_dm_risk_table['group'] = group_dm_risk_table['group'].map(group_title_dict)

# New DM
df = group_dm_risk_table.groupby('group')[['dm_num']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.1f} [{:.1f} - {:.1f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['Intervention|Number of new dm events during 7 year follow up'] = df['formatted']

# DM Risk
df = group_dm_risk_table.groupby('group')[['risk']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.3f} [{:.3f} - {:.3f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['Intervention|7 year risk of dm'] = df['formatted']

## Number of dm per 1000 receiving the intervention

In [34]:
# we will look at the "bmi_int_dm_prev.h5" for S1
bmi_int_dm_prev = dd.read_parquet(s1_data_dir /'dm_final_output.parquet').reset_index()

# Add Overall
all_but_group = list(bmi_int_dm_prev.columns[1:])
bmi_int_dm_prev_overall = bmi_int_dm_prev.groupby(all_but_group).sum().reset_index()
bmi_int_dm_prev_overall['group'] = 'overall'
bmi_int_dm_prev = dd.concat([bmi_int_dm_prev, bmi_int_dm_prev_overall], ignore_index=True)

# type the dataframe for space efficiency
bmi_int_dm_prev = bmi_int_dm_prev.astype({'group':'str', 'replication':'int16', 'bmiInt_scenario':np.int8, 'h1yy': np.int16, 'bmiInt_impacted':bool, 'dm': bool, 't_dm': np.int16, 'n': np.int16})

# clean to control specifications
s1_bmi_int_dm_prev = clean_control(bmi_int_dm_prev, only_eligible=True, only_received=True)

# sum across replications, group, and years_after_h1yy
s1_bmi_int_dm_prev_agg = s1_bmi_int_dm_prev.groupby(['group', 'years_after_h1yy','replication', "time_exposure_to_risk"])['n'].sum().reset_index().compute()

group_dm_risk_table = calc_risk_by_group_old(s1_bmi_int_dm_prev_agg, 7).compute()

group_dm_risk_table['group'] = group_dm_risk_table['group'].map(group_title_dict)

group_dm_risk_table['dm_per_1000'] = np.round(group_dm_risk_table['risk']*1000,0)

# New DM
df = group_dm_risk_table.groupby('group')[['dm_per_1000']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.1f} [{:.1f} - {:.1f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['Intervention|nubmer of dm event per 1000 people receiving intervention'] = df['formatted']

# Comparison

In [35]:
# we will look at the "bmi_int_dm_prev.h5" for S1
bmi_int_dm_prev_s1 = dd.read_parquet(s1_data_dir /'dm_final_output.parquet').reset_index()
bmi_int_dm_prev_s1 = bmi_int_dm_prev_s1.astype({'group':'str', 'replication':'int16', 'bmiInt_scenario':np.int8, 'h1yy': np.int16, 'bmiInt_impacted':bool, 'dm': bool, 't_dm': np.int16, 'n': np.int16})

# Add Overall
all_but_group = list(bmi_int_dm_prev_s1.columns[1:])
bmi_int_dm_prev_s1_overall = bmi_int_dm_prev_s1.groupby(all_but_group).sum().reset_index()
bmi_int_dm_prev_s1_overall['group'] = 'overall'
bmi_int_dm_prev_s1 = dd.concat([bmi_int_dm_prev_s1, bmi_int_dm_prev_s1_overall], ignore_index=True)

# clean to control specifications
control_bmi_int_dm_prev_s1 = clean_control(bmi_int_dm_prev_s1, only_eligible=True, only_received = True)

# filter for only people eligible for intervention
bmi_int_s1_eligible_risk = calc_risk_by_group(control_bmi_int_dm_prev_s1, 7)

bmi_int_s1_eligible_risk['received_num'] = bmi_int_s1_eligible_risk['num']

In [36]:
# we will look at the "bmi_int_dm_prev.h5" for S0
bmi_int_dm_prev = dd.read_parquet(s0_data_dir /'dm_final_output.parquet').reset_index()

# Add Overall
all_but_group = list(bmi_int_dm_prev.columns[1:])
bmi_int_dm_prev_overall = bmi_int_dm_prev.groupby(all_but_group).sum().reset_index()
bmi_int_dm_prev_overall['group'] = 'overall'
bmi_int_dm_prev = dd.concat([bmi_int_dm_prev, bmi_int_dm_prev_overall], ignore_index=True)

# type the dataframe for space efficiency
bmi_int_dm_prev = bmi_int_dm_prev.astype({'group':'str', 'replication':'int16', 'bmiInt_scenario':np.int8, 'h1yy': np.int16, 'bmiInt_impacted':bool, 'dm': bool, 't_dm': np.int16, 'n': np.int16})

# clean to control specifications
control_bmi_int_dm_prev = clean_control(bmi_int_dm_prev, only_eligible=True,only_received = True)

bmi_int_eligible_risk = calc_risk_by_group(control_bmi_int_dm_prev, 7)

In [37]:
num_samples = 2000

s0_sample = bmi_int_eligible_risk.groupby('group').apply(lambda x: x.sample(num_samples, replace=True)).reset_index(drop=True).compute()
s1_sample = bmi_int_s1_eligible_risk.groupby('group').apply(lambda x: x.sample(num_samples, replace=True)).reset_index(drop=True).compute()

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  s0_sample = bmi_int_eligible_risk.groupby('group').apply(lambda x: x.sample(num_samples, replace=True)).reset_index(drop=True).compute()
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  s1_sample = bmi_int_s1_eligible_risk.groupby('group').apply(lambda x: x.sample(num_samples, replace=True)).reset_index(drop=True).compute()


In [38]:
s0_sample = s0_sample.sort_values(by = 'group').reset_index(drop = True)
s0_sample

Unnamed: 0,group,replication,dm_num,person-time-contributed,num,risk
0,het_black_female,6,1668,69258,9894,24.083860
1,het_black_female,7,1618,66815,9545,24.216119
2,het_black_female,9,1577,66871,9553,23.582719
3,het_black_female,6,1668,69258,9894,24.083860
4,het_black_female,9,1577,66871,9553,23.582719
...,...,...,...,...,...,...
31995,overall,1,9573,881811,125973,10.856068
31996,overall,6,9271,877296,125328,10.567699
31997,overall,0,9386,868511,124073,10.807002
31998,overall,2,9431,873502,124786,10.796770


In [39]:
s1_sample = s1_sample.sort_values(by = 'group').reset_index(drop = True)
s1_sample

Unnamed: 0,group,replication,dm_num,person-time-contributed,num,risk,received_num
0,het_black_female,2,1390,71603,10229,19.412594,10229
1,het_black_female,9,1199,66871,9553,17.930044,9553
2,het_black_female,4,1368,68369,9767,20.009068,9767
3,het_black_female,9,1199,66871,9553,17.930044,9553
4,het_black_female,7,1269,66815,9545,18.992741,9545
...,...,...,...,...,...,...,...
31995,overall,4,7755,879445,125635,8.818061,125635
31996,overall,8,7701,884758,126394,8.704075,126394
31997,overall,7,7736,884800,126400,8.743219,126400
31998,overall,1,7721,881811,125973,8.755845,125973


## abs change in risk

In [40]:
# absolute difference
abs_sample_diff = s1_sample[['dm_num', 'risk']] - s0_sample[['dm_num', 'risk']]
abs_sample_diff['group'] = s0_sample['group']
abs_sample_diff['received_num'] = s1_sample['received_num']

In [41]:
abs_sample_diff_plot = abs_sample_diff.copy()
abs_sample_diff_plot['group'] = abs_sample_diff_plot['group'].map(group_title_dict)

# diff_ax = sns.boxplot(x=abs_sample_diff_plot['group'],
#             y=abs_sample_diff_plot['risk'],
#             color='seagreen',
#             showfliers = False,
#             palette=palette,
#             hue=abs_sample_diff_plot['group'],
#             order=group_order,
#             hue_order=group_order)

# diff_ax.tick_params(axis='x', rotation=90)

# diff_ax.set_xlabel('')
# diff_ax.set_ylabel('Absolute risk reduction (ARR) of new DM diagnosis (intervention vs. control arm over 5-year follow up)')
# diff_ax.axhline(y=0, color='r', linestyle='-')
# diff_fig = diff_ax.get_figure()
# diff_fig.savefig("../outputs/fig3a.png")

In [42]:
# abs_sample_diff_plot.groupby('group')[['risk']].median().to_csv('../outputs/figure3a_table.csv')

df = abs_sample_diff_plot.groupby('group')[['risk']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.3f} [{:.3f} - {:.3f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['absolute change in risk'] = df['formatted']

## relative change in risk

In [43]:
# relative difference
rel_sample_diff = (s1_sample[['risk']] - s0_sample[['risk']]) / s0_sample[['risk']]
rel_sample_diff['group'] = s0_sample['group']

In [44]:
rel_sample_diff_plot = rel_sample_diff.copy()
rel_sample_diff_plot['group'] = rel_sample_diff_plot['group'].map(group_title_dict)

df = rel_sample_diff_plot.groupby('group')[['risk']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:.3f} [{:.3f} - {:.3f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['relative change in risk'] = df['formatted']

## Number dm cases averted per 1000 & NNT

In [45]:
abs_sample_diff_plot['dm_per_1000'] = -np.round(abs_sample_diff_plot['dm_num']/abs_sample_diff_plot['received_num']*1000, 0)
abs_sample_diff_plot['NNT'] = -np.round(abs_sample_diff_plot['received_num']/abs_sample_diff_plot['dm_num'], 0)

# dm cases averted per 1000
df = abs_sample_diff_plot.groupby('group')[['dm_per_1000']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:,.0f} [{:,.0f} - {:,.0f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)
SA_summary_df['num dm cases averted per 1000'] = df['formatted']

# NNT
df = abs_sample_diff_plot.groupby('group')[['NNT']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:,.0f} [{:,.0f} - {:,.0f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)
SA_summary_df['NNT'] = df['formatted']

## total number of dm averted

In [46]:
abs_sample_diff_plot['dm_num_prevented'] = abs_sample_diff_plot['dm_num'] * -1

df = abs_sample_diff_plot.groupby('group')[['dm_num_prevented']].quantile([0.025,0.5,0.975]).unstack().reset_index()
df.columns = ['group',0.025, 0.5, 0.975]
df['formatted'] = df.apply(
    lambda row: '{:,.0f} [{:,.0f} - {:,.0f}]'.format(row[0.50], row[0.025], row[0.975]), axis=1
)
df = rearrange_group_order(df)

SA_summary_df['total number of dm averted'] = df['formatted']

# Save summary df

In [47]:
SA_summary_df

Unnamed: 0,group,Control|Number receiving intervention,Control|Number of new dm events during 7 year follow up,Control|7 year risk of dm,Control|nubmer of dm event per 1000 people receiving intervention,Intervention|Number receiving intervention,Intervention|Number of new dm events during 7 year follow up,Intervention|7 year risk of dm,Intervention|nubmer of dm event per 1000 people receiving intervention,absolute change in risk,relative change in risk,num dm cases averted per 1000,NNT,total number of dm averted
0,Black HET Women,9867 [9522 - 10205],3593.5 [3417.7 - 3769.2],26.252 [25.779 - 26.952],169.5 [165.4 - 172.0],9867 [9522 - 10205],3241.5 [3127.2 - 3373.0],23.797 [23.315 - 24.193],133.0 [126.9 - 139.1],-5.223 [-6.628 - -3.919],-0.216 [-0.270 - -0.164],36 [21 - 53],28 [19 - 47],357 [209 - 502]
1,White HET Women,2515 [2355 - 2587],698.0 [656.0 - 736.1],23.357 [21.733 - 24.134],161.0 [145.4 - 167.8],2515 [2355 - 2587],627.5 [610.4 - 659.5],20.722 [19.908 - 21.880],125.5 [122.0 - 136.8],-4.302 [-6.424 - -1.472],-0.186 [-0.269 - -0.070],29 [13 - 51],35 [19 - 78],74 [33 - 125]
2,Hispanic HET Women,2769 [2709 - 2976],806.0 [749.2 - 871.6],23.320 [21.951 - 23.869],154.5 [149.2 - 165.5],2769 [2709 - 2976],734.5 [703.1 - 775.0],21.207 [20.582 - 21.510],127.5 [120.7 - 134.1],-3.907 [-6.262 - -2.535],-0.177 [-0.262 - -0.119],28 [9 - 57],36 [17 - 109],78 [26 - 156]
3,Black HET Men,6290 [6098 - 6552],1227.5 [1180.5 - 1265.5],18.816 [18.169 - 19.231],128.0 [123.5 - 131.6],6290 [6098 - 6552],1051.0 [995.2 - 1099.0],16.143 [15.435 - 16.701],99.5 [93.2 - 104.8],-4.014 [-5.151 - -2.836],-0.220 [-0.277 - -0.159],28 [16 - 41],36 [24 - 62],175 [104 - 251]
4,White HET Men,1486 [1417 - 1536],252.5 [221.9 - 273.8],17.556 [15.688 - 19.098],116.5 [106.7 - 133.4],1486 [1417 - 1536],202.5 [184.4 - 224.6],14.019 [13.238 - 15.759],86.0 [81.2 - 102.0],-4.403 [-7.242 - -1.432],-0.260 [-0.376 - -0.088],30 [7 - 50],33 [20 - 146],44 [10 - 74]
5,Hispanic HET Men,2096 [1980 - 2230],366.5 [341.4 - 410.2],18.083 [17.752 - 19.569],125.0 [117.6 - 136.3],2096 [1980 - 2230],282.0 [266.4 - 322.5],14.438 [13.783 - 15.383],91.0 [79.3 - 97.3],-5.123 [-7.611 - -3.217],-0.284 [-0.398 - -0.194],36 [11 - 65],27 [16 - 89],77 [25 - 130]
6,Black WWID,1118 [1053 - 1218],394.5 [335.5 - 422.8],27.910 [24.964 - 30.778],186.5 [154.6 - 209.5],1118 [1053 - 1218],344.5 [321.7 - 385.1],25.064 [23.986 - 26.993],161.0 [144.0 - 172.0],-3.845 [-8.247 - 1.928],-0.142 [-0.287 - 0.091],28 [-23 - 64],30 [-99 - 300],32 [-28 - 70]
7,White WWID,1388 [1312 - 1555],414.5 [381.7 - 477.5],28.604 [27.045 - 32.344],206.5 [189.6 - 230.7],1388 [1312 - 1555],353.0 [309.4 - 411.9],25.025 [22.263 - 26.272],162.5 [140.9 - 175.7],-6.677 [-12.210 - -2.616],-0.227 [-0.378 - -0.094],48 [5 - 101],20 [9 - 156],67 [7 - 142]
8,Hispanic WWID,606 [558 - 669],164.0 [144.7 - 199.8],27.248 [24.339 - 30.392],200.0 [175.9 - 225.5],606 [558 - 669],143.0 [117.0 - 176.3],22.587 [19.696 - 26.986],155.0 [138.4 - 184.3],-6.095 [-11.916 - 0.003],-0.216 [-0.363 - 0.000],40 [-18 - 105],21 [-657 - 111],25 [-12 - 63]
9,Black MWID,2790 [2621 - 3040],404.0 [355.6 - 444.9],14.145 [12.987 - 15.290],104.5 [93.5 - 115.5],2790 [2621 - 3040],337.0 [272.8 - 361.4],11.327 [10.034 - 12.255],78.0 [66.9 - 81.8],-4.022 [-6.703 - -1.764],-0.267 [-0.403 - -0.133],28 [6 - 55],35 [18 - 151],79 [18 - 146]


In [48]:
SA_summary_df.to_csv(f'../results/SA/{config_type}_cov_{coverage_rate}_eff_{effectiveness}.csv', index = False)