In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import pandas as pd
import warnings
import sys
import numpy as np
import scipy as sc
import random
from scipy import stats
from numpy import log10, sqrt

mydir = '/Users/kenlocey/GitHub/HACRP-HAIs/'
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def Winsorize_it(x, WinZs):
    
    x2 = []
    for i, val in enumerate(x):
        wz = float(WinZs[i])
        val = float(val)
        if np.isnan(wz) == True:
            x2.append(np.nan)
        else:
            x2.append(val)
    
    p5 = np.nanpercentile(x2, 5)
    p95 = np.nanpercentile(x2, 95)
    WinScores = []
    
    for i, val in enumerate(x2):
        wz = float(WinZs[i])
        val = float(val)
        
        if np.isnan(wz) == True:
            WinScores.append(np.nan)
            
        elif np.isnan(wz) == False:
            if val >= p5 and val <= p95:
                WinScores.append(val)
            elif val < p5:
                WinScores.append(p5)
            elif val > p95:
                WinScores.append(p95)
            elif np.isnan(val) == True:
                #print('val:', val, '|', WinZs[i])
                WinScores.append(np.nan)
        
    
    return WinScores
        

def ZScore_it(x, WinZs):
    
    x2 = []
    for i, val in enumerate(x):
        wz = float(WinZs[i])
        val = float(val)
        if np.isnan(wz) == True:
            x2.append(np.nan)
        else:
            x2.append(val)
    
    x2 = np.array(x2)
    avg = np.nanmean(x2)
    std = np.nanstd(x2)
    zscores = (x2 - avg) / std
    return zscores

hac_mo = '04'

## Load HAC file

In [2]:
cauti_file =  '/Users/kenlocey/GitHub/HACRP-HAIs/data/optimized_by_HAI_file_date/CAUTI/CAUTI_Data_opt_for_SIRs_2022_04.pkl'
clabsi_file = '/Users/kenlocey/GitHub/HACRP-HAIs/data/optimized_by_HAI_file_date/CLABSI/CLABSI_Data_opt_for_SIRs_2022_04.pkl'

mrsa_file = '/Users/kenlocey/GitHub/HACRP-HAIs/data/optimized_by_HAI_file_date/MRSA/MRSA_Data_opt_for_SIRs_2022_04.pkl'
cdi_file = '/Users/kenlocey/GitHub/HACRP-HAIs/data/optimized_by_HAI_file_date/CDI/CDI_Data_opt_for_SIRs_2022_04.pkl'


tdf, tdf1, tdf2 = 0, 0, 0
cauti_df, clabsi_df = 0, 0
mrsa_df, cdi_df = 0, 0

cauti_df = pd.read_pickle(cauti_file)
cauti_df['CAUTI SIS'] = (cauti_df['CAUTI Observed Cases'] - cauti_df['expected O'])/cauti_df['CAUTI Predicted Cases']
cauti_df.rename(columns={
        'O/E': 'CAUTI O/P',
        'simulated O': 'CAUTI simulated O',
        'simulated O/E': 'CAUTI simulated O/P',
        'expected O': 'CAUTI expected O',
        'expected O/E': 'CAUTI expected O/P',
        'pi_opt': 'CAUTI pi_opt',
        'z_opt': 'CAUTI z_opt',
    }, inplace=True)
cauti_sis_deciles = np.nanpercentile(cauti_df['CAUTI SIS'], np.arange(0, 100, 10))
cauti_rand_sir_deciles = np.nanpercentile(cauti_df['CAUTI expected O/P'], np.arange(0, 100, 10))

clabsi_df = pd.read_pickle(clabsi_file)
clabsi_df['CLABSI SIS'] = (clabsi_df['CLABSI Observed Cases'] - clabsi_df['expected O'])/clabsi_df['CLABSI Predicted Cases']
clabsi_df.rename(columns={
        'O/E': 'CLABSI O/P',
        'simulated O': 'CLABSI simulated O',
        'simulated O/E': 'CLABSI simulated O/P',
        'expected O': 'CLABSI expected O',
        'expected O/E': 'CLABSI expected O/P',
        'pi_opt': 'CLABSI pi_opt',
        'z_opt': 'CLABSI z_opt',
    }, inplace=True)
clabsi_sis_deciles = np.nanpercentile(clabsi_df['CLABSI SIS'], np.arange(0, 100, 10))
clabsi_rand_sir_deciles = np.nanpercentile(clabsi_df['CLABSI expected O/P'], np.arange(0, 100, 10))




mrsa_df = pd.read_pickle(mrsa_file)
mrsa_df['MRSA SIS'] = (mrsa_df['MRSA Observed Cases'] - mrsa_df['expected O'])/mrsa_df['MRSA Predicted Cases']
mrsa_df.rename(columns={
        'O/E': 'MRSA O/P',
        'simulated O': 'MRSA simulated O',
        'simulated O/E': 'MRSA simulated O/P',
        'expected O': 'MRSA expected O',
        'expected O/E': 'MRSA expected O/P',
        'pi_opt': 'MRSA pi_opt',
        'z_opt': 'MRSA z_opt',
    }, inplace=True)
tdf = mrsa_df[mrsa_df['MRSA Predicted Cases'] >= 1]
mrsa_sis_deciles = np.nanpercentile(tdf['MRSA SIS'], np.arange(0, 100, 10))    
mrsa_rand_sir_deciles = np.nanpercentile(mrsa_df['MRSA expected O/P'], np.arange(0, 100, 10))


cdi_df = pd.read_pickle(cdi_file)
cdi_df['CDI SIS'] = (cdi_df['CDI Observed Cases'] - cdi_df['expected O'])/cdi_df['CDI Predicted Cases']
cdi_df.rename(columns={
        'O/E': 'CDI O/P',
        'simulated O': 'CDI simulated O',
        'simulated O/E': 'CDI simulated O/P',
        'expected O': 'CDI expected O',
        'expected O/E': 'CDI expected O/P',
        'pi_opt': 'CDI pi_opt',
        'z_opt': 'CDI z_opt',
    }, inplace=True)
tdf = cdi_df[cdi_df['CDI Predicted Cases'] >= 1]
cdi_sis_deciles = np.nanpercentile(tdf['CDI SIS'], np.arange(0, 100, 10))    
cdi_rand_sir_deciles = np.nanpercentile(cdi_df['CDI expected O/P'], np.arange(0, 100, 10))


ls1 = list(cauti_df)
ls2 = list(clabsi_df)
ls = list(filter(lambda x:x in ls1, ls2))
df_yr = cauti_df.merge(clabsi_df, on=ls, how='outer')
    
ls1 = list(df_yr)
ls2 = list(mrsa_df)
ls = list(filter(lambda x:x in ls1, ls2))
df_yr = df_yr.merge(mrsa_df, on=ls, how='outer')

ls1 = list(df_yr)
ls2 = list(cdi_df)
ls = list(filter(lambda x:x in ls1, ls2))
df_yr = df_yr.merge(cdi_df, on=ls, how='outer')


del tdf
del cauti_df, clabsi_df
del mrsa_df, cdi_df

df_yr.sort_values(by=['Facility ID', 'Line 19'], inplace=True, ascending=False)
df_yr.drop_duplicates(subset=['Facility ID'], inplace=True, keep='first')

print(len(df_yr['Facility ID'].unique()), 'hospitals resulting from merging the above dataframes.')

df_yr.head()


2745 hospitals resulting from merging the above dataframes.


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,CAUTI derived score,CLABSI derived score,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,AHRQ PSI-90 Footnote,CAUTI Footnote,CLABSI Footnote,Domain 1 Score Footnote,Domain 2 Score Footnote,SSI Footnote,SSI Score,Total HAC Footnote,CDI Footnote,CDI Score,MRSA Footnote,MRSA Score,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,MRSA derived score,CDI derived score,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/P,CAUTI simulated O,CAUTI simulated O/P,CAUTI expected O,CAUTI expected O/P,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI O/P,CLABSI simulated O,CLABSI simulated O/P,CLABSI expected O,CLABSI expected O/P,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/P,MRSA simulated O,MRSA simulated O/P,MRSA expected O,MRSA expected O/P,MRSA pi_opt,MRSA z_opt,MRSA SIS,CDI O/P,CDI simulated O,CDI simulated O/P,CDI expected O,CDI expected O/P,CDI pi_opt,CDI z_opt,CDI SIS
0,,,,,670122,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,HOUSTON METHODIST THE WOODLANDS HOSPITAL,2022.0,TX,-1.0198,4,No,10323.0,6067.0,4256.0,2.0,0.0,4.155,3.03,0.4813,0.0,,,-1.020663,No,1.0,0.000863,5.8e-05,0.001412,,,,,,,,,,,,,37689.0,35901.0,1.0,2.0,1.731,25.604,0.5777,0.0781,,,0.002395,0.001112,,-0.4497,-1.4219,-1.2914,,,-0.3655,,-1.233,-1.3575,0.4813,-0.449758,0.0,-1.292812,0.5777,-0.366612,0.0,0.0781,-1.424295,12/31/2019,07/01/2018,,2022_04,TX,,,,,,,451290.666667,451290.666667,0.481348,4.0,0.962696,4.099395,0.986617,0.001219,4873.993895,-0.50527,0.0,1.0,0.330033,2.116212,0.69842,0.00084,2934.766512,-0.69842,0.577701,0.0,0.0,1.203409,0.69521,8e-05,56563.129806,-0.11751,0.078113,6.0,0.234338,10.968227,0.428379,0.000399,10943.46178,-0.350267
1,,,,,670120,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,THE HOSPITALS OF PROVIDENCE TRANSMOUNTAIN CAMPUS,2022.0,TX,-0.6903,4,No,3231.0,2015.0,1216.0,0.0,0.0,1.202,0.844,0.0,0.0,,,-0.691447,No,1.0,0.001147,0.000879,,,,5.0,,,,,,,,5.0,,24042.0,23329.0,0.0,3.0,0.747,12.065,0.0,0.2487,,,0.003808,,,-1.4169,-0.8966,,,,,,0.9099,-1.3575,0.0,-1.417779,,,,,0.0,0.2487,-0.900408,12/31/2019,07/01/2018,,2022_04,TX,,,,,,,141113.25,141113.25,0.0,1.0,0.831947,0.718163,0.597473,0.001219,4873.993895,-0.597473,,,,,,,,,,,,,,,,,0.248653,6.0,0.497306,6.330362,0.524688,0.000399,10943.46178,-0.276035
2169,,,,,670108,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,BAYLOR SCOTT & WHITE MEDICAL CENTER - MARBLE F...,2022.0,TX,-0.117,4,No,1517.0,1172.0,345.0,0.0,0.0,0.602,0.212,0.0,0.0,,,-0.118472,No,1.0,0.001472,,,,5.0,5.0,,,5.0,,,,,5.0,,10097.0,9449.0,1.0,3.0,0.334,5.263,2.994,0.57,,,0.003045,,,,0.0893,,,,,,-0.3232,,,,,,,,0.0,0.57,0.086255,12/31/2019,07/01/2018,,2022_04,TX,,,,,,,104701.25,104701.25,,,,,,,,,,,,,,,,,,,,,,,,,0.570017,4.0,0.760023,1.745354,0.331627,0.000399,10943.46178,0.23839
2170,,,,,670106,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,HCA HOUSTON HEALTHCARE PEARLAND,2022.0,TX,-0.5046,4,No,2586.0,1164.0,1422.0,1.0,2.0,0.57,0.824,1.7544,2.4272,,,-0.505264,No,1.0,0.000664,,,,5.0,5.0,,,5.0,,,,,5.0,,13331.0,13331.0,0.0,2.0,0.509,5.229,0.0,0.3825,,,0.001429,,,,-0.4881,,,,,,-0.521,,,,,,,,0.0,0.3825,-0.489529,12/31/2019,07/01/2018,,2022_04,TX,,,,,,,83461.833333,83461.833333,,,,,,,,,,,,,,,,,,,,,,,,,0.382482,3.0,0.573723,2.918485,0.558134,0.000399,10943.46178,-0.175652
2171,,,,,670103,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,MEDICAL CITY ALLIANCE,2022.0,TX,-0.2932,4,No,2183.0,1082.0,1101.0,0.0,0.0,0.589,0.768,0.0,0.0,,,-0.293941,No,1.0,0.000741,,,,5.0,5.0,,,,,,,,5.0,,15909.0,14157.0,0.0,1.0,0.43,5.286,0.0,0.1892,,,0.002224,,,,-1.0809,,,,,,0.0832,0.1181,,,,,,,0.0,0.1892,-1.083124,12/31/2019,07/01/2018,,2022_04,TX,,,,,,,86255.333333,86255.333333,,,,,,,,,,,,,,,,,,,,,,,,,0.189179,2.0,0.378358,3.183042,0.602165,0.000399,10943.46178,-0.412986


In [3]:
main_df = pd.read_pickle('~/GitHub/HACRP-HAIs/data/Compiled_HCRIS-HACRP-HAI-RAND/Compiled_HCRIS-HACRP-HAI-RAND.pkl')
main_df = main_df[main_df['file_year'] == '2022']
main_df.sort_values(by='Facility ID', inplace=True)
print('Initial no. of hospitals in main_df:', len(main_df['Facility ID'].unique()))

main_df = main_df[(~main_df['Total HAC Score'].isin([float("NaN"), np.nan]))]
print('No. of hospitals in main_df having real HAC scores:', len(main_df['Facility ID'].unique()))

main_df = main_df[main_df['State'] != 'MD']
print('No. of these hospitals not in Maryland:', len(main_df['Facility ID'].unique()))

main_df = main_df[(main_df['CAUTI Predicted Cases'].isin([np.nan, float('NaN')])) | (main_df['CAUTI Predicted Cases'] < 1)]
main_df = main_df[(main_df['CLABSI Predicted Cases'].isin([np.nan, float('NaN')])) | (main_df['CLABSI Predicted Cases'] < 1)]
main_df = main_df[(main_df['MRSA Predicted Cases'].isin([np.nan, float('NaN')])) | (main_df['MRSA Predicted Cases'] < 1)]
main_df = main_df[(main_df['CDI Predicted Cases'].isin([np.nan, float('NaN')])) | (main_df['CDI Predicted Cases'] < 1)]
main_df = main_df[~main_df['PSI-90 W Z Score'].isin([np.nan, float('NaN')])]
print('No. of these hospitals without legit predicted cases for CAUTI and CLABSI:', len(main_df['Facility ID'].unique()))

main_df.head()


Initial no. of hospitals in main_df: 3155
No. of hospitals in main_df having real HAC scores: 3105
No. of these hospitals not in Maryland: 3060
No. of these hospitals without legit predicted cases for CAUTI and CLABSI: 360


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,CAUTI derived score,CLABSI derived score,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,AHRQ PSI-90 Footnote,CAUTI Footnote,CLABSI Footnote,Domain 1 Score Footnote,Domain 2 Score Footnote,SSI Footnote,SSI Score,Total HAC Footnote,CDI Footnote,CDI Score,MRSA Footnote,MRSA Score,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,MRSA derived score,CDI derived score,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final"
26459,,,,,10008,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,CRENSHAW COMMUNITY HOSPITAL,2022.0,AL,0.0445,4,No,350.0,284.0,66.0,2.0,0.0,0.155,0.039,12.9032,0.0,,,0.0445,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,2519.0,2519.0,0.0,0.0,0.064,0.462,0.0,0.0,,,,,,,,,,,,,0.0445,,,,,,,,0.0,,,12/31/2019,07/01/2018,,2022_04,AL,,,,,,,12028.5,12028.5
26460,,,,,10018,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,CALLAHAN EYE HOSPITAL,2022.0,AL,0.0482,4,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0482,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,320.0,320.0,0.0,0.0,0.006,0.08,0.0,0.0,,,,,,,,,,,,,0.0482,,,,,,,,0.0,,,12/31/2019,07/01/2018,,2022_04,AL,,,,,,,3401.0,3401.0
26458,,,,,10044,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,MARION REGIONAL MEDICAL CENTER,2022.0,AL,0.0323,4,No,290.0,130.0,160.0,0.0,0.0,0.043,0.094,0.0,0.0,,,0.0323,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,3964.0,3964.0,0.0,1.0,0.051,0.735,0.0,1.3605,,,,,,,,,,,,,0.0323,,,,,,,,0.0,,,12/31/2019,07/01/2018,,2022_04,AL,,,,,,,22361.571429,22361.571429
26457,,,,,10045,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,FAYETTE MEDICAL CENTER,2022.0,AL,-0.0945,4,No,509.0,384.0,125.0,1.0,0.0,0.19,0.074,5.2632,0.0,,,-0.0945,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,4162.0,4162.0,0.0,1.0,0.107,0.758,0.0,1.3193,,,,,,,,,,,,,-0.0945,,,,,,,,0.0,,,12/31/2019,07/01/2018,,2022_04,AL,,,,,,,32057.857143,32057.857143
26452,,,,,10051,2022,0.0,0.0,0.0,0.0,0.0,,,,,,,,2019-12-31,,2019-01-01,GREENE COUNTY HOSPITAL,2022.0,AL,0.0983,4,No,0.0,24.0,0.0,0.0,0.0,0.013,0.0,0.0,,,,0.0983,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,1805.0,1805.0,0.0,0.0,0.031,0.424,0.0,0.0,,,,,,,,,,,,,0.0983,,,,,,,,0.0,,,12/31/2019,07/01/2018,,2022_04,AL,,,,,,,8093.285714,8093.285714


In [4]:
ls1 = list(df_yr)
ls2 = list(main_df)
ls = list(filter(lambda x:x in ls1, ls2))
df_yr = df_yr.merge(main_df, on=ls, how='outer')

print(df_yr.shape)
print(len(df_yr['Facility ID'].unique()), 'hospitals resulting from merging the above dataframes.')
print('This is NOT the same number as the number of hospitals having real HAC scores.')
print('Instead, it is the same as the number of hospitals with legit scores for PSI-90 or one or more HAIs.')


(3105, 132)
3105 hospitals resulting from merging the above dataframes.
This is NOT the same number as the number of hospitals having real HAC scores.
Instead, it is the same as the number of hospitals with legit scores for PSI-90 or one or more HAIs.


## Generate Winsorized z-scores

In [5]:
hais = ['CAUTI', 'CLABSI', 'MRSA', 'CDI']
   
for i, hai in enumerate(hais):
    tdf2 = df_yr[~df_yr[hai + ' Footnote'].isin([18, '18', '18 ', ' 18', 
                                              5, '5', ' 5', '5 ',
                                              4, '4', ' 4', '4 ',
                                              ])]
      
    reported_winZ = tdf2[hai + ' W Z Score'].tolist()
    sis = tdf2[hai + ' SIS'].tolist()
    tdf2[hai + ' Winsorized SIS'] = Winsorize_it(sis, reported_winZ)
    tdf2[hai + ' SIS W Z Score'] = ZScore_it(tdf2[hai + ' Winsorized SIS'], reported_winZ)
    
    sir = tdf2[hai + ' expected O/P'].tolist()
    tdf2[hai + ' Winsorized random-based'] = Winsorize_it(sis, reported_winZ)
    tdf2[hai + ' random-based W Z Score'] = ZScore_it(tdf2[hai + ' Winsorized random-based'], reported_winZ)
    
    
    # Assign maximum WinZ scores to hospitals with HAI footnote 18 
    maxWinZ = np.nanmax(tdf2[hai + ' SIS W Z Score'])
    tdf3 = df_yr[df_yr[hai + ' Footnote'].isin([18, '18', '18 ', ' 18'])]
    tdf3[hai + ' Winsorized SIS'] = [np.nan]*tdf3.shape[0]
    tdf3[hai + ' SIS W Z Score'] = [maxWinZ]*tdf3.shape[0]
    
    maxWinZ = np.nanmax(tdf2[hai + ' random-based W Z Score'])
    tdf3 = df_yr[df_yr[hai + ' Footnote'].isin([18, '18', '18 ', ' 18'])]
    tdf3[hai + ' Winsorized random-based'] = [np.nan]*tdf3.shape[0]
    tdf3[hai + ' random-based W Z Score'] = [maxWinZ]*tdf3.shape[0]
    
    
    tdf4 = df_yr[df_yr[hai + ' Footnote'].isin([5, '5', ' 5', '5 ', 4, '4', ' 4', '4 '])]
    tdf4[hai + ' Winsorized SIS'] = [np.nan]*tdf4.shape[0]
    tdf4[hai + ' SIS W Z Score'] = [np.nan]*tdf4.shape[0]
    
    tdf4[hai + ' Winsorized random-based'] = [np.nan]*tdf4.shape[0]
    tdf4[hai + ' random-based W Z Score'] = [np.nan]*tdf4.shape[0]
    
    df_yr = pd.concat([tdf2, tdf3, tdf4], axis=0)

del tdf2
del tdf3

In [6]:
print('Results from attempting to reproduce Yes/No penalty assignments:\n')
print('Excluded from results below:')
print('1. MD hospitals')
print('2. Hospitals with payment reduction values other than Yes or No\n')

holdout_df = df_yr[(df_yr['State'] == 'MD') | ~df_yr['Payment Reduction'].isin(['Yes', 'No']) | (df_yr['Total HAC Score'].isin([float("NaN"), np.nan]))]
df_yr = df_yr[(df_yr['State'] != 'MD') & (df_yr['Payment Reduction'].isin(['Yes', 'No'])) & (~df_yr['Total HAC Score'].isin([float("NaN"), np.nan]))]


Results from attempting to reproduce Yes/No penalty assignments:

Excluded from results below:
1. MD hospitals
2. Hospitals with payment reduction values other than Yes or No



## Generate SIS-based scores

In [7]:
hac_scores = []
ct1 = 0
ct2 = 0

for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    d = 0
        
    w_ls = []
    sum_ls = []

    m_ls = ['PSI-90 W Z Score', 'CDI SIS W Z Score', 'CAUTI SIS W Z Score', 'CLABSI SIS W Z Score', 'MRSA SIS W Z Score', 'SSI W Z Score']
                
    s = 0
    w = 0
    for m in m_ls:
        v = tdf[m].tolist()
        if len(list(set(v))) > 1:
            #print('len(list(set(v))) > 1')
            if np.isnan(v[0]) == True:
                v = [np.nan]
            
        else:
            v = tdf[m].iloc[0]

        if np.isnan(v) == False: 
            s += v
            w += 1

    if w == 0:
        d = np.nan
    else:
        d = s/w
        
    # If the derived score is NaN and the original score is NaN ...
    if np.isnan(d) == True:
        hac_scores.append(np.nan)
        
    # If the derived score is a float and the original score is a float ...
    elif np.isnan(d) == False:
        hac_scores.append(d)


print('holdout_df.shape:', holdout_df.shape)
df_yr['Total HAC Score (SIS-based)'] = hac_scores
print(df_yr.shape[0], 'hospitals in hac_df')




p75 = np.nanpercentile(df_yr['Total HAC Score (SIS-based)'], 75)
print('p75:', p75)

pr = []
for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    score = tdf['Total HAC Score (SIS-based)'].iloc[0]

    if np.isnan(score) == True:
        pr.append('No')
    elif score <= p75:
        pr.append('No')
    elif score > p75:
        pr.append('Yes')
    else:
        print('This score is an error:', score)
        sys.exit()

df_yr['Payment Reduction (SIS-based)'] = pr
    
o_list = df_yr['Payment Reduction'].tolist()
d_list = df_yr['Payment Reduction (SIS-based)'].tolist()

same = 0
diff = 0
p_to_np = 0
np_to_p = 0

res_ls = []
for i, o in enumerate(o_list):
    if o == d_list[i]:
        same += 1
        res_ls.append(1)
    else:
        diff += 1
        if o == 'Yes' and d_list[i] == 'No':
            p_to_np += 1
        elif o == 'No' and d_list[i] == 'Yes':
            np_to_p += 1
        else:
            print('Error')
            break
        res_ls.append(0)
               
print(same, "SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.")
print(diff, "SIR-based penalty assignments were NOT the same as their corresponding SIS-based penalty assignments.")
print(str(np.round(100 * same/(same+diff),2)) + '% of SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments."\n')

print(p_to_np, 'hospitals were penalized but should not have been.')
print(np_to_p, 'hospitals were NOT penalized but should have been.')

holdout_df.shape: (49, 148)
3056 hospitals in hac_df
p75: 0.32151621432521843
2851 SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.
205 SIR-based penalty assignments were NOT the same as their corresponding SIS-based penalty assignments.
93.29% of SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments."

101 hospitals were penalized but should not have been.
104 hospitals were NOT penalized but should have been.


## Generate scores based on random expectations

In [8]:
hac_scores = []
ct1 = 0
ct2 = 0

for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    d = 0
        
    w_ls = []
    sum_ls = []

    m_ls = ['PSI-90 W Z Score', 'CDI random-based W Z Score', 'CAUTI random-based W Z Score', 'CLABSI random-based W Z Score', 'MRSA random-based W Z Score', 'SSI W Z Score']
                
    s = 0
    w = 0
    for m in m_ls:
        v = tdf[m].tolist()
        if len(list(set(v))) > 1:
            #print('len(list(set(v))) > 1')
            if np.isnan(v[0]) == True:
                v = [np.nan]
            
        else:
            v = tdf[m].iloc[0]

        if np.isnan(v) == False: 
            s += v
            w += 1

    if w == 0:
        d = np.nan
    else:
        d = s/w
        
    # If the derived score is NaN and the original score is NaN ...
    if np.isnan(d) == True:
        hac_scores.append(np.nan)
        
    # If the derived score is a float and the original score is a float ...
    elif np.isnan(d) == False:
        hac_scores.append(d)


print('holdout_df.shape:', holdout_df.shape)
df_yr['Total HAC Score (random-based)'] = hac_scores
print(df_yr.shape[0], 'hospitals in hac_df')




p75 = np.nanpercentile(df_yr['Total HAC Score (random-based)'], 75)
print('p75:', p75)

pr = []
for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    score = tdf['Total HAC Score (random-based)'].iloc[0]

    if np.isnan(score) == True:
        pr.append('No')
    elif score <= p75:
        pr.append('No')
    elif score > p75:
        pr.append('Yes')
    else:
        print('This score is an error:', score)
        sys.exit()

df_yr['Payment Reduction (random-based)'] = pr


holdout_df.shape: (49, 148)
3056 hospitals in hac_df
p75: 0.3243834145647264


In [9]:
## SIS-based

tdf1 = df_yr[df_yr['Payment Reduction (SIS-based)'] == 'Yes']
tdf2 = df_yr[df_yr['Payment Reduction (SIS-based)'].isin(['Yes', 'No'])]
print(tdf1.shape[0]/tdf2.shape[0])

tdf1 = df_yr[df_yr['Payment Reduction (SIS-based)'] == 'No']
print(tdf1.shape[0]/tdf2.shape[0])
print(tdf2.shape[0], len(tdf2['Facility ID'].unique()))
print('\n')

## Random-based 
tdf1 = df_yr[df_yr['Payment Reduction (random-based)'] == 'Yes']
tdf2 = df_yr[df_yr['Payment Reduction (random-based)'].isin(['Yes', 'No'])]
print(tdf1.shape[0]/tdf2.shape[0])

tdf1 = df_yr[df_yr['Payment Reduction (random-based)'] == 'No']
print(tdf1.shape[0]/tdf2.shape[0])
print(tdf2.shape[0], len(tdf2['Facility ID'].unique()))
print('\n')

## Original payment reduction
tdf1 = df_yr[df_yr['Payment Reduction'] == 'Yes']
tdf2 = df_yr[df_yr['Payment Reduction'].isin(['Yes', 'No'])]
print(tdf1.shape[0]/tdf2.shape[0])

tdf1 = df_yr[df_yr['Payment Reduction'] == 'No']
print(tdf1.shape[0]/tdf2.shape[0])
print(df_yr.shape[0], len(df_yr['Facility ID'].unique()))

0.25
0.75
3056 3056


0.25
0.75
3056 3056


0.24901832460732984
0.7509816753926701
3056 3056


In [10]:
ls1 = list(df_yr)
ls2 = list(holdout_df)
ls = list(filter(lambda x:x in ls1, ls2))
print(df_yr.shape)
print(holdout_df.shape)
df_yr = df_yr.merge(holdout_df, how='outer', on=ls)
print(df_yr.shape)

(3056, 152)
(49, 148)
(3105, 152)


In [11]:
p_np_df = df_yr[(df_yr['Payment Reduction'] == 'Yes') & (df_yr['Payment Reduction (SIS-based)'] == 'No')]
ip_ = np.round(np.nansum(p_np_df['HAC penalty, final']))
print(p_np_df.shape[0])

np_p_df = df_yr[(df_yr['Payment Reduction'] == 'No') & (df_yr['Payment Reduction (SIS-based)'] == 'Yes')]
is_ = np.round(np.nansum(np_p_df['HAC penalty, final']))
print(np_p_df.shape[0])

print(ip_, 'dollars of inappropriate penalties')
print(is_, 'dollars of inappropriate hospital savings')
print('', ip_ - is_)


101
104
67263134.0 dollars of inappropriate penalties
17387310.0 dollars of inappropriate hospital savings
 49875824.0


In [12]:
df_yr.to_pickle('~/GitHub/HACRP-HAIs/data/finalized/final_2022.pkl', protocol=5)