In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import pandas as pd
import warnings
import sys
import numpy as np
import scipy as sc
import random
from scipy import stats
from numpy import log10, sqrt

# Root of the local HACRP-HAIs checkout (absolute, machine-specific path).
# NOTE(review): none of the cells visible in this notebook read `mydir`; the
# loaders spell their paths out in full -- confirm whether it is still needed.
mydir = '/Users/kenlocey/GitHub/HACRP-HAIs/'
# Silence every Python warning for the rest of the session (this includes
# warnings emitted by pandas and numpy).
warnings.filterwarnings('ignore')
# Lift the display limits so wide/long DataFrames are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


def Winsorize_it(x, WinZs):
    """Clamp the values of ``x`` to their 5th-95th percentile range.

    ``WinZs`` is used only as a mask: wherever ``WinZs[i]`` is NaN the
    corresponding value is treated as missing, i.e. it is left out of the
    percentile computation and comes back as NaN.

    Parameters
    ----------
    x : iterable
        Values to winsorize; each one is converted with ``float()``.
    WinZs : sequence
        Indexable by position and at least as long as ``x``; each entry is
        converted with ``float()``.

    Returns
    -------
    list
        One entry per element of ``x``: NaN for missing/masked entries, the
        5th percentile for values below it, the 95th percentile for values
        above it, and the value itself otherwise.
    """
    # Convert once, mask first and value second for every position.
    pairs = [(float(WinZs[pos]), float(raw)) for pos, raw in enumerate(x)]
    masked = [np.nan if np.isnan(mask) else value for mask, value in pairs]

    lower = np.nanpercentile(masked, 5)
    upper = np.nanpercentile(masked, 95)

    def _clamped(values):
        # The three comparisons are mutually exclusive for a non-NaN value;
        # a value matching none of them (only possible when a percentile is
        # itself NaN) yields nothing.
        for value in values:
            if np.isnan(value):
                yield np.nan
            elif value < lower:
                yield lower
            elif value > upper:
                yield upper
            elif lower <= value <= upper:
                yield value

    return list(_clamped(masked))
        

def ZScore_it(x, WinZs):
    """Standardize ``x`` to z-scores, ignoring entries masked by ``WinZs``.

    Wherever ``WinZs[i]`` is NaN the corresponding value is replaced by NaN
    before the mean and (population) standard deviation are computed with
    ``np.nanmean`` / ``np.nanstd``.

    Parameters
    ----------
    x : iterable
        Values to standardize; each one is converted with ``float()``.
    WinZs : sequence
        Indexable by position and at least as long as ``x``; each entry is
        converted with ``float()``.

    Returns
    -------
    numpy.ndarray
        ``(value - mean) / std`` per element, NaN where the value is missing
        or masked.
    """
    # Convert once, mask first and value second for every position.
    pairs = [(float(WinZs[pos]), float(raw)) for pos, raw in enumerate(x)]
    masked = np.array([np.nan if np.isnan(mask) else value for mask, value in pairs])

    center = np.nanmean(masked)
    spread = np.nanstd(masked)
    return (masked - center) / spread

# Month of the HAC file to analyse, kept as a string because it is compared
# against the 'file_month' column when the data are subset.
hac_mo = '10'

## Load HAC file

In [2]:
# Load the compiled HCRIS / HACRP / HAI / RAND table and keep the 2020 file year only.
compiled_pkl = '~/GitHub/HACRP-HAIs/data/Compiled_HCRIS-HACRP-HAI-RAND/Compiled_HCRIS-HACRP-HAI-RAND.pkl'
main_df = pd.read_pickle(compiled_pkl)
main_df = main_df.loc[main_df['file_year'] == '2020'].sort_values(by='Facility ID')

print(main_df.shape[0], 'rows in hac_df')
print(main_df['Facility ID'].unique().size, 'hospitals in hac_df')

# Keep a single row per hospital: after a descending sort on
# ('Facility ID', 'Line 19') the first row of each hospital is the one with
# the largest 'Line 19' value, and that is the row retained.
main_df = (
    main_df
    .sort_values(by=['Facility ID', 'Line 19'], ascending=False)
    .drop_duplicates(subset=['Facility ID'], keep='first')
)

print(main_df.shape)

3218 rows in hac_df
3218 hospitals in hac_df
(3218, 100)


In [3]:
# Per-HAI result pickles for the 2020_10 file date. All four live under the
# same directory and follow the pattern <HAI>/<HAI>_Data_opt_for_SIRs_2020_10.pkl.
opt_dir = '/Users/kenlocey/GitHub/HACRP-HAIs/4_optimize_random_sampling_models/optimized_by_HAI_file_date/'
opt_suffix = '_Data_opt_for_SIRs_2020_10.pkl'

cauti_file, clabsi_file, mrsa_file, cdi_file = (
    opt_dir + hai_name + '/' + hai_name + opt_suffix
    for hai_name in ('CAUTI', 'CLABSI', 'MRSA', 'CDI')
)


In [4]:

def _load_hai_results(hai, path):
    """Load one HAI's result pickle and derive its SIS column.

    Parameters
    ----------
    hai : str
        HAI label used as the column prefix, e.g. 'CAUTI'.
    path : str
        Location of the pickle holding that HAI's results.

    Returns
    -------
    tuple
        ``(results, eligible, deciles)`` where

        * ``results`` is the loaded frame with '<hai> SIS' appended and the
          generic result columns ('O/E', 'simulated O', ...) prefixed with
          '<hai> ';
        * ``eligible`` is the subset of ``results`` with
          '<hai> Predicted Cases' >= 1;
        * ``deciles`` holds the 0th, 10th, ..., 90th percentiles of
          '<hai> SIS' among the eligible rows, ignoring NaN.
    """
    results = pd.read_pickle(path)

    # SIS = (observed cases - expected O) / predicted cases. It is computed
    # before the rename because 'expected O' still has its generic name here.
    results[hai + ' SIS'] = (
        results[hai + ' Observed Cases'] - results['expected O']
    ) / results[hai + ' Predicted Cases']

    generic_cols = ['O/E', 'simulated O', 'simulated O/E', 'expected O',
                    'expected O/E', 'pi_opt', 'z_opt']
    results = results.rename(columns={col: hai + ' ' + col for col in generic_cols})

    eligible = results[results[hai + ' Predicted Cases'] >= 1]
    deciles = np.nanpercentile(eligible[hai + ' SIS'], np.arange(0, 100, 10))
    return results, eligible, deciles


# tdf1 / tdf2 are placeholders only: a later cell deletes them by name.
tdf1, tdf2 = 0, 0

# `tdf` ends up holding the eligible subset of the last HAI loaded (CDI).
cauti_df, tdf, cauti_deciles = _load_hai_results('CAUTI', cauti_file)
clabsi_df, tdf, clabsi_deciles = _load_hai_results('CLABSI', clabsi_file)
mrsa_df, tdf, mrsa_deciles = _load_hai_results('MRSA', mrsa_file)
cdi_df, tdf, cdi_deciles = _load_hai_results('CDI', cdi_file)


In [5]:
# Fold the four per-HAI frames into one wide table. Each outer merge joins on
# every column name the two frames have in common.
opt_df = cauti_df.copy(deep=True)
for hai_df in (clabsi_df, mrsa_df, cdi_df):
    shared_cols = [col for col in hai_df.columns if col in opt_df.columns]
    opt_df = opt_df.merge(hai_df, on=shared_cols, how='outer')

# Drop the per-HAI frames and scratch names now that they are merged.
del hai_df
del tdf, tdf1, tdf2
del cauti_df, clabsi_df
del mrsa_df, cdi_df

# One row per hospital: keep the row with the largest 'Line 19' value.
opt_df = (
    opt_df
    .sort_values(by=['Facility ID', 'Line 19'], ascending=False)
    .drop_duplicates(subset=['Facility ID'], keep='first')
)

opt_df.head()


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,CAUTI derived score,CLABSI derived score,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,AHRQ PSI-90 Footnote,CAUTI Footnote,CLABSI Footnote,Domain 1 Score Footnote,Domain 2 Score Footnote,SSI Footnote,SSI Score,Total HAC Footnote,CDI Footnote,CDI Score,MRSA Footnote,MRSA Score,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,MRSA derived score,CDI derived score,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% 
Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/E,CAUTI simulated O,CAUTI simulated O/E,CAUTI expected O,CAUTI expected O/E,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI O/E,CLABSI simulated O,CLABSI simulated O/E,CLABSI expected O,CLABSI expected O/E,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/E,MRSA simulated O,MRSA simulated O/E,MRSA expected O,MRSA expected O/E,MRSA pi_opt,MRSA z_opt,MRSA SIS,CDI O/E,CDI simulated O,CDI simulated O/E,CDI expected O,CDI expected O/E,CDI pi_opt,CDI z_opt,CDI SIS
0,696699,38244623.0,42456686.0,2,670122,2020,42316645.0,42063981.0,420640.0,41643341.0,424566.86,,,,,,,,2018-12-31,,2017-01-01,HOUSTON METHODIST THE WOODLANDS HOSPITAL,2020.0,TX,-1.1537,10,No,8498.0,4724.0,3774.0,1.0,0.0,3.217,2.792,0.3108,0.0,,,-1.156022,No,1.0,0.002322,0.00312,0.003043,,,,,,,,,,,,,39779.0,35852.0,0.0,13.0,1.497,22.885,0.0,0.5681,,,0.004277,0.000972,,-0.9444,-0.4331,-1.4459,,,-1.4453,5.0,,-1.5,0.3108,-0.94752,0.0,-1.448943,0.0,-1.446272,0.0,0.5681,-0.437377,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,451290.666667,420640.0,0.310849,1.0,0.310849,1.990634,0.618786,0.00144,11416.187813,-0.307937,0.0,1.0,0.358166,1.455376,0.521266,0.000921,5243.040669,-0.521266,0.0,0.0,0.0,0.870192,0.581291,9.8e-05,138133.328249,-0.581291,0.568058,20.0,0.873935,12.630333,0.551904,0.000617,26985.623979,0.016153
1,724899,10464285.0,13147096.0,4,670120,2020,13147096.0,13082201.0,130822.0,12951379.0,131470.96,,,,,,,,2018-12-31,,2017-01-01,THE HOSPITALS OF PROVIDENCE TRANSMOUNTAIN CAMPUS,2020.0,TX,-0.2559,10,No,4201.0,2472.0,1729.0,0.0,0.0,1.734,1.29,0.0,0.0,,,-0.255792,No,1.0,-0.000108,0.003647,0.003043,,,,,,,,,,,5.0,,25794.0,25794.0,0.0,19.0,0.782,11.419,0.0,1.6639,,,-0.007028,,,-1.5354,1.971,-1.4459,,,,,1.231,-1.5,0.0,-1.539047,0.0,-1.448943,,,0.0,1.36798,1.978028,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,141113.25,130822.0,0.0,2.0,1.153403,0.633478,0.365328,0.00144,11416.187813,-0.365328,0.0,0.0,0.0,0.395062,0.30625,0.000921,5243.040669,-0.30625,,,,,,,,,1.663894,9.0,0.78816,7.783571,0.681633,0.000617,26985.623979,0.98226
2,718577,10580679.0,10700766.0,1,670108,2020,10700766.0,11878922.0,118789.0,11760133.0,107007.66,10700760.0,,,,,,,2018-12-31,,2017-01-01,BAYLOR SCOTT & WHITE MEDICAL CENTER - MARBLE F...,2020.0,TX,0.0915,10,No,3092.0,2463.0,629.0,1.0,0.0,1.26,0.392,0.7937,0.0,,,0.08877,No,1.0,0.00273,0.001945,,,,5.0,,,,,,,,5.0,,18443.0,17305.0,1.0,4.0,0.627,10.512,1.5949,0.3805,,,0.008875,,,-0.0265,-0.995,,,,,,0.0707,1.3167,0.7937,-0.028445,,,,,0.0,0.3805,-1.003875,06/30/2018,07/01/2016,107008.0,2020_10,TX,6.0,5.6e-05,,,,,104701.25,118789.0,0.793651,0.0,0.0,0.629281,0.49943,0.00144,11416.187813,0.294221,,,,,,,,,,,,,,,,,0.380518,6.0,0.570776,4.174834,0.397149,0.000617,26985.623979,-0.016632
2553,725557,12032027.0,11805318.0,4,670106,2020,11805318.0,11740624.0,117406.0,11623218.0,118053.18,,,,,,,,2018-12-31,,2017-01-01,HCA HOUSTON HEALTHCARE PEARLAND,2020.0,TX,0.7338,10,Yes,2410.0,1230.0,1180.0,2.0,1.0,0.605,0.683,3.3058,1.4641,,,0.733323,Yes,1.0,0.000477,,,,5.0,5.0,,,,,,,,5.0,,13268.0,12991.0,2.0,4.0,0.52,4.945,3.8462,0.8089,,,0.001331,,,,0.2911,,,,,,-0.0568,1.967,,,,,,,117406.0,0.8089,0.289769,06/30/2018,07/01/2016,,2020_10,TX,,,,,0.0,0.0,83461.833333,117406.0,,,,,,,,,,,,,,,,,,,,,,,,,0.808898,6.0,1.213347,2.606675,0.527133,0.000617,26985.623979,0.281764
2528,724583,9784673.0,9844229.0,4,670103,2020,9844229.0,9821048.0,98210.0,9722838.0,98442.29,,,,,,,,2018-12-31,,2017-01-01,MEDICAL CITY ALLIANCE,2020.0,TX,-0.5062,10,No,4038.0,1806.0,2232.0,1.0,0.0,0.953,1.729,1.0493,0.0,,,-0.505288,No,1.0,-0.000912,,0.003043,,5.0,,,,,,,,,5.0,,26596.0,23416.0,1.0,15.0,0.912,12.035,1.0965,1.2464,,,-0.006592,,,,1.6043,-1.4459,,,,,-0.6831,-1.5,,,0.0,-1.448943,,,0.0,1.2464,1.610892,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,86255.333333,98210.0,,,,,,,,,0.0,1.0,0.578369,0.614059,0.355153,0.000921,5243.040669,-0.355153,,,,,,,,,1.246365,6.0,0.498546,6.717206,0.558139,0.000617,26985.623979,0.688225


In [6]:
# Attach the per-HAI results to the main table, joining on every column name
# the two frames share (listed in opt_df's column order).
shared_cols = [col for col in opt_df.columns if col in main_df.columns]
main_df = main_df.merge(opt_df, on=shared_cols, how='outer')

print(main_df.shape)
main_df.head()

(3218, 132)


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,CAUTI derived score,CLABSI derived score,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,AHRQ PSI-90 Footnote,CAUTI Footnote,CLABSI Footnote,Domain 1 Score Footnote,Domain 2 Score Footnote,SSI Footnote,SSI Score,Total HAC Footnote,CDI Footnote,CDI Score,MRSA Footnote,MRSA Score,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,MRSA derived score,CDI derived score,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% 
Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/E,CAUTI simulated O,CAUTI simulated O/E,CAUTI expected O,CAUTI expected O/E,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI O/E,CLABSI simulated O,CLABSI simulated O/E,CLABSI expected O,CLABSI expected O/E,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/E,MRSA simulated O,MRSA simulated O/E,MRSA expected O,MRSA expected O/E,MRSA pi_opt,MRSA z_opt,MRSA SIS,CDI O/E,CDI simulated O,CDI simulated O/E,CDI expected O,CDI expected O/E,CDI pi_opt,CDI z_opt,CDI SIS
0,687971.0,2070589.0,2339344.0,2.0,670128,2020,2339344.0,2339344.0,23393.0,2315951.0,23393.44,,,,,,,,2018-12-31,,2017-01-01,BAYLOR SCOTT & WHITE MEDICAL CENTER PFLUGERVILLE,2020.0,TX,,10,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,5.0,5.0,,,5.0,,5.0,5.0,,5.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,5.0,,,,,,,,,0.0,,,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,27063.0,23393.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,724123.0,95222.0,88216.0,10.0,670125,2020,88216.0,88216.0,882.0,87334.0,882.16,88216.0,,,,,,,2018-12-31,,2017-01-01,TEXAS CENTER FOR INFECTIOUS DISEASE,2020.0,TX,,10,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,5.0,5.0,,,5.0,,5.0,5.0,,5.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,5.0,,,,,,,,,0.0,,,06/30/2018,07/01/2016,882.0,2020_10,TX,0.0,0.0,,,,,882.0,882.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,699756.0,629255.0,712993.0,4.0,670124,2020,712993.0,712993.0,7130.0,705863.0,7129.93,,,,,,,,2018-12-31,,2017-01-01,THE HOSPITALS OF PROVIDENCE HORIZON CITY CAMPUS,2020.0,TX,,10,No,0.0,5.0,0.0,0.0,0.0,0.003,0.0,0.0,,,,,,,,,,,5.0,5.0,,,5.0,,5.0,5.0,,5.0,,562.0,562.0,0.0,0.0,0.01,0.128,0.0,0.0,,,,,,,,,,,,5.0,,,,,,,,,0.0,,,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,8032.333333,7130.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,696699.0,38244623.0,42456686.0,2.0,670122,2020,42316645.0,42063981.0,420640.0,41643341.0,424566.86,,,,,,,,2018-12-31,,2017-01-01,HOUSTON METHODIST THE WOODLANDS HOSPITAL,2020.0,TX,-1.1537,10,No,8498.0,4724.0,3774.0,1.0,0.0,3.217,2.792,0.3108,0.0,,,-1.156022,No,1.0,0.002322,0.00312,0.003043,,,,,,,,,,,,,39779.0,35852.0,0.0,13.0,1.497,22.885,0.0,0.5681,,,0.004277,0.000972,,-0.9444,-0.4331,-1.4459,,,-1.4453,5.0,,-1.5,0.3108,-0.94752,0.0,-1.448943,0.0,-1.446272,0.0,0.5681,-0.437377,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,451290.666667,420640.0,0.310849,1.0,0.310849,1.990634,0.618786,0.00144,11416.187813,-0.307937,0.0,1.0,0.358166,1.455376,0.521266,0.000921,5243.040669,-0.521266,0.0,0.0,0.0,0.870192,0.581291,9.8e-05,138133.328249,-0.581291,0.568058,20.0,0.873935,12.630333,0.551904,0.000617,26985.623979,0.016153
4,,,,,670121,2020,0.0,0.0,0.0,0.0,0.0,,,,,,,,2018-12-31,,2017-01-01,SAINT CAMILLUS MEDICAL CENTER,2020.0,TX,0.0487,10,No,67.0,62.0,5.0,0.0,0.0,0.03,0.004,0.0,0.0,,,0.0487,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,157.0,157.0,0.0,0.0,0.002,0.025,0.0,0.0,,,,,,,,,,,,,0.0487,,,,,,,,0.0,,,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,2980.0,2980.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
print(main_df['file_year'].unique())

# Subset to the 2020 file year AND the selected file month in one step, then
# take an independent copy so later edits never touch main_df.
# (Previously the year-filtered frame was overwritten by a plain copy of
# main_df on the next line, so the year filter had no effect.)
hac_df = main_df[(main_df['file_year'] == '2020') & (main_df['file_month'] == hac_mo)].copy(deep=True)

# Drop columns that are entirely empty for this subset.
hac_df = hac_df.dropna(how='all', axis=1)
hac_df.head()

['2020']


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),End Date,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,CAUTI Footnote,CLABSI Footnote,SSI Footnote,Total HAC Footnote,CDI Footnote,MRSA Footnote,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/E,CAUTI simulated O,CAUTI simulated O/E,CAUTI expected O,CAUTI expected O/E,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI 
O/E,CLABSI simulated O,CLABSI simulated O/E,CLABSI expected O,CLABSI expected O/E,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/E,MRSA simulated O,MRSA simulated O/E,MRSA expected O,MRSA expected O/E,MRSA pi_opt,MRSA z_opt,MRSA SIS,CDI O/E,CDI simulated O,CDI simulated O/E,CDI expected O,CDI expected O/E,CDI pi_opt,CDI z_opt,CDI SIS
0,687971.0,2070589.0,2339344.0,2.0,670128,2020,2339344.0,2339344.0,23393.0,2315951.0,23393.44,,2018-12-31,2017-01-01,BAYLOR SCOTT & WHITE MEDICAL CENTER PFLUGERVILLE,2020.0,TX,,10,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,5.0,,,,,,,,,0.0,,,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,27063.0,23393.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,724123.0,95222.0,88216.0,10.0,670125,2020,88216.0,88216.0,882.0,87334.0,882.16,88216.0,2018-12-31,2017-01-01,TEXAS CENTER FOR INFECTIOUS DISEASE,2020.0,TX,,10,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,5.0,,,,,,,,,0.0,,,06/30/2018,07/01/2016,882.0,2020_10,TX,0.0,0.0,,,,,882.0,882.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,699756.0,629255.0,712993.0,4.0,670124,2020,712993.0,712993.0,7130.0,705863.0,7129.93,,2018-12-31,2017-01-01,THE HOSPITALS OF PROVIDENCE HORIZON CITY CAMPUS,2020.0,TX,,10,No,0.0,5.0,0.0,0.0,0.0,0.003,0.0,0.0,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,562.0,562.0,0.0,0.0,0.01,0.128,0.0,0.0,,,,,,,,5.0,,,,,,,,,0.0,,,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,8032.333333,7130.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,696699.0,38244623.0,42456686.0,2.0,670122,2020,42316645.0,42063981.0,420640.0,41643341.0,424566.86,,2018-12-31,2017-01-01,HOUSTON METHODIST THE WOODLANDS HOSPITAL,2020.0,TX,-1.1537,10,No,8498.0,4724.0,3774.0,1.0,0.0,3.217,2.792,0.3108,0.0,-1.156022,No,1.0,0.002322,0.00312,0.003043,,,,,,,39779.0,35852.0,0.0,13.0,1.497,22.885,0.0,0.5681,0.004277,0.000972,,-0.9444,-0.4331,-1.4459,-1.4453,5.0,,-1.5,0.3108,-0.94752,0.0,-1.448943,0.0,-1.446272,0.0,0.5681,-0.437377,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,451290.666667,420640.0,0.310849,1.0,0.310849,1.990634,0.618786,0.00144,11416.187813,-0.307937,0.0,1.0,0.358166,1.455376,0.521266,0.000921,5243.040669,-0.521266,0.0,0.0,0.0,0.870192,0.581291,9.8e-05,138133.328249,-0.581291,0.568058,20.0,0.873935,12.630333,0.551904,0.000617,26985.623979,0.016153
4,,,,,670121,2020,0.0,0.0,0.0,0.0,0.0,,2018-12-31,2017-01-01,SAINT CAMILLUS MEDICAL CENTER,2020.0,TX,0.0487,10,No,67.0,62.0,5.0,0.0,0.0,0.03,0.004,0.0,0.0,0.0487,No,1.0,0.0,,,5.0,5.0,5.0,,5.0,5.0,157.0,157.0,0.0,0.0,0.002,0.025,0.0,0.0,,,,,,,,,0.0487,,,,,,,,0.0,,,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,2980.0,2980.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Attempt to reproduce HAC scores for 2020

In [8]:
# Rows belonging to the 2020 file year.
is_2020 = hac_df['file_year'] == '2020'
df_yr = hac_df.loc[is_2020]
df_yr.head()

Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),End Date,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,CAUTI Footnote,CLABSI Footnote,SSI Footnote,Total HAC Footnote,CDI Footnote,MRSA Footnote,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/E,CAUTI simulated O,CAUTI simulated O/E,CAUTI expected O,CAUTI expected O/E,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI 
O/E,CLABSI simulated O,CLABSI simulated O/E,CLABSI expected O,CLABSI expected O/E,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/E,MRSA simulated O,MRSA simulated O/E,MRSA expected O,MRSA expected O/E,MRSA pi_opt,MRSA z_opt,MRSA SIS,CDI O/E,CDI simulated O,CDI simulated O/E,CDI expected O,CDI expected O/E,CDI pi_opt,CDI z_opt,CDI SIS
0,687971.0,2070589.0,2339344.0,2.0,670128,2020,2339344.0,2339344.0,23393.0,2315951.0,23393.44,,2018-12-31,2017-01-01,BAYLOR SCOTT & WHITE MEDICAL CENTER PFLUGERVILLE,2020.0,TX,,10,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,5.0,,,,,,,,,0.0,,,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,27063.0,23393.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,724123.0,95222.0,88216.0,10.0,670125,2020,88216.0,88216.0,882.0,87334.0,882.16,88216.0,2018-12-31,2017-01-01,TEXAS CENTER FOR INFECTIOUS DISEASE,2020.0,TX,,10,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,5.0,,,,,,,,,0.0,,,06/30/2018,07/01/2016,882.0,2020_10,TX,0.0,0.0,,,,,882.0,882.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,699756.0,629255.0,712993.0,4.0,670124,2020,712993.0,712993.0,7130.0,705863.0,7129.93,,2018-12-31,2017-01-01,THE HOSPITALS OF PROVIDENCE HORIZON CITY CAMPUS,2020.0,TX,,10,No,0.0,5.0,0.0,0.0,0.0,0.003,0.0,0.0,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,562.0,562.0,0.0,0.0,0.01,0.128,0.0,0.0,,,,,,,,5.0,,,,,,,,,0.0,,,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,8032.333333,7130.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,696699.0,38244623.0,42456686.0,2.0,670122,2020,42316645.0,42063981.0,420640.0,41643341.0,424566.86,,2018-12-31,2017-01-01,HOUSTON METHODIST THE WOODLANDS HOSPITAL,2020.0,TX,-1.1537,10,No,8498.0,4724.0,3774.0,1.0,0.0,3.217,2.792,0.3108,0.0,-1.156022,No,1.0,0.002322,0.00312,0.003043,,,,,,,39779.0,35852.0,0.0,13.0,1.497,22.885,0.0,0.5681,0.004277,0.000972,,-0.9444,-0.4331,-1.4459,-1.4453,5.0,,-1.5,0.3108,-0.94752,0.0,-1.448943,0.0,-1.446272,0.0,0.5681,-0.437377,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,451290.666667,420640.0,0.310849,1.0,0.310849,1.990634,0.618786,0.00144,11416.187813,-0.307937,0.0,1.0,0.358166,1.455376,0.521266,0.000921,5243.040669,-0.521266,0.0,0.0,0.0,0.870192,0.581291,9.8e-05,138133.328249,-0.581291,0.568058,20.0,0.873935,12.630333,0.551904,0.000617,26985.623979,0.016153
4,,,,,670121,2020,0.0,0.0,0.0,0.0,0.0,,2018-12-31,2017-01-01,SAINT CAMILLUS MEDICAL CENTER,2020.0,TX,0.0487,10,No,67.0,62.0,5.0,0.0,0.0,0.03,0.004,0.0,0.0,0.0487,No,1.0,0.0,,,5.0,5.0,5.0,,5.0,5.0,157.0,157.0,0.0,0.0,0.002,0.025,0.0,0.0,,,,,,,,,0.0487,,,,,,,,0.0,,,06/30/2018,07/01/2016,,2020_10,TX,,,,,,,2980.0,2980.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Generate Winsorized z-scores

In [9]:
hais = ['CAUTI', 'CLABSI', 'MRSA', 'CDI']

# Footnote codes as they occur in the data: numeric, or strings that may carry
# a stray leading/trailing blank.
max_score_footnotes = [18, '18', '18 ', ' 18']
no_score_footnotes = [5, '5', ' 5', '5 ', 4, '4', ' 4', '4 ']

for hai in hais:
    footnote = df_yr[hai + ' Footnote']

    # Hospitals without a special footnote: winsorize their SIS values and
    # convert them to z-scores. Entries whose reported W Z Score is NaN are
    # masked out inside Winsorize_it / ZScore_it.
    # .copy() makes each subset an independent frame, so the column
    # assignments below are not chained assignments on a slice of df_yr.
    tdf2 = df_yr[~footnote.isin(max_score_footnotes + no_score_footnotes)].copy()
    reported_winZ = tdf2[hai + ' W Z Score'].tolist()
    sis_values = tdf2[hai + ' SIS'].tolist()
    tdf2[hai + ' Winsorized SIS'] = Winsorize_it(sis_values, reported_winZ)
    tdf2[hai + ' SIS W Z Score'] = ZScore_it(tdf2[hai + ' Winsorized SIS'], reported_winZ)

    # Assign maximum WinZ scores to hospitals with HAI footnote 18
    maxWinZ = np.nanmax(tdf2[hai + ' SIS W Z Score'])
    tdf3 = df_yr[footnote.isin(max_score_footnotes)].copy()
    # Scalar assignment keeps these columns float64 even when the subset is empty.
    tdf3[hai + ' Winsorized SIS'] = np.nan
    tdf3[hai + ' SIS W Z Score'] = maxWinZ

    # Hospitals with footnote 4 or 5 get no SIS-based score for this HAI.
    tdf4 = df_yr[footnote.isin(no_score_footnotes)].copy()
    tdf4[hai + ' Winsorized SIS'] = np.nan
    tdf4[hai + ' SIS W Z Score'] = np.nan

    # The three subsets partition df_yr; stitch them back together so the next
    # HAI sees the columns added for this one.
    df_yr = pd.concat([tdf2, tdf3, tdf4], axis=0)

del tdf2
del tdf3

## Compute SIS-based total HAC scores and penalty assignments for 2020

In [10]:
print('Results from attempting to reproduce Yes/No penalty assignments:\n')
print('Excluded from results below:')
print('1. MD hospitals')
print('2. Hospitals with payment reduction values other than Yes or No\n')

# A hospital is scored only if it is outside MD, has a Yes/No payment
# reduction and has a reported Total HAC Score. Everything else is held out
# and merged back in later.
is_md = df_yr['State'] == 'MD'
has_yes_no = df_yr['Payment Reduction'].isin(['Yes', 'No'])
has_total = df_yr['Total HAC Score'].notna()
scorable = ~is_md & has_yes_no & has_total

holdout_df = df_yr[~scorable]
# Independent copy, because a new column is added to it below.
df_yr = df_yr[scorable].copy()

# Measures averaged into the total score: the reported PSI-90 and SSI
# Winsorized z-scores plus the SIS-based z-scores of the four HAIs.
m_ls = ['PSI-90 W Z Score', 'CDI SIS W Z Score', 'CAUTI SIS W Z Score', 'CLABSI SIS W Z Score', 'MRSA SIS W Z Score', 'SSI W Z Score']

# Total score = mean of the measures that are present for the hospital, NaN
# when none is present. Computed row-wise in one pass instead of re-filtering
# the frame once per hospital; the two are equivalent as long as each
# 'Facility ID' appears once (earlier cells drop duplicate facilities).
hac_scores = df_yr[m_ls].astype(float).mean(axis=1, skipna=True).tolist()

print('holdout_df.shape:', holdout_df.shape)
df_yr['Total HAC Score (SIS-based)'] = hac_scores
print(df_yr.shape[0], 'hospitals in hac_df')


Results from attempting to reproduce Yes/No penalty assignments:

Excluded from results below:
1. MD hospitals
2. Hospitals with payment reduction values other than Yes or No

holdout_df.shape: (69, 121)
3149 hospitals in hac_df


In [11]:
p75 = np.nanpercentile(df_yr['Total HAC Score (SIS-based)'], 75)
print('p75:', p75)

# A hospital is penalized when its SIS-based total score is strictly above the
# 75th percentile. NaN scores compare False and therefore get 'No'.
pr = np.where(df_yr['Total HAC Score (SIS-based)'] > p75, 'Yes', 'No').tolist()
df_yr['Payment Reduction (SIS-based)'] = pr

o_list = df_yr['Payment Reduction'].tolist()
d_list = df_yr['Payment Reduction (SIS-based)'].tolist()

# 1 where the reported and the SIS-based assignment agree, 0 otherwise.
res_ls = [int(o == d) for o, d in zip(o_list, d_list)]
same = sum(res_ls)
diff = len(res_ls) - same

# Direction of each disagreement.
p_to_np = sum(o == 'Yes' and d == 'No' for o, d in zip(o_list, d_list))
np_to_p = sum(o == 'No' and d == 'Yes' for o, d in zip(o_list, d_list))

# Every disagreement must be Yes->No or No->Yes; anything else means a value
# other than 'Yes'/'No' slipped into the reported column.
if p_to_np + np_to_p != diff:
    print('Error')

print(same, "SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.")
print(diff, "SIR-based penalty assignments were NOT the same as their corresponding SIS-based penalty assignments.")
print(str(np.round(100 * same/(same+diff),2)) + '% of SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.\n')

print(p_to_np, 'hospitals were penalized but should not have been.')
print(np_to_p, 'hospitals were NOT penalized but should have been.')

p75: 0.3330960933499678
2870 SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.
279 SIR-based penalty assignments were NOT the same as their corresponding SIS-based penalty assignments.
91.14% of SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments."

139 hospitals were penalized but should not have been.
140 hospitals were NOT penalized but should have been.


In [12]:
# Share of scored hospitals with each SIS-based penalty outcome ('Yes' first,
# then 'No').
for outcome in ('Yes', 'No'):
    tdf = df_yr.loc[df_yr['Payment Reduction (SIS-based)'] == outcome]
    print(len(tdf) / len(df_yr))


0.2499206097173706
0.7500793902826294


In [13]:
# Put the held-out hospitals back next to the scored ones, joining on every
# column name the two frames share (listed in holdout_df's column order).
shared_cols = [col for col in holdout_df.columns if col in df_yr.columns]
print(df_yr.shape)
print(holdout_df.shape)
df_yr = df_yr.merge(holdout_df, how='outer', on=shared_cols)
print(df_yr.shape)

(3149, 123)
(69, 121)
(3218, 123)


In [14]:
reported = df_yr['Payment Reduction']
sis_based = df_yr['Payment Reduction (SIS-based)']

# Reported as penalized, but not penalized under the SIS-based scores.
p_np_df = df_yr[(reported == 'Yes') & (sis_based == 'No')]
ip_ = np.round(np.nansum(p_np_df['HAC penalty, final']))
print(len(p_np_df))

# Reported as not penalized, but penalized under the SIS-based scores.
np_p_df = df_yr[(reported == 'No') & (sis_based == 'Yes')]
is_ = np.round(np.nansum(np_p_df['HAC penalty, final']))
print(len(np_p_df))

print(ip_, 'dollars of inappropriate penalties')
print(is_, 'dollars of inappropriate hospital savings')
# Difference between the two totals.
print('', ip_ - is_)


139
140
121792520.0 dollars of inappropriate penalties
17067746.0 dollars of inappropriate hospital savings
 104724774.0


In [15]:
# Write the final 2020 table (scored hospitals plus the merged-back hold-outs)
# to disk. Pickle protocol 5 requires Python 3.8 or newer to read back.
df_yr.to_pickle('~/GitHub/HACRP-HAIs/data/finalized/final_2020.pkl', protocol=5)