In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import pandas as pd
import warnings
import sys
import numpy as np
import scipy as sc
import random
from scipy import stats
from numpy import log10, sqrt

# Root of the local repository checkout; used below to locate the per-HAI
# optimized-model pickle files.
# NOTE(review): absolute, user-specific path -- the notebook will not run on
# another machine without editing this value.
mydir = '/Users/kenlocey/GitHub/HACRP-HAIs/'
# Suppress every warning for the session.
# NOTE(review): this also hides pandas chained-assignment warnings.
warnings.filterwarnings('ignore')
# Remove pandas' display truncation so all columns / rows are shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def def_display_df_misses(df):
    """Return diagnostic columns for rows whose payment reduction was not reproduced.

    Rows are kept when 'Payment Reduction Reproduced?' equals 0. The result is
    then limited to the date, footnote and score columns assembled below;
    requested columns that are absent from ``df`` are simply left out.
    """
    wanted = ['file_year', 'HAI Measures End Date', 'HAI Measures Start Date']

    # Each measure contributes its footnote plus reported and SIR-based W Z scores.
    for measure in ['CAUTI', 'CDI', 'CLABSI', 'MRSA', 'SSI', 'PSI-90']:
        wanted.append(measure + ' Footnote')
        wanted.append(measure + ' W Z Score')
        wanted.append(measure + ' SIR W Z Score')

    wanted.extend([
        'Total HAC Footnote', 'Total HAC Score', 'Total HAC Score (derived)',
        'Payment Reduction Footnote', 'Payment Reduction',
        'Payment Reduction (derived)', 'Payment Reduction Reproduced?',
    ])

    not_reproduced = df['Payment Reduction Reproduced?'] == 0
    misses = df[not_reproduced]
    return misses.filter(items=wanted)


def Winsorize_it(x, WinZs):
    """Winsorize ``x`` at its 5th and 95th percentiles.

    A value takes part only when the entry of ``WinZs`` at the same position is
    not NaN; every other position is treated as missing. The percentiles are
    computed over the participating, non-NaN values, and each such value is
    clamped into [p5, p95].

    Returns a list with one entry per element of ``x``; positions that are
    masked out or NaN in ``x`` hold NaN.
    """
    values = [float(raw) for raw in x]

    # Blank out positions that have no accompanying WinZs value.
    masked = []
    for pos, val in enumerate(values):
        masked.append(np.nan if np.isnan(float(WinZs[pos])) else val)

    lower = np.nanpercentile(masked, 5)
    upper = np.nanpercentile(masked, 95)

    WinScores = []
    for val in masked:
        if np.isnan(val):
            # Masked out above, or missing in x itself.
            WinScores.append(np.nan)
        else:
            WinScores.append(min(max(val, lower), upper))

    return WinScores
        

def ZScore_it(x, WinZs):
    """Standardize ``x`` to z-scores, ignoring masked-out positions.

    Positions where the matching ``WinZs`` entry is NaN are set to NaN before
    the mean and (population) standard deviation are taken with NaNs ignored.

    Returns a NumPy array of the same length as ``x``; masked or missing
    positions are NaN.
    """
    values = [float(raw) for raw in x]
    masked = np.array([
        np.nan if np.isnan(float(WinZs[pos])) else val
        for pos, val in enumerate(values)
    ])

    centre = np.nanmean(masked)
    spread = np.nanstd(masked)
    return (masked - centre) / spread

# HAC file month to analyze (string); compared against the 'file_month'
# column when hac_df is built.
hac_mo = '10'

## Load HAC file

In [2]:
# Load the compiled HCRIS / HACRP / HAI / RAND table and keep the 2019 file year.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
main_df = pd.read_pickle('~/GitHub/HACRP-HAIs/data/Compiled_HCRIS-HACRP-HAI-RAND/Compiled_HCRIS-HACRP-HAI-RAND.pkl')
main_df = main_df[main_df['file_year'] == '2019']
# Reassign instead of sorting in place: main_df is a filtered selection here,
# and in-place mutation of a selection is a chained-assignment hazard.
main_df = main_df.sort_values(by='Facility ID')

print(main_df.shape[0], 'rows in hac_df')
print(len(main_df['Facility ID'].unique()), 'hospitals in hac_df')

# Keep exactly one row per hospital. Rows are sorted in descending order of
# ('Facility ID', 'Line 19'), so for each 'Facility ID' the row with the
# largest 'Line 19' comes first (missing values sort last) and is the one kept.
main_df = main_df.sort_values(by=['Facility ID', 'Line 19'], ascending=False)
main_df = main_df.drop_duplicates(subset=['Facility ID'], keep='first')

print(main_df.shape)

3274 rows in hac_df
3274 hospitals in hac_df
(3274, 100)


In [3]:
import glob

# One sorted list of optimized-model pickle paths per HAI, in the order of `hais`.
hais = ['CAUTI', 'CLABSI', 'CDI', 'MRSA']
file_ls = [
    sorted(glob.glob(mydir + "4_optimize_random_sampling_models/optimized_by_HAI_file_date/" + hai + "/*pkl"))
    for hai in hais
]

In [4]:
# Optimized random-sampling model output for each HAI, for the 2019_10 file date.
# The paths are derived from `mydir` so the repository root is set in one place
# only (the same directory is globbed from `mydir` in the previous cell).
opt_dir = mydir + '4_optimize_random_sampling_models/optimized_by_HAI_file_date/'

cauti_file = opt_dir + 'CAUTI/CAUTI_Data_opt_for_SIRs_2019_10.pkl'
clabsi_file = opt_dir + 'CLABSI/CLABSI_Data_opt_for_SIRs_2019_10.pkl'

mrsa_file = opt_dir + 'MRSA/MRSA_Data_opt_for_SIRs_2019_10.pkl'
cdi_file = opt_dir + 'CDI/CDI_Data_opt_for_SIRs_2019_10.pkl'


In [5]:

def _load_opt_df(path, hai):
    """Load one HAI's optimized-model pickle and prepare it for merging.

    Steps:
      1. Read the pickled DataFrame at ``path``.
      2. Add '<hai> SIS' = ('<hai> Observed Cases' - 'expected O') / '<hai> Predicted Cases'.
      3. Prefix the generic model columns ('O/E', 'simulated O', ..., 'z_opt')
         with the HAI name so the four HAI frames can share one table.
      4. Compute the 0th, 10th, ..., 90th percentiles of '<hai> SIS' over rows
         with '<hai> Predicted Cases' >= 1, ignoring NaNs.

    Returns
    -------
    (DataFrame, ndarray)
        The prepared frame and the array of ten percentile values.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle(path)
    df[hai + ' SIS'] = (df[hai + ' Observed Cases'] - df['expected O']) / df[hai + ' Predicted Cases']

    generic_cols = ['O/E', 'simulated O', 'simulated O/E', 'expected O',
                    'expected O/E', 'pi_opt', 'z_opt']
    df = df.rename(columns={col: hai + ' ' + col for col in generic_cols})

    eligible = df[df[hai + ' Predicted Cases'] >= 1]
    deciles = np.nanpercentile(eligible[hai + ' SIS'], np.arange(0, 100, 10))
    return df, deciles


# Scratch names are bound here so that the cleanup `del tdf, tdf1, tdf2`
# in the merge cell succeeds.
tdf, tdf1, tdf2 = 0, 0, 0

cauti_df, cauti_deciles = _load_opt_df(cauti_file, 'CAUTI')
clabsi_df, clabsi_deciles = _load_opt_df(clabsi_file, 'CLABSI')
mrsa_df, mrsa_deciles = _load_opt_df(mrsa_file, 'MRSA')
cdi_df, cdi_deciles = _load_opt_df(cdi_file, 'CDI')


In [6]:
# Fold the four per-HAI frames into a single table. Every step is an outer
# join on all columns the two frames have in common.
opt_df = cauti_df.copy(deep=True)
for next_df in (clabsi_df, mrsa_df, cdi_df):
    left_cols = list(opt_df)
    shared_cols = [col for col in list(next_df) if col in left_cols]
    opt_df = opt_df.merge(next_df, how='outer', on=shared_cols)

# Drop the scratch names and the per-HAI frames now that they are merged.
del next_df
del tdf, tdf1, tdf2
del cauti_df, clabsi_df
del mrsa_df, cdi_df

# One row per hospital: after the descending sort the first row of each
# 'Facility ID' is the one with the largest 'Line 19'.
opt_df = opt_df.sort_values(by=['Facility ID', 'Line 19'], ascending=False)
opt_df = opt_df.drop_duplicates(subset=['Facility ID'], keep='first')

opt_df.head()


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,CAUTI derived score,CLABSI derived score,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,AHRQ PSI-90 Footnote,CAUTI Footnote,CLABSI Footnote,Domain 1 Score Footnote,Domain 2 Score Footnote,SSI Footnote,SSI Score,Total HAC Footnote,CDI Footnote,CDI Score,MRSA Footnote,MRSA Score,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,MRSA derived score,CDI derived score,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% 
Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/E,CAUTI simulated O,CAUTI simulated O/E,CAUTI expected O,CAUTI expected O/E,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI O/E,CLABSI simulated O,CLABSI simulated O/E,CLABSI expected O,CLABSI expected O/E,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/E,MRSA simulated O,MRSA simulated O/E,MRSA expected O,MRSA expected O/E,MRSA pi_opt,MRSA z_opt,MRSA SIS,CDI O/E,CDI simulated O,CDI simulated O/E,CDI expected O,CDI expected O/E,CDI pi_opt,CDI z_opt,CDI SIS
2594,704940,34163505.0,41940876.0,2,670122,2019,41940876.0,41822732.0,418227.0,41404505.0,419408.76,,,,,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,HOUSTON METHODIST THE WOODLANDS HOSPITAL,2019.0,TX,,10,No,2119.0,1258.0,861.0,0.0,0.0,0.862,0.602,0.0,0.0,,,,,,,,,,5.0,5.0,,,5.0,,5.0,5.0,,5.0,,10424.0,8311.0,0.0,5.0,0.455,5.016,0.0,0.9968,,,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,,,,2019_10,TX,,,,,,,451290.666667,418227.0,,,,,,,,,,,,,,,,,,,,,,,,,0.99681,0.0,0.0,1.326444,0.264443,0.000746,30520.044057,0.732368
2595,710001,11912495.0,15923070.0,4,670120,2019,15923070.0,15862213.0,158622.0,15703591.0,159230.7,,,,,06/30/2017,,10/01/2015,2017-12-31,1.1256,2016-01-01,THE HOSPITALS OF PROVIDENCE TRANSMOUNTAIN CAMPUS,2019.0,TX,1.1256,10,Yes,1646.0,1026.0,620.0,0.0,0.0,0.758,0.475,0.0,0.0,,,1.130864,Yes,1.0,-0.005264,,,,5.0,5.0,,,5.0,,,,,5.0,,8715.0,8715.0,0.0,4.0,0.253,3.281,0.0,1.2191,,,-0.005264,,,,1.1256,,5.0,,,5.0,,,,,,,,,105391.0,1.2191,1.130864,,,,2019_10,TX,,,,,50.508108,53231.0,141113.25,158622.0,,,,,,,,,,,,,,,,,,,,,,,,,1.219141,2.0,0.60957,1.443517,0.439963,0.000746,30520.044057,0.779178
2596,705077,8003429.0,8247780.0,1,670108,2019,8247780.0,9150870.0,91509.0,9059361.0,82477.8,,,,,06/30/2017,-0.1216,10/01/2015,2017-12-31,-0.2377,2016-01-01,BAYLOR SCOTT & WHITE MEDICAL CENTER - MARBLE F...,2019.0,TX,-0.2203,10,No,2280.0,1819.0,461.0,1.0,0.0,0.931,0.285,1.0741,0.0,,,-0.220038,No,1.0,-0.000262,,,,5.0,5.0,,,,,,,,5.0,,14756.0,14120.0,1.0,6.0,0.462,8.255,2.1645,0.7268,,,-0.000481,,,,-0.2338,,,,,,-0.1216,-0.2415,,,,,,,0.0,0.7268,-0.233319,,,,2019_10,TX,,,,,,,104701.25,91509.0,,,,,,,,,,,,,,,,,,,,,,,,,0.726832,4.0,0.484555,3.330474,0.403449,0.000746,30520.044057,0.323383
2597,722711,10363160.0,9190282.0,4,670106,2019,9190282.0,9192784.0,91928.0,9100856.0,91902.82,,,,,06/30/2017,0.0432,10/01/2015,2017-12-31,1.3327,2016-01-01,PEARLAND MEDICAL CENTER,2019.0,TX,1.1393,10,Yes,1637.0,1105.0,532.0,1.0,1.0,0.6,0.27,1.6667,3.7037,,,1.138715,Yes,1.0,0.000585,,,,5.0,5.0,,,,,,,,5.0,,11041.0,9853.0,1.0,9.0,0.265,4.328,3.7736,2.0795,,,0.001318,,,,1.9711,,,,,,0.0432,0.6943,,,,,,,91928.0,1.521845,1.969782,,,,2019_10,TX,,,,,0.0,0.0,83461.833333,91928.0,,,,,,,,,,,,,,,,,,,,,,,,,2.079482,2.0,0.462107,1.793109,0.414304,0.000746,30520.044057,1.665178
2574,723642,11651056.0,9945359.0,4,670103,2019,9945359.0,9891775.0,98918.0,9792857.0,99453.59,,,,,06/30/2017,-0.0983,10/01/2015,2017-12-31,-0.4126,2016-01-01,MEDICAL CITY ALLIANCE,2019.0,TX,-0.3655,10,No,4163.0,1517.0,2646.0,0.0,0.0,0.797,2.342,0.0,0.0,,,-0.365711,No,1.0,0.000211,,0.004726,,5.0,,,,,,,,,5.0,,20923.0,17792.0,2.0,11.0,0.771,9.2,2.594,1.1957,,,-0.003921,,,,1.0621,-1.4878,,,,,-0.0983,-0.8122,,,0.0,-1.492526,,,0.0,1.1957,1.066021,,,,2019_10,TX,,,,,,,86255.333333,98918.0,,,,,,,,,0.0,1.0,0.426985,0.948557,0.40502,0.000914,4102.767193,-0.40502,,,,,,,,,1.195652,4.0,0.434783,4.886022,0.531089,0.000746,30520.044057,0.664563


In [7]:
# Attach the optimized-model columns to the main table with an outer join
# on every column the two frames share.
main_cols = list(main_df)
shared_cols = [col for col in list(opt_df) if col in main_cols]
main_df = main_df.merge(opt_df, how='outer', on=shared_cols)

print(main_df.shape)
main_df.head()

(3274, 132)


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,CAUTI derived score,CLABSI derived score,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,AHRQ PSI-90 Footnote,CAUTI Footnote,CLABSI Footnote,Domain 1 Score Footnote,Domain 2 Score Footnote,SSI Footnote,SSI Score,Total HAC Footnote,CDI Footnote,CDI Score,MRSA Footnote,MRSA Score,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,MRSA derived score,CDI derived score,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% 
Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/E,CAUTI simulated O,CAUTI simulated O/E,CAUTI expected O,CAUTI expected O/E,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI O/E,CLABSI simulated O,CLABSI simulated O/E,CLABSI expected O,CLABSI expected O/E,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/E,MRSA simulated O,MRSA simulated O/E,MRSA expected O,MRSA expected O/E,MRSA pi_opt,MRSA z_opt,MRSA SIS,CDI O/E,CDI simulated O,CDI simulated O/E,CDI expected O,CDI expected O/E,CDI pi_opt,CDI z_opt,CDI SIS
0,707152.0,448455.0,543174.0,4.0,670124,2019,543174.0,543174.0,5432.0,537742.0,5431.74,,,,,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,THE HOSPITALS OF PROVIDENCE HORIZON CITY CAMPUS,2019.0,TX,,10,No,0.0,5.0,0.0,0.0,0.0,0.003,0.0,0.0,,,,,,,,,,,5.0,5.0,,,5.0,,5.0,5.0,,5.0,,62.0,62.0,0.0,0.0,0.001,0.011,0.0,0.0,,,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,,,,2019_10,TX,,,,,,,8032.333333,5432.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,704940.0,34163505.0,41940876.0,2.0,670122,2019,41940876.0,41822732.0,418227.0,41404505.0,419408.76,,,,,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,HOUSTON METHODIST THE WOODLANDS HOSPITAL,2019.0,TX,,10,No,2119.0,1258.0,861.0,0.0,0.0,0.862,0.602,0.0,0.0,,,,,,,,,,5.0,5.0,,,5.0,,5.0,5.0,,5.0,,10424.0,8311.0,0.0,5.0,0.455,5.016,0.0,0.9968,,,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,,,,2019_10,TX,,,,,,,451290.666667,418227.0,,,,,,,,,,,,,,,,,,,,,,,,,0.99681,0.0,0.0,1.326444,0.264443,0.000746,30520.044057,0.732368
2,704939.0,273288.0,297961.0,4.0,670121,2019,297961.0,297961.0,2980.0,294981.0,2979.61,,,,,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,SAINT CAMILLUS MEDICAL CENTER,2019.0,TX,,10,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,5.0,5.0,,,5.0,,5.0,5.0,,5.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,,,,2019_10,TX,,,,,,,2980.0,2980.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,710001.0,11912495.0,15923070.0,4.0,670120,2019,15923070.0,15862213.0,158622.0,15703591.0,159230.7,,,,,06/30/2017,,10/01/2015,2017-12-31,1.1256,2016-01-01,THE HOSPITALS OF PROVIDENCE TRANSMOUNTAIN CAMPUS,2019.0,TX,1.1256,10,Yes,1646.0,1026.0,620.0,0.0,0.0,0.758,0.475,0.0,0.0,,,1.130864,Yes,1.0,-0.005264,,,,5.0,5.0,,,5.0,,,,,5.0,,8715.0,8715.0,0.0,4.0,0.253,3.281,0.0,1.2191,,,-0.005264,,,,1.1256,,5.0,,,5.0,,,,,,,,,105391.0,1.2191,1.130864,,,,2019_10,TX,,,,,50.508108,53231.0,141113.25,158622.0,,,,,,,,,,,,,,,,,,,,,,,,,1.219141,2.0,0.60957,1.443517,0.439963,0.000746,30520.044057,0.779178
4,,,,,670119,2019,0.0,0.0,0.0,0.0,0.0,,,,,06/30/2017,,10/01/2015,2017-12-31,2.1342,2016-01-01,PROVIDENCE HOSPITAL OF NORTH HOUSTON LLC,2019.0,TX,2.1342,10,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,2.130279,Yes,1.0,0.003921,0.01387,0.004769,,18.0,18.0,,,18.0,,18.0,18.0,,18.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.001318,-0.000154,,2.1658,1.9711,2.2018,5.0,18.0,2.196,5.0,,2.1365,,2.15193,,2.197031,,2.196154,0.0,,1.969782,,,,2019_10,TX,,,,,,,10367.0,10367.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
print(main_df['file_year'].unique())

# Keep rows for file year 2019 and the configured HAC file month, as an
# independent copy so later edits never touch main_df. Both conditions are
# applied together so the year filter is not lost to a later reassignment.
in_scope = (main_df['file_year'] == '2019') & (main_df['file_month'] == hac_mo)
hac_df = main_df[in_scope].copy(deep=True)

# Drop columns that are empty for every remaining row.
hac_df = hac_df.dropna(how='all', axis=1)
hac_df.head()

['2019']


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,CAUTI Footnote,CLABSI Footnote,SSI Footnote,Total HAC Footnote,CDI Footnote,MRSA Footnote,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,file date,STATE,% Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/E,CAUTI simulated O,CAUTI simulated O/E,CAUTI expected O,CAUTI expected O/E,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI O/E,CLABSI simulated O,CLABSI simulated O/E,CLABSI expected O,CLABSI expected O/E,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/E,MRSA simulated O,MRSA simulated O/E,MRSA expected O,MRSA expected O/E,MRSA pi_opt,MRSA 
z_opt,MRSA SIS,CDI O/E,CDI simulated O,CDI simulated O/E,CDI expected O,CDI expected O/E,CDI pi_opt,CDI z_opt,CDI SIS
0,707152.0,448455.0,543174.0,4.0,670124,2019,543174.0,543174.0,5432.0,537742.0,5431.74,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,THE HOSPITALS OF PROVIDENCE HORIZON CITY CAMPUS,2019.0,TX,,10,No,0.0,5.0,0.0,0.0,0.0,0.003,0.0,0.0,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,62.0,62.0,0.0,0.0,0.001,0.011,0.0,0.0,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,2019_10,TX,,,8032.333333,5432.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,704940.0,34163505.0,41940876.0,2.0,670122,2019,41940876.0,41822732.0,418227.0,41404505.0,419408.76,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,HOUSTON METHODIST THE WOODLANDS HOSPITAL,2019.0,TX,,10,No,2119.0,1258.0,861.0,0.0,0.0,0.862,0.602,0.0,0.0,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,10424.0,8311.0,0.0,5.0,0.455,5.016,0.0,0.9968,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,2019_10,TX,,,451290.666667,418227.0,,,,,,,,,,,,,,,,,,,,,,,,,0.99681,0.0,0.0,1.326444,0.264443,0.000746,30520.044057,0.732368
2,704939.0,273288.0,297961.0,4.0,670121,2019,297961.0,297961.0,2980.0,294981.0,2979.61,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,SAINT CAMILLUS MEDICAL CENTER,2019.0,TX,,10,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,2019_10,TX,,,2980.0,2980.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,710001.0,11912495.0,15923070.0,4.0,670120,2019,15923070.0,15862213.0,158622.0,15703591.0,159230.7,06/30/2017,,10/01/2015,2017-12-31,1.1256,2016-01-01,THE HOSPITALS OF PROVIDENCE TRANSMOUNTAIN CAMPUS,2019.0,TX,1.1256,10,Yes,1646.0,1026.0,620.0,0.0,0.0,0.758,0.475,0.0,0.0,1.130864,Yes,1.0,-0.005264,,,5.0,5.0,5.0,,,5.0,8715.0,8715.0,0.0,4.0,0.253,3.281,0.0,1.2191,-0.005264,,,,1.1256,,5.0,,,5.0,,,,,,,,,105391.0,1.2191,1.130864,2019_10,TX,50.508108,53231.0,141113.25,158622.0,,,,,,,,,,,,,,,,,,,,,,,,,1.219141,2.0,0.60957,1.443517,0.439963,0.000746,30520.044057,0.779178
4,,,,,670119,2019,0.0,0.0,0.0,0.0,0.0,06/30/2017,,10/01/2015,2017-12-31,2.1342,2016-01-01,PROVIDENCE HOSPITAL OF NORTH HOUSTON LLC,2019.0,TX,2.1342,10,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,2.130279,Yes,1.0,0.003921,0.01387,0.004769,18.0,18.0,18.0,18.0,18.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.001318,-0.000154,,2.1658,1.9711,2.2018,5.0,18.0,2.196,5.0,,2.1365,,2.15193,,2.197031,,2.196154,0.0,,1.969782,2019_10,TX,,,10367.0,10367.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Attempt to reproduce HAC scores for 2019

In [9]:
# Working frame for the 2019 reproduction.
is_2019 = hac_df['file_year'].eq('2019')
df_yr = hac_df.loc[is_2019]
df_yr.head()

Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,CAUTI Footnote,CLABSI Footnote,SSI Footnote,Total HAC Footnote,CDI Footnote,MRSA Footnote,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,file date,STATE,% Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/E,CAUTI simulated O,CAUTI simulated O/E,CAUTI expected O,CAUTI expected O/E,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI O/E,CLABSI simulated O,CLABSI simulated O/E,CLABSI expected O,CLABSI expected O/E,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/E,MRSA simulated O,MRSA simulated O/E,MRSA expected O,MRSA expected O/E,MRSA pi_opt,MRSA 
z_opt,MRSA SIS,CDI O/E,CDI simulated O,CDI simulated O/E,CDI expected O,CDI expected O/E,CDI pi_opt,CDI z_opt,CDI SIS
0,707152.0,448455.0,543174.0,4.0,670124,2019,543174.0,543174.0,5432.0,537742.0,5431.74,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,THE HOSPITALS OF PROVIDENCE HORIZON CITY CAMPUS,2019.0,TX,,10,No,0.0,5.0,0.0,0.0,0.0,0.003,0.0,0.0,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,62.0,62.0,0.0,0.0,0.001,0.011,0.0,0.0,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,2019_10,TX,,,8032.333333,5432.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,704940.0,34163505.0,41940876.0,2.0,670122,2019,41940876.0,41822732.0,418227.0,41404505.0,419408.76,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,HOUSTON METHODIST THE WOODLANDS HOSPITAL,2019.0,TX,,10,No,2119.0,1258.0,861.0,0.0,0.0,0.862,0.602,0.0,0.0,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,10424.0,8311.0,0.0,5.0,0.455,5.016,0.0,0.9968,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,2019_10,TX,,,451290.666667,418227.0,,,,,,,,,,,,,,,,,,,,,,,,,0.99681,0.0,0.0,1.326444,0.264443,0.000746,30520.044057,0.732368
2,704939.0,273288.0,297961.0,4.0,670121,2019,297961.0,297961.0,2980.0,294981.0,2979.61,06/30/2017,,10/01/2015,2017-12-31,,2016-01-01,SAINT CAMILLUS MEDICAL CENTER,2019.0,TX,,10,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,5.0,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,5.0,5.0,,5.0,,,,,,,,,0.0,,,2019_10,TX,,,2980.0,2980.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,710001.0,11912495.0,15923070.0,4.0,670120,2019,15923070.0,15862213.0,158622.0,15703591.0,159230.7,06/30/2017,,10/01/2015,2017-12-31,1.1256,2016-01-01,THE HOSPITALS OF PROVIDENCE TRANSMOUNTAIN CAMPUS,2019.0,TX,1.1256,10,Yes,1646.0,1026.0,620.0,0.0,0.0,0.758,0.475,0.0,0.0,1.130864,Yes,1.0,-0.005264,,,5.0,5.0,5.0,,,5.0,8715.0,8715.0,0.0,4.0,0.253,3.281,0.0,1.2191,-0.005264,,,,1.1256,,5.0,,,5.0,,,,,,,,,105391.0,1.2191,1.130864,2019_10,TX,50.508108,53231.0,141113.25,158622.0,,,,,,,,,,,,,,,,,,,,,,,,,1.219141,2.0,0.60957,1.443517,0.439963,0.000746,30520.044057,0.779178
4,,,,,670119,2019,0.0,0.0,0.0,0.0,0.0,06/30/2017,,10/01/2015,2017-12-31,2.1342,2016-01-01,PROVIDENCE HOSPITAL OF NORTH HOUSTON LLC,2019.0,TX,2.1342,10,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,2.130279,Yes,1.0,0.003921,0.01387,0.004769,18.0,18.0,18.0,18.0,18.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.001318,-0.000154,,2.1658,1.9711,2.2018,5.0,18.0,2.196,5.0,,2.1365,,2.15193,,2.197031,,2.196154,0.0,,1.969782,2019_10,TX,,,10367.0,10367.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Generate Winsorized z-scores

In [10]:
hais = ['CAUTI', 'CLABSI', 'MRSA', 'CDI']

# Footnote codes as they occur in the data: numeric or string, with stray spaces.
footnote_18 = [18, '18', '18 ', ' 18']
footnote_4_or_5 = [5, '5', ' 5', '5 ', 4, '4', ' 4', '4 ']

for hai in hais:
    footnotes = df_yr[hai + ' Footnote']
    has_18 = footnotes.isin(footnote_18)
    has_4_or_5 = footnotes.isin(footnote_4_or_5)

    # Hospitals without footnote 18, 5 or 4: Winsorize the SIS and z-score it.
    # Only positions with a reported W Z score take part (see Winsorize_it / ZScore_it).
    # .copy() gives an independent frame, so the column assignments below are
    # not chained assignments on a selection of df_yr.
    tdf2 = df_yr[~(has_18 | has_4_or_5)].copy()
    reported_winZ = tdf2[hai + ' W Z Score'].tolist()
    sirs = tdf2[hai + ' SIS'].tolist()
    tdf2[hai + ' Winsorized SIS'] = Winsorize_it(sirs, reported_winZ)
    tdf2[hai + ' SIS W Z Score'] = ZScore_it(tdf2[hai + ' Winsorized SIS'], reported_winZ)

    # Assign maximum WinZ scores to hospitals with HAI footnote 18
    maxWinZ = np.nanmax(tdf2[hai + ' SIS W Z Score'])
    tdf3 = df_yr[has_18].copy()
    # Scalar assignment keeps a float column even when the subset is empty.
    tdf3[hai + ' Winsorized SIS'] = np.nan
    tdf3[hai + ' SIS W Z Score'] = maxWinZ

    # Hospitals with footnote 5 or 4 get no SIS-based score for this HAI.
    tdf4 = df_yr[has_4_or_5].copy()
    tdf4[hai + ' Winsorized SIS'] = np.nan
    tdf4[hai + ' SIS W Z Score'] = np.nan

    # Reassemble the full frame; the new columns carry into the next HAI.
    df_yr = pd.concat([tdf2, tdf3, tdf4], axis=0)

del tdf2, tdf3, tdf4

In [11]:
print('Results from attempting to reproduce Yes/No penalty assignments:\n')
print('Excluded from results below:')
print('1. MD hospitals')
print('2. Hospitals with payment reduction values other than Yes or No\n')

# A hospital is scored only when it is outside MD, has a Yes/No payment
# reduction, and has a reported Total HAC Score (this third condition is not
# listed in the printed summary above). Everything else is held out and
# merged back in after the comparison.
scorable = (
    (df_yr['State'] != 'MD')
    & df_yr['Payment Reduction'].isin(['Yes', 'No'])
    & df_yr['Total HAC Score'].notna()
)
holdout_df = df_yr[~scorable]
# .copy() so the new score column is added to an independent frame.
df_yr = df_yr[scorable].copy()

# Measures averaged (equal weights) into the Domain 2 score.
# NOTE(review): MRSA uses the 'derived' W Z score column while CDI, CAUTI and
# CLABSI use the 'SIS W Z Score' columns; 'MRSA SIS W Z Score' is computed in
# the Winsorizing cell but not used here -- confirm this is intentional.
m_ls = ['CDI SIS W Z Score', 'CAUTI SIS W Z Score', 'CLABSI SIS W Z Score', 'MRSA derived W Z Score', 'SSI W Z Score']

# Pull the needed columns once and walk the rows directly rather than
# re-filtering the whole frame for every hospital. One score is produced per
# row; the frame was reduced to one row per 'Facility ID' when it was loaded.
measure_matrix = df_yr[m_ls].to_numpy(dtype=float)
d1_values = df_yr['PSI-90 W Z Score'].to_numpy(dtype=float)

hac_scores = []
for measure_scores, d1 in zip(measure_matrix, d1_values):
    # Domain 2: mean of the measure scores that are present.
    available = [v for v in measure_scores if not np.isnan(v)]
    d2 = sum(available) / len(available) if available else np.nan

    if np.isnan(d1):
        # No Domain 1 score: the total rests entirely on Domain 2
        # (and is NaN when Domain 2 is missing as well).
        hac_scores.append(d2)
    elif np.isnan(d2):
        # Only Domain 1 is available.
        hac_scores.append(d1)
    else:
        # Both domains available: 15% Domain 1, 85% Domain 2.
        hac_scores.append(0.15*d1 + 0.85*d2)


print('holdout_df.shape:', holdout_df.shape)
df_yr['Total HAC Score (SIS-based)'] = hac_scores
print(df_yr.shape[0], 'hospitals in hac_df')


Results from attempting to reproduce Yes/No penalty assignments:

Excluded from results below:
1. MD hospitals
2. Hospitals with payment reduction values other than Yes or No

holdout_df.shape: (70, 119)
3204 hospitals in hac_df


In [12]:
# Hospitals whose SIS-based total HAC score is above the 75th percentile
# (NaNs ignored) receive a payment reduction.
p75 = np.nanpercentile(df_yr['Total HAC Score (SIS-based)'], 75)
print('p75:', p75)

# A missing score compares False against p75 and therefore maps to 'No'.
pr = ['Yes' if score > p75 else 'No'
      for score in df_yr['Total HAC Score (SIS-based)'].tolist()]
df_yr['Payment Reduction (SIS-based)'] = pr

o_list = df_yr['Payment Reduction'].tolist()
d_list = df_yr['Payment Reduction (SIS-based)'].tolist()

# 1 where the reported and SIS-based assignments agree, 0 where they differ.
res_ls = [1 if o == d else 0 for o, d in zip(o_list, d_list)]
same = sum(res_ls)
diff = len(res_ls) - same

# Direction of each disagreement.
p_to_np = sum(1 for o, d in zip(o_list, d_list) if o == 'Yes' and d == 'No')
np_to_p = sum(1 for o, d in zip(o_list, d_list) if o == 'No' and d == 'Yes')
if p_to_np + np_to_p != diff:
    # Some disagreement involves a value other than 'Yes' / 'No'.
    print('Error')

total = same + diff
pct_same = np.round(100 * same/total, 2) if total else np.nan

print(same, "SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.")
print(diff, "SIR-based penalty assignments were NOT the same as their corresponding SIS-based penalty assignments.")
print(str(pct_same) + '% of SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.\n')

print(p_to_np, 'hospitals were penalized but should not have been.')
print(np_to_p, 'hospitals were NOT penalized but should have been.')

p75: 0.3630933089722688
2937 SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.
267 SIR-based penalty assignments were NOT the same as their corresponding SIS-based penalty assignments.
91.67% of SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments."

133 hospitals were penalized but should not have been.
134 hospitals were NOT penalized but should have been.


In [13]:
# Fraction of scored hospitals with, then without, an SIS-based payment reduction.
n_hospitals = df_yr.shape[0]
for label in ['Yes', 'No']:
    tdf = df_yr[df_yr['Payment Reduction (SIS-based)'] == label]
    print(tdf.shape[0]/n_hospitals)


0.25
0.75


In [14]:
# Re-attach the held-out hospitals with an outer join on every column the
# two frames share; columns that exist only in df_yr are NaN for them.
print(df_yr.shape)
print(holdout_df.shape)
scored_cols = list(df_yr)
shared_cols = [col for col in list(holdout_df) if col in scored_cols]
df_yr = df_yr.merge(holdout_df, on=shared_cols, how='outer')
print(df_yr.shape)


(3204, 121)
(70, 119)
(3274, 121)


In [15]:
reported = df_yr['Payment Reduction']
sis_based = df_yr['Payment Reduction (SIS-based)']

# Penalized in the reported data but not under the SIS-based assignment.
p_np_df = df_yr[(reported == 'Yes') & (sis_based == 'No')]
ip_ = np.round(np.nansum(p_np_df['HAC penalty, final']))
print(p_np_df.shape[0])

# Not penalized in the reported data but penalized under the SIS-based assignment.
np_p_df = df_yr[(reported == 'No') & (sis_based == 'Yes')]
is_ = np.round(np.nansum(np_p_df['HAC penalty, final']))
print(np_p_df.shape[0])

print(ip_, 'dollars of inappropriate penalties')
print(is_, 'dollars of inappropriate hospital savings')
# Difference between the two totals.
print('', ip_ - is_)


133
134
106010254.0 dollars of inappropriate penalties
16456765.0 dollars of inappropriate hospital savings
 89553489.0


In [16]:
# Persist the final 2019 table (scored hospitals plus the held-out rows
# merged back in). Pickle protocol 5 requires Python 3.8+ to read.
df_yr.to_pickle('~/GitHub/HACRP-HAIs/data/finalized/final_2019.pkl', protocol=5)