In [5]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import pandas as pd
import warnings
import sys
import numpy as np
import scipy as sc
import random
from scipy import stats
from numpy import log10, sqrt

mydir = '/Users/kenlocey/GitHub/HACRP-HAIs/'
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def Winsorize_it(x, WinZs):
    
    x2 = []
    for i, val in enumerate(x):
        wz = float(WinZs[i])
        val = float(val)
        if np.isnan(wz) == True:
            x2.append(np.nan)
        else:
            x2.append(val)
    
    p5 = np.nanpercentile(x2, 5)
    p95 = np.nanpercentile(x2, 95)
    WinScores = []
    
    for i, val in enumerate(x2):
        wz = float(WinZs[i])
        val = float(val)
        
        if np.isnan(wz) == True:
            WinScores.append(np.nan)
            
        elif np.isnan(wz) == False:
            if val >= p5 and val <= p95:
                WinScores.append(val)
            elif val < p5:
                WinScores.append(p5)
            elif val > p95:
                WinScores.append(p95)
            elif np.isnan(val) == True:
                WinScores.append(np.nan)
        
    
    return WinScores
        

def ZScore_it(x, WinZs):
    
    x2 = []
    for i, val in enumerate(x):
        wz = float(WinZs[i])
        val = float(val)
        if np.isnan(wz) == True:
            x2.append(np.nan)
        else:
            x2.append(val)
    
    x2 = np.array(x2)
    avg = np.nanmean(x2)
    std = np.nanstd(x2)
    zscores = (x2 - avg) / std
    return zscores


hac_mo = '10'

## Load HAC file

In [6]:
cauti_file =  '/Users/kenlocey/GitHub/HACRP-HAIs/4_optimize_random_sampling_models/optimized_by_HAI_file_date/CAUTI/CAUTI_Data_opt_for_SIRs_2018_10.pkl'
clabsi_file = '/Users/kenlocey/GitHub/HACRP-HAIs/4_optimize_random_sampling_models/optimized_by_HAI_file_date/CLABSI/CLABSI_Data_opt_for_SIRs_2018_10.pkl'

mrsa_file = '/Users/kenlocey/GitHub/HACRP-HAIs/4_optimize_random_sampling_models/optimized_by_HAI_file_date/MRSA/MRSA_Data_opt_for_SIRs_2018_10.pkl'
cdi_file = '/Users/kenlocey/GitHub/HACRP-HAIs/4_optimize_random_sampling_models/optimized_by_HAI_file_date/CDI/CDI_Data_opt_for_SIRs_2018_10.pkl'


tdf, tdf1, tdf2 = 0, 0, 0
cauti_df, clabsi_df = 0, 0
mrsa_df, cdi_df = 0, 0

cauti_df = pd.read_pickle(cauti_file)
cauti_df['CAUTI SIS'] = (cauti_df['CAUTI Observed Cases'] - cauti_df['expected O'])/cauti_df['CAUTI Predicted Cases']
cauti_df.rename(columns={
        'O/E': 'CAUTI O/P',
        'simulated O': 'CAUTI simulated O',
        'simulated O/E': 'CAUTI simulated O/P',
        'expected O': 'CAUTI expected O',
        'expected O/E': 'CAUTI expected O/P',
        'pi_opt': 'CAUTI pi_opt',
        'z_opt': 'CAUTI z_opt',
    }, inplace=True)
cauti_sis_deciles = np.nanpercentile(cauti_df['CAUTI SIS'], np.arange(0, 100, 10))
cauti_rand_sir_deciles = np.nanpercentile(cauti_df['CAUTI expected O/P'], np.arange(0, 100, 10))

clabsi_df = pd.read_pickle(clabsi_file)
clabsi_df['CLABSI SIS'] = (clabsi_df['CLABSI Observed Cases'] - clabsi_df['expected O'])/clabsi_df['CLABSI Predicted Cases']
clabsi_df.rename(columns={
        'O/E': 'CLABSI O/P',
        'simulated O': 'CLABSI simulated O',
        'simulated O/E': 'CLABSI simulated O/P',
        'expected O': 'CLABSI expected O',
        'expected O/E': 'CLABSI expected O/P',
        'pi_opt': 'CLABSI pi_opt',
        'z_opt': 'CLABSI z_opt',
    }, inplace=True)
clabsi_sis_deciles = np.nanpercentile(clabsi_df['CLABSI SIS'], np.arange(0, 100, 10))
clabsi_rand_sir_deciles = np.nanpercentile(clabsi_df['CLABSI expected O/P'], np.arange(0, 100, 10))




mrsa_df = pd.read_pickle(mrsa_file)
mrsa_df['MRSA SIS'] = (mrsa_df['MRSA Observed Cases'] - mrsa_df['expected O'])/mrsa_df['MRSA Predicted Cases']
mrsa_df.rename(columns={
        'O/E': 'MRSA O/P',
        'simulated O': 'MRSA simulated O',
        'simulated O/E': 'MRSA simulated O/P',
        'expected O': 'MRSA expected O',
        'expected O/E': 'MRSA expected O/P',
        'pi_opt': 'MRSA pi_opt',
        'z_opt': 'MRSA z_opt',
    }, inplace=True)
tdf = mrsa_df[mrsa_df['MRSA Predicted Cases'] >= 1]
mrsa_sis_deciles = np.nanpercentile(tdf['MRSA SIS'], np.arange(0, 100, 10))    
mrsa_rand_sir_deciles = np.nanpercentile(mrsa_df['MRSA expected O/P'], np.arange(0, 100, 10))


cdi_df = pd.read_pickle(cdi_file)
cdi_df['CDI SIS'] = (cdi_df['CDI Observed Cases'] - cdi_df['expected O'])/cdi_df['CDI Predicted Cases']
cdi_df.rename(columns={
        'O/E': 'CDI O/P',
        'simulated O': 'CDI simulated O',
        'simulated O/E': 'CDI simulated O/P',
        'expected O': 'CDI expected O',
        'expected O/E': 'CDI expected O/P',
        'pi_opt': 'CDI pi_opt',
        'z_opt': 'CDI z_opt',
    }, inplace=True)
tdf = cdi_df[cdi_df['CDI Predicted Cases'] >= 1]
cdi_sis_deciles = np.nanpercentile(tdf['CDI SIS'], np.arange(0, 100, 10))    
cdi_rand_sir_deciles = np.nanpercentile(cdi_df['CDI expected O/P'], np.arange(0, 100, 10))


ls1 = list(cauti_df)
ls2 = list(clabsi_df)
ls = list(filter(lambda x:x in ls1, ls2))
df_yr = cauti_df.merge(clabsi_df, on=ls, how='outer')
    
ls1 = list(df_yr)
ls2 = list(mrsa_df)
ls = list(filter(lambda x:x in ls1, ls2))
df_yr = df_yr.merge(mrsa_df, on=ls, how='outer')

ls1 = list(df_yr)
ls2 = list(cdi_df)
ls = list(filter(lambda x:x in ls1, ls2))
df_yr = df_yr.merge(cdi_df, on=ls, how='outer')


del tdf
del cauti_df, clabsi_df
del mrsa_df, cdi_df

df_yr.sort_values(by=['Facility ID', 'Line 19'], inplace=True, ascending=False)
df_yr.drop_duplicates(subset=['Facility ID'], inplace=True, keep='first')

print(len(df_yr['Facility ID'].unique()), 'hospitals resulting from merging the above dataframes.')

df_yr.head()


3091 hospitals resulting from merging the above dataframes.


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,CAUTI derived score,CLABSI derived score,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,AHRQ PSI-90 Footnote,CAUTI Footnote,CLABSI Footnote,Domain 1 Score Footnote,Domain 2 Score Footnote,SSI Footnote,SSI Score,Total HAC Footnote,CDI Footnote,CDI Score,MRSA Footnote,MRSA Score,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,MRSA derived score,CDI derived score,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final",CAUTI O/P,CAUTI simulated O,CAUTI simulated O/P,CAUTI expected O,CAUTI expected O/P,CAUTI pi_opt,CAUTI z_opt,CAUTI SIS,CLABSI O/P,CLABSI simulated O,CLABSI simulated O/P,CLABSI expected O,CLABSI expected O/P,CLABSI pi_opt,CLABSI z_opt,CLABSI SIS,MRSA O/P,MRSA simulated O,MRSA simulated O/P,MRSA expected O,MRSA expected O/P,MRSA pi_opt,MRSA z_opt,MRSA SIS,CDI O/P,CDI simulated O,CDI simulated O/P,CDI expected O,CDI expected O/P,CDI pi_opt,CDI z_opt,CDI SIS
2862,717770.0,9452916.0,9259277.0,1.0,670108,2018,9259277.0,10336557.0,103366.0,10233191.0,92592.77,,,,,09/30/2015,,07/01/2014,2016-12-31,0.0676,2015-01-01,BAYLOR SCOTT & WHITE MEDICAL CENTER - MARBLE F...,2018.0,TX,0.0676,10,No,1095.0,759.0,336.0,0.0,0.0,0.478,0.306,0.0,0.0,,,0.236243,No,1.0,-0.168643,,,,5.0,5.0,,,5.0,,,,,5.0,,6554.0,6346.0,0.0,3.0,0.203,3.286,0.0,0.913,,,-0.168643,,,,0.0676,,5.0,,,5.0,,,,,,,,,0.0,0.913,0.236243,,,,2018_10,TX,,,,,,,104701.25,103366.0,,,,,,,,,,,,,,,,,,,,,,,,,0.912964,2.0,0.608643,0.912216,0.277607,0.000847,31050.468443,0.635357
2863,724392.0,6729999.0,8235431.0,4.0,670106,2018,8235431.0,8229623.0,82296.0,8147327.0,82354.31,,,,,09/30/2015,,07/01/2014,2016-12-31,0.9134,2015-01-01,PEARLAND MEDICAL CENTER,2018.0,TX,0.9134,10,Yes,1390.0,1012.0,378.0,1.0,1.0,0.929,0.265,1.0764,3.7736,,,0.844586,Yes,1.0,0.068814,,,,5.0,5.0,,,,,,,,5.0,,7956.0,7024.0,1.0,6.0,0.178,3.423,5.618,1.7528,,,0.137527,,,,1.9601,,5.0,,,5.0,,-0.1334,,,,,,,82296.0,1.49917,1.822573,,,,2018_10,TX,,,,,0.0,0.0,83461.833333,82296.0,,,,,,,,,,,,,,,,,,,,,,,,,1.752848,1.0,0.292141,1.097649,0.320669,0.000847,31050.468443,1.43218
0,725556.0,7758997.0,9427696.0,4.0,670103,2018,9427696.0,9414209.0,94142.0,9320067.0,94276.96,,,,,09/30/2015,,07/01/2014,2016-12-31,-0.1059,2015-01-01,MEDICAL CITY ALLIANCE,2018.0,TX,-0.1059,10,No,3550.0,1266.0,2284.0,0.0,0.0,1.044,2.338,0.0,0.0,,,-0.055524,No,1.0,-0.050376,,0.033588,,5.0,,,,,,,,,5.0,,14597.0,12495.0,2.0,6.0,0.474,6.868,4.2194,0.8736,,,-0.184616,,,,-0.055,-1.511,5.0,,,5.0,,1.2484,,,0.0,-1.544588,,,0.0,0.8736,0.129616,,,,2018_10,TX,,,,,,,86255.333333,94142.0,0.0,0.0,0.0,0.202551,0.194014,0.001498,10591.061951,-0.194014,0.0,3.0,1.283148,0.802628,0.343297,0.001043,4493.848073,-0.343297,,,,,,,,,0.873617,4.0,0.582411,3.03709,0.442209,0.000847,31050.468443,0.431408
1,683243.0,11142120.0,13021669.0,4.0,670098,2018,13021669.0,12937720.0,129377.0,12808343.0,130216.69,,,,,09/30/2015,0.3178,07/01/2014,2016-12-31,-0.1973,2015-01-01,RESOLUTE HEALTH HOSPITAL,2018.0,TX,-0.12,10,No,6256.0,4121.0,2135.0,1.0,1.0,4.011,1.955,0.2493,0.5115,,,-0.065821,No,1.0,-0.054179,0.019382,-0.076542,,,,,,,,,,,,,28277.0,24358.0,2.0,18.0,1.324,15.347,1.5106,1.1729,,,-0.230901,-0.030844,,-0.8939,0.7087,-0.3626,,,1.0485,,0.3178,-1.4872,0.2493,-0.913282,0.5115,-0.286058,1.5106,1.079344,0.0,1.1729,0.939601,,,,2018_10,TX,,,,,,,129806.4,129377.0,0.249314,2.0,0.498629,1.72972,0.431244,0.001498,10591.061951,-0.18193,0.511509,1.0,0.511509,0.717087,0.366796,0.001043,4493.848073,0.144713,1.510574,1.0,0.755287,0.565751,0.427305,8.9e-05,98139.69381,1.083269,1.172868,7.0,0.456115,9.070584,0.591033,0.000847,31050.468443,0.581835
2,,,,,670096,2018,0.0,0.0,0.0,0.0,0.0,,,,,09/30/2015,0.4497,07/01/2014,2016-12-31,-0.7581,2015-01-01,"BAY AREA REGIONAL MEDICAL CENTER, LLC",2018.0,TX,-0.5769,10,No,13813.0,7603.0,6210.0,1.0,0.0,7.321,6.034,0.1366,0.0,,,-0.559836,No,1.0,-0.017064,-0.002587,0.033588,,,,,,5.0,,,,,5.0,,21855.0,28995.0,0.0,16.0,0.727,15.666,0.0,1.0213,,,-0.091331,,,-1.2013,0.438,-1.511,,,,,0.4497,,0.1366,-1.198713,0.0,-1.544588,,,0.0,1.0213,0.529331,,,,2018_10,TX,,,,,,,273845.0,273845.0,0.136593,4.0,0.546373,4.76085,0.650301,0.001498,10591.061951,-0.513707,0.0,2.0,0.331455,3.757138,0.622661,0.001043,4493.848073,-0.622661,,,,,,,,,1.02132,12.0,0.76599,11.860258,0.75707,0.000847,31050.468443,0.26425


In [7]:
main_df = pd.read_pickle('~/GitHub/HACRP-HAIs/data/Compiled_HCRIS-HACRP-HAI-RAND/Compiled_HCRIS-HACRP-HAI-RAND.pkl')
main_df = main_df[main_df['file_year'] == '2018']
main_df.sort_values(by='Facility ID', inplace=True)
print('Initial no. of hospitals in main_df:', len(main_df['Facility ID'].unique()))

main_df = main_df[(~main_df['Total HAC Score'].isin([float("NaN"), np.nan]))]
print('No. of hospitals in main_df having real HAC scores:', len(main_df['Facility ID'].unique()))

main_df = main_df[main_df['State'] != 'MD']
print('No. of these hospitals not in Maryland:', len(main_df['Facility ID'].unique()))

main_df = main_df[(main_df['CAUTI Predicted Cases'].isin([np.nan, float('NaN')])) | (main_df['CAUTI Predicted Cases'] < 1)]
main_df = main_df[(main_df['CLABSI Predicted Cases'].isin([np.nan, float('NaN')])) | (main_df['CLABSI Predicted Cases'] < 1)]
main_df = main_df[(main_df['MRSA Predicted Cases'].isin([np.nan, float('NaN')])) | (main_df['MRSA Predicted Cases'] < 1)]
main_df = main_df[(main_df['CDI Predicted Cases'].isin([np.nan, float('NaN')])) | (main_df['CDI Predicted Cases'] < 1)]
main_df = main_df[~main_df['PSI-90 W Z Score'].isin([np.nan, float('NaN')])]
print('No. of these hospitals without legit predicted cases for CAUTI and CLABSI:', len(main_df['Facility ID'].unique()))

main_df.head()


Initial no. of hospitals in main_df: 3300
No. of hospitals in main_df having real HAC scores: 3217
No. of these hospitals not in Maryland: 3170
No. of these hospitals without legit predicted cases for CAUTI and CLABSI: 138


Unnamed: 0,RPT_REC_NUM,IPPS interim payment (E_A_HOS_C1_72),IPPS payment (E_A_HOS_C1_59),PRVDR_CTRL_TYPE_CD,Facility ID,file_year,Line 19,Reconstructed IPPS payment (pre HAC penalty),Reconstructed HAC penalty,Reconstructed IPPS payment (post HAC penalty),HAC penalty imputed from E_A_HOS_C1_59,IPPS payment (from RAND),AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility Name,Fiscal Year,State,Total HAC Score,file_month,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,CAUTI derived score,CLABSI derived score,Total HAC Score (derived),Payment Reduction (derived),Payment Reduction Reproduced?,HAC delta,CAUTI delta,CLABSI delta,AHRQ PSI-90 Footnote,CAUTI Footnote,CLABSI Footnote,Domain 1 Score Footnote,Domain 2 Score Footnote,SSI Footnote,SSI Score,Total HAC Footnote,CDI Footnote,CDI Score,MRSA Footnote,MRSA Score,MRSA patient days,CDI patient days,MRSA Observed Cases,CDI Observed Cases,MRSA Predicted Cases,CDI Predicted Cases,MRSA derived SIR,CDI derived SIR,MRSA derived score,CDI derived score,CDI delta,MRSA delta,Payment Reduction Footnote,CAUTI W Z Score,CDI W Z Score,CLABSI W Z Score,Domain 1 Footnote,Domain 2 Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,SSI W Z Score,CAUTI derived Winsorized SIR,CAUTI derived W Z Score,CLABSI derived Winsorized SIR,CLABSI derived W Z Score,MRSA derived Winsorized SIR,MRSA derived W Z Score,HAC penalty (E_A_HOS_C1_7099),CDI derived Winsorized SIR,CDI derived W Z Score,PSI-90 End Date,PSI-90 Start Date,HAC penalty (imputed from RAND),file date,STATE,Dollar difference in payments (RAND vs E_A_HOS_C1_59),% Difference in payments (RAND vs E_A_HOS_C1_59),% Error in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),Dollar difference in penalties (E_A_HOS_C1_7099 vs Imputed from RAND),% Error in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),Dollar difference in penalties (E_A_HOS_C1_7099 vs Reconstructed HAC penalty),avg_Reconstructed HAC penalty,"HAC penalty, final"
3325,723898,1431335.0,1410804.0,4,10047,2018,1410804.0,1726692.0,17267.0,1709425.0,14108.04,,,,,09/30/2015,-0.0438,07/01/2014,2016-12-31,,2015-01-01,GEORGIANA MEDICAL CENTER,2018.0,AL,-0.0438,10,No,0.0,547.0,0.0,0.0,0.0,0.648,0.0,0.0,,,,-0.0438,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,3553.0,3553.0,0.0,0.0,0.09,0.986,0.0,0.0,,,,,,,,,,5.0,,,-0.0438,,,,,,,,0.0,,,,,,2018_10,AL,,,,,,,14732.0,17267.0
3336,723899,431491.0,502519.0,9,10051,2018,502519.0,627573.0,6276.0,621297.0,5025.19,,,,,09/30/2015,-0.013,07/01/2014,2016-12-31,,2015-01-01,GREENE COUNTY HOSPITAL,2018.0,AL,-0.013,10,No,0.0,39.0,0.0,0.0,0.0,0.065,0.0,0.0,,,,-0.013,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,1611.0,1631.0,0.0,2.0,0.044,0.654,0.0,3.0581,,,,,,,,,,5.0,,,-0.013,,,,,,,,0.0,,,,,,2018_10,AL,,,,,,,8093.285714,6276.0
3491,723510,488380.0,454932.0,8,10102,2018,454932.0,568665.0,5687.0,562978.0,4549.32,,,,,09/30/2015,-0.0033,07/01/2014,2016-12-31,,2015-01-01,J PAUL JONES HOSPITAL,2018.0,AL,-0.0033,10,No,105.0,100.0,5.0,0.0,0.0,0.142,0.005,0.0,0.0,,,-0.0033,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,5.0,,,-0.0033,,,,,,,,0.0,,,,,,2018_10,AL,,,,,,,5050.142857,5687.0
3691,725187,4389874.0,3671875.0,7,20018,2018,3671875.0,4564273.0,45643.0,4518630.0,36718.75,,,,,09/30/2015,-0.0105,07/01/2014,2016-12-31,,2015-01-01,YUKON KUSKOKWIM DELTA REG HOSPITAL,2018.0,AK,-0.0105,10,No,0.0,152.0,0.0,0.0,0.0,0.14,0.0,0.0,,,,-0.0105,No,1.0,0.0,,,,5.0,5.0,,,5.0,,,5.0,,5.0,,2965.0,1777.0,0.0,1.0,0.082,0.728,0.0,1.3736,,,,,,,,,,5.0,,,-0.0105,,,,,,,,0.0,,,,,,2018_10,AK,,,,,,,40997.857143,45643.0
3866,717859,468796.0,432272.0,7,30074,2018,432272.0,540340.0,5403.0,534937.0,4322.72,,,,,09/30/2015,0.0017,07/01/2014,2016-12-31,2.1265,2015-01-01,SELLS HOSPITAL,2018.0,AZ,1.8078,10,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,1.749703,Yes,1.0,0.058097,0.0991,0.072743,,18.0,18.0,,,18.0,,,18.0,,18.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.137527,0.032259,,2.1498,1.9601,2.1946,,18.0,2.2236,,0.0017,2.1044,,2.0507,,2.121857,,2.191341,5403.0,,1.822573,,,,2018_10,AZ,,,,,0.0,0.0,4969.625,5403.0


In [8]:
ls1 = list(df_yr)
ls2 = list(main_df)
ls = list(filter(lambda x:x in ls1, ls2))
df_yr = df_yr.merge(main_df, on=ls, how='outer')

print(df_yr.shape)
print(len(df_yr['Facility ID'].unique()), 'hospitals resulting from merging the above dataframes.')
print('This is NOT the same number as the number of hospitals having real HAC scores.')
print('Instead, it is the same as the number of hospitals with legit scores for PSI-90 or one or more HAIs.')


(3229, 132)
3229 hospitals resulting from merging the above dataframes.
This is NOT the same number as the number of hospitals having real HAC scores.
Instead, it is the same as the number of hospitals with legit scores for PSI-90 or one or more HAIs.


## Generate Winsorized z-scores for SIS and random-based SIRs

In [9]:
hais = ['CAUTI', 'CLABSI', 'MRSA', 'CDI']
   
for i, hai in enumerate(hais):
    tdf2 = df_yr[~df_yr[hai + ' Footnote'].isin([18, '18', '18 ', ' 18', 
                                              5, '5', ' 5', '5 ',
                                              4, '4', ' 4', '4 ',
                                              ])]
      
    reported_winZ = tdf2[hai + ' W Z Score'].tolist()
    sis = tdf2[hai + ' SIS'].tolist()
    tdf2[hai + ' Winsorized SIS'] = Winsorize_it(sis, reported_winZ)
    tdf2[hai + ' SIS W Z Score'] = ZScore_it(tdf2[hai + ' Winsorized SIS'], reported_winZ)
    
    sir = tdf2[hai + ' expected O/P'].tolist()
    tdf2[hai + ' Winsorized random-based'] = Winsorize_it(sis, reported_winZ)
    tdf2[hai + ' random-based W Z Score'] = ZScore_it(tdf2[hai + ' Winsorized random-based'], reported_winZ)
    
    
    # Assign maximum WinZ scores to hospitals with HAI footnote 18 
    maxWinZ = np.nanmax(tdf2[hai + ' SIS W Z Score'])
    tdf3 = df_yr[df_yr[hai + ' Footnote'].isin([18, '18', '18 ', ' 18'])]
    tdf3[hai + ' Winsorized SIS'] = [np.nan]*tdf3.shape[0]
    tdf3[hai + ' SIS W Z Score'] = [maxWinZ]*tdf3.shape[0]
    
    maxWinZ = np.nanmax(tdf2[hai + ' random-based W Z Score'])
    tdf3 = df_yr[df_yr[hai + ' Footnote'].isin([18, '18', '18 ', ' 18'])]
    tdf3[hai + ' Winsorized random-based'] = [np.nan]*tdf3.shape[0]
    tdf3[hai + ' random-based W Z Score'] = [maxWinZ]*tdf3.shape[0]
    
    
    tdf4 = df_yr[df_yr[hai + ' Footnote'].isin([5, '5', ' 5', '5 ', 4, '4', ' 4', '4 '])]
    tdf4[hai + ' Winsorized SIS'] = [np.nan]*tdf4.shape[0]
    tdf4[hai + ' SIS W Z Score'] = [np.nan]*tdf4.shape[0]
    
    tdf4[hai + ' Winsorized random-based'] = [np.nan]*tdf4.shape[0]
    tdf4[hai + ' random-based W Z Score'] = [np.nan]*tdf4.shape[0]
    
    df_yr = pd.concat([tdf2, tdf3, tdf4], axis=0)

del tdf2
del tdf3

In [10]:
print('Results from attempting to reproduce Yes/No penalty assignments:\n')
print('Excluded from results below:')
print('1. MD hospitals')
print('2. Hospitals with payment reduction values other than Yes or No\n')

holdout_df = df_yr[(df_yr['State'] == 'MD') | ~df_yr['Payment Reduction'].isin(['Yes', 'No']) | (df_yr['Total HAC Score'].isin([float("NaN"), np.nan]))]
df_yr = df_yr[(df_yr['State'] != 'MD') & (df_yr['Payment Reduction'].isin(['Yes', 'No'])) & (~df_yr['Total HAC Score'].isin([float("NaN"), np.nan]))]


Results from attempting to reproduce Yes/No penalty assignments:

Excluded from results below:
1. MD hospitals
2. Hospitals with payment reduction values other than Yes or No



## Generate SIS-based scores

In [11]:
hac_scores = []
ct1 = 0
ct2 = 0

for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    d1 = 0
    d2 = 0
        
    w_ls = []
    sum_ls = []

    m_ls = ['CDI SIS W Z Score', 'CAUTI SIS W Z Score', 'CLABSI SIS W Z Score', 'MRSA SIS W Z Score', 'SSI W Z Score']
                
    s = 0
    w = 0
    for m in m_ls:
        v = tdf[m].tolist()
        if len(list(set(v))) > 1:
            if np.isnan(v[0]) == True:
                v = [np.nan]
            
        else:
            v = tdf[m].iloc[0]

        if np.isnan(v) == False: 
            s += v
            w += 1

    if w == 0:
        d2 = np.nan
    else:
        d2 = s/w
        
    d1 = float(tdf['PSI-90 W Z Score'].iloc[0])
    d2_o = float(tdf['Domain 2 Score'].iloc[0])
    state = tdf['State'].iloc[0]

    if np.isnan(d1) == True: 
        # if no score for Domain 1, then total HAC score will be based entirely on Domain 2
        if np.isnan(d2) == True:
            hac_scores.append(np.nan)
        
        elif np.isnan(d2) == False:
            hac_scores.append(d2)
        
    elif np.isnan(d1) == False:
        # if there is a score for Domain 1 ...
        
        # If the derived score is NaN
        if np.isnan(d2) == True:
            hac_scores.append(d1)
        
        # If the derived score is a float
        elif np.isnan(d2) == False:
            hac_scores.append(0.15*d1 + 0.85*d2)
    else:
        print(d1, ',', d2, ',', d2_o)
        break


print('holdout_df.shape:', holdout_df.shape)
df_yr['Total HAC Score (SIS-based)'] = hac_scores
print(df_yr.shape[0], 'hospitals in hac_df')




p75 = np.nanpercentile(df_yr['Total HAC Score (SIS-based)'], 75)
print('p75:', p75)

pr = []
for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    score = tdf['Total HAC Score (SIS-based)'].iloc[0]

    if np.isnan(score) == True:
        pr.append('No')
    elif score <= p75:
        pr.append('No')
    elif score > p75:
        pr.append('Yes')
    else:
        print('This score is an error:', score)
        sys.exit()

df_yr['Payment Reduction (SIS-based)'] = pr
    
o_list = df_yr['Payment Reduction'].tolist()
d_list = df_yr['Payment Reduction (SIS-based)'].tolist()

same = 0
diff = 0
p_to_np = 0
np_to_p = 0

res_ls = []
for i, o in enumerate(o_list):
    if o == d_list[i]:
        same += 1
        res_ls.append(1)
    else:
        diff += 1
        if o == 'Yes' and d_list[i] == 'No':
            p_to_np += 1
        elif o == 'No' and d_list[i] == 'Yes':
            np_to_p += 1
        else:
            print('Error')
            break
        res_ls.append(0)
               
print(same, "SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.")
print(diff, "SIR-based penalty assignments were NOT the same as their corresponding SIS-based penalty assignments.")
print(str(np.round(100 * same/(same+diff),2)) + '% of SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments."\n')

print(p_to_np, 'hospitals were penalized but should not have been.')
print(np_to_p, 'hospitals were NOT penalized but should have been.')

holdout_df.shape: (76, 148)
3153 hospitals in hac_df
p75: 0.36050504914460335
2831 SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments.
322 SIR-based penalty assignments were NOT the same as their corresponding SIS-based penalty assignments.
89.79% of SIR-based penalty assignments were the same as their corresponding SIS-based penalty assignments."

133 hospitals were penalized but should not have been.
189 hospitals were NOT penalized but should have been.


## Generate scores based on random expectations

In [12]:
hac_scores = []
ct1 = 0
ct2 = 0

for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    d1 = 0
    d2 = 0
        
    w_ls = []
    sum_ls = []

    m_ls = ['CDI random-based W Z Score', 'CAUTI random-based W Z Score', 'CLABSI random-based W Z Score', 'MRSA random-based W Z Score', 'SSI W Z Score']
                
    s = 0
    w = 0
    for m in m_ls:
        v = tdf[m].tolist()
        if len(list(set(v))) > 1:
            if np.isnan(v[0]) == True:
                v = [np.nan]
            
        else:
            v = tdf[m].iloc[0]

        if np.isnan(v) == False: 
            s += v
            w += 1

    if w == 0:
        d2 = np.nan
    else:
        d2 = s/w
        
    d1 = float(tdf['PSI-90 W Z Score'].iloc[0])
    d2_o = float(tdf['Domain 2 Score'].iloc[0])
    state = tdf['State'].iloc[0]

    if np.isnan(d1) == True: 
        # if no score for Domain 1, then total HAC score will be based entirely on Domain 2
        if np.isnan(d2) == True:
            hac_scores.append(np.nan)
        
        elif np.isnan(d2) == False:
            hac_scores.append(d2)
        
    elif np.isnan(d1) == False:
        # if there is a score for Domain 1 ...
        
        # If the derived score is NaN
        if np.isnan(d2) == True:
            hac_scores.append(d1)
        
        # If the derived score is a float
        elif np.isnan(d2) == False:
            hac_scores.append(0.15*d1 + 0.85*d2)
    else:
        print(d1, ',', d2, ',', d2_o)
        break


print('holdout_df.shape:', holdout_df.shape)
df_yr['Total HAC Score (random-based)'] = hac_scores
print(df_yr.shape[0], 'hospitals in hac_df')




p75 = np.nanpercentile(df_yr['Total HAC Score (random-based)'], 75)
print('p75:', p75)

pr = []
for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    score = tdf['Total HAC Score (random-based)'].iloc[0]

    if np.isnan(score) == True:
        pr.append('No')
    elif score <= p75:
        pr.append('No')
    elif score > p75:
        pr.append('Yes')
    else:
        print('This score is an error:', score)
        sys.exit()

df_yr['Payment Reduction (random-based)'] = pr


holdout_df.shape: (76, 148)
3153 hospitals in hac_df
p75: 0.3666369336771151


In [13]:
## SIS-based

tdf1 = df_yr[df_yr['Payment Reduction (SIS-based)'] == 'Yes']
tdf2 = df_yr[df_yr['Payment Reduction (SIS-based)'].isin(['Yes', 'No'])]
print(tdf1.shape[0]/tdf2.shape[0])

tdf1 = df_yr[df_yr['Payment Reduction (SIS-based)'] == 'No']
print(tdf1.shape[0]/tdf2.shape[0])
print(tdf2.shape[0], len(tdf2['Facility ID'].unique()))
print('\n')

## Random-based 
tdf1 = df_yr[df_yr['Payment Reduction (random-based)'] == 'Yes']
tdf2 = df_yr[df_yr['Payment Reduction (random-based)'].isin(['Yes', 'No'])]
print(tdf1.shape[0]/tdf2.shape[0])

tdf1 = df_yr[df_yr['Payment Reduction (random-based)'] == 'No']
print(tdf1.shape[0]/tdf2.shape[0])
print(tdf2.shape[0], len(tdf2['Facility ID'].unique()))
print('\n')

## Original payment reduction
tdf1 = df_yr[df_yr['Payment Reduction'] == 'Yes']
tdf2 = df_yr[df_yr['Payment Reduction'].isin(['Yes', 'No'])]
print(tdf1.shape[0]/tdf2.shape[0])

tdf1 = df_yr[df_yr['Payment Reduction'] == 'No']
print(tdf1.shape[0]/tdf2.shape[0])
print(df_yr.shape[0], len(df_yr['Facility ID'].unique()))

0.2499207104345068
0.7500792895654932
3153 3153


0.2499207104345068
0.7500792895654932
3153 3153


0.23215984776403426
0.7678401522359658
3153 3153


In [14]:
ls1 = list(df_yr)
ls2 = list(holdout_df)
ls = list(filter(lambda x:x in ls1, ls2))
print(df_yr.shape)
print(holdout_df.shape)
df_yr = df_yr.merge(holdout_df, how='outer', on=ls)
print(df_yr.shape)

(3153, 152)
(76, 148)
(3229, 152)


In [15]:
p_np_df = df_yr[(df_yr['Payment Reduction'] == 'Yes') & (df_yr['Payment Reduction (SIS-based)'] == 'No')]
ip_ = np.round(np.nansum(p_np_df['HAC penalty, final']))
print(p_np_df.shape[0])

np_p_df = df_yr[(df_yr['Payment Reduction'] == 'No') & (df_yr['Payment Reduction (SIS-based)'] == 'Yes')]
is_ = np.round(np.nansum(np_p_df['HAC penalty, final']))
print(np_p_df.shape[0])

print(ip_, 'dollars of inappropriate penalties')
print(is_, 'dollars of inappropriate hospital savings')
print('', ip_ - is_)


133
189
95937194.0 dollars of inappropriate penalties
39163235.0 dollars of inappropriate hospital savings
 56773959.0


In [16]:
df_yr.to_pickle('~/GitHub/HACRP-HAIs/data/finalized/final_2018.pkl', protocol=5)

#        