In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import pandas as pd
import warnings
import sys
import numpy as np
import scipy as sc
import random
from scipy import stats
from numpy import log10, sqrt

mydir = '/Users/kenlocey/GitHub/HACRP-HAIs/'
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def def_display_df_misses(df):
    df = df[df['Payment Reduction Reproduced?'] == 0]
    items = ['file_year', 'HAI Measures End Date', 'HAI Measures Start Date',
             'CAUTI Footnote', 'CAUTI W Z Score', 'CAUTI SIR W Z Score',
             'CDI Footnote', 'CDI W Z Score', 'CDI SIR W Z Score',
             'CLABSI Footnote', 'CLABSI W Z Score', 'CLABSI SIR W Z Score',
             'MRSA Footnote', 'MRSA W Z Score', 'MRSA SIR W Z Score',
             'SSI Footnote', 'SSI W Z Score', 'SSI SIR W Z Score',
             'PSI-90 Footnote', 'PSI-90 W Z Score', 'PSI-90 SIR W Z Score',
             'Total HAC Footnote', 'Total HAC Score', 'Total HAC Score (derived)',
             'Payment Reduction Footnote', 'Payment Reduction', 
             'Payment Reduction (derived)', 'Payment Reduction Reproduced?',
             ]
    return df.filter(items=items)


def Winsorize_it(x, WinZs):
    
    x2 = []
    for i, val in enumerate(x):
        wz = float(WinZs[i])
        val = float(val)
        if np.isnan(wz) == True:
            x2.append(np.nan)
        else:
            x2.append(val)
    
    p5 = np.nanpercentile(x2, 5)
    p95 = np.nanpercentile(x2, 95)
    WinScores = []
    
    for i, val in enumerate(x2):
        wz = float(WinZs[i])
        val = float(val)
        
        if np.isnan(wz) == True:
            WinScores.append(np.nan)
            
        elif np.isnan(wz) == False:
            if val >= p5 and val <= p95:
                WinScores.append(val)
            elif val < p5:
                WinScores.append(p5)
            elif val > p95:
                WinScores.append(p95)
            elif np.isnan(val) == True:
                #print('val:', val, '|', WinZs[i])
                WinScores.append(np.nan)
        
    
    return WinScores
        

def ZScore_it(x, WinZs):
    
    x2 = []
    for i, val in enumerate(x):
        wz = float(WinZs[i])
        val = float(val)
        if np.isnan(wz) == True:
            x2.append(np.nan)
        else:
            x2.append(val)
    
    x2 = np.array(x2)
    avg = np.nanmean(x2)
    std = np.nanstd(x2)
    zscores = (x2 - avg) / std
    return zscores


## Load HAC file

In [2]:
hac_df = pd.read_pickle(mydir + "data/CareCompare_data/CombinedFiles_HACRP/Facility.pkl")
hac_df = hac_df[hac_df['file_year'] == '2022']
hac_df = hac_df[hac_df['file_month'] == '04']

print('HACRP years:', sorted(hac_df['Fiscal Year'].unique()))

features = ['CAUTI Score', 'CLABSI Score', 'CDI Score', 'MRSA Score', 'SSI Score',
            'Total HAC Score', 'Domain 1 Score', 'AHRQ PSI-90 Score', 'Domain 2 Score',
            'CAUTI W Z Score', 'CLABSI W Z Score', 'MRSA W Z Score', 'CDI W Z Score', 
            'SSI W Z Score', 'PSI-90 W Z Score']
for f in features:
    hac_df[f] = hac_df[f].astype(str)
    hac_df[f] = hac_df[f].str.replace('*', '')
    hac_df[f] = pd.to_numeric(hac_df[f], errors='coerce')

hac_df.dropna(how='all', axis=1, inplace=True)
hac_df.head()

HACRP years: [2022]


Unnamed: 0,CAUTI Footnote,CAUTI W Z Score,CDI Footnote,CDI W Z Score,CLABSI Footnote,CLABSI W Z Score,Facility ID,Facility Name,Fiscal Year,HAI Measures End Date,HAI Measures Start Date,MRSA Footnote,MRSA W Z Score,PSI-90 End Date,PSI-90 Footnote,PSI-90 Start Date,PSI-90 W Z Score,Payment Reduction,Payment Reduction Footnote,SSI Footnote,SSI W Z Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year
0,,-0.7654,,-0.012,,0.1695,10001,SOUTHEAST HEALTH MEDICAL CENTER,2022,12/31/2019,01/01/2019,,-0.4193,12/31/2019,,07/01/2018,-1.3379,No,,,-0.5757,AL,,-0.4901,4,2022
1,,0.7408,,0.074,,-1.2914,10005,MARSHALL MEDICAL CENTERS,2022,12/31/2019,01/01/2019,,2.0736,12/31/2019,,07/01/2018,1.6595,Yes,,,-0.5926,AL,,0.444,4,2022
2,,-0.9705,,-0.353,,-1.2914,10006,NORTH ALABAMA MEDICAL CENTER,2022,12/31/2019,01/01/2019,,-0.2292,12/31/2019,,07/01/2018,1.6028,No,,,-0.2322,AL,,-0.2456,4,2022
3,5.0,,,-1.6615,5.0,,10007,MIZELL MEMORIAL HOSPITAL,2022,12/31/2019,01/01/2019,5.0,,12/31/2019,,07/01/2018,-0.3472,No,,5.0,,AL,,-1.0043,4,2022
4,5.0,,5.0,,5.0,,10008,CRENSHAW COMMUNITY HOSPITAL,2022,12/31/2019,01/01/2019,5.0,,12/31/2019,,07/01/2018,0.0445,No,,5.0,,AL,,0.0445,4,2022


## Format dates within the HAC file 

In [3]:

## Convert all other date data to default datetime

hac_df['HAI Measures Start Date'] = pd.to_datetime(hac_df['HAI Measures Start Date'])
hac_df.style.format({'HAI Measures Start Date': lambda t: t.strftime("%Y-%m-%d")})

hac_df['HAI Measures End Date'] = pd.to_datetime(hac_df['HAI Measures End Date'])
hac_df.style.format({'HAI Measures End Date': lambda t: t.strftime("%Y-%m-%d")})

hac_df = pd.concat([hac_df], axis=0)
hac_df.sort_values(by='HAI Measures End Date', inplace=True)

hac_df.rename(columns={'HAI Measures Start Date': 'Start Date', 'HAI Measures End Date': 'End Date'}, inplace=True)


## Load and merge HAI files

In [4]:
##############################   CAUTI   ################################################

cauti_df = pd.read_pickle(mydir + "data/preprocessed_HAI_data/CAUTI_Data.pkl")
cauti_df = cauti_df.filter(items=['Facility ID', 'CAUTI Urinary Catheter Days (ICUs + select wards)', 
                                  'CAUTI Observed Cases (ICUs + select wards)', 
                                  'CAUTI Predicted Cases (ICUs + select wards)', 
                                  'Start Date', 'End Date'], axis=1)

cauti_df.rename(columns={'CAUTI Urinary Catheter Days (ICUs + select wards)': 'CAUTI Urinary Catheter Days', 
                         'CAUTI Observed Cases (ICUs + select wards)': 'CAUTI Observed Cases', 
                         'CAUTI Predicted Cases (ICUs + select wards)': 'CAUTI Predicted Cases', 
                        }, inplace=True)

features = ['CAUTI Urinary Catheter Days', 'CAUTI Observed Cases', 'CAUTI Predicted Cases']
for f in features:
    cauti_df[f] = cauti_df[f].astype(str)
    cauti_df[f] = pd.to_numeric(cauti_df[f], errors='coerce')

cauti_df['Start Date'] = pd.to_datetime(cauti_df['Start Date'])
cauti_df.style.format({'Start Date': lambda t: t.strftime("%Y-%m-%d")})
cauti_df['End Date'] = pd.to_datetime(cauti_df['End Date'])
cauti_df.style.format({'End Date': lambda t: t.strftime("%Y-%m-%d")})
cauti_df = cauti_df[(cauti_df['Start Date'].isin(hac_df['Start Date'].unique())) | (cauti_df['End Date'].isin(hac_df['End Date'].unique()))]

cauti_df.head()


Unnamed: 0,Facility ID,CAUTI Urinary Catheter Days,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date
4,030117,1658.0,0.0,0.862,2019-01-01,2019-12-31
11,361311,31.0,0.0,0.021,2019-01-01,2019-12-31
29,520100,4335.0,1.0,3.23,2019-01-01,2019-12-31
31,520045,4616.0,2.0,4.877,2019-01-01,2019-12-31
39,15004F,,,,2019-01-01,2019-12-31


In [5]:
##############################   CLABSI   ###############################################

clabsi_df = pd.read_pickle(mydir + "data/preprocessed_HAI_data/CLABSI_Data.pkl")
clabsi_df = clabsi_df.filter(items=['Facility ID', 'CLABSI Device Days (ICUs + select wards)', 
                                    'CLABSI Observed Cases (ICUs + select wards)', 
                                    'CLABSI Predicted Cases (ICUs + select wards)', 
                                    'Start Date', 'End Date'], axis=1)

clabsi_df.rename(columns={'CLABSI Device Days (ICUs + select wards)': 'CLABSI Device Days', 
                          'CLABSI Observed Cases (ICUs + select wards)': 'CLABSI Observed Cases', 
                          'CLABSI Predicted Cases (ICUs + select wards)': 'CLABSI Predicted Cases', 
                        }, inplace=True)


features = ['CLABSI Device Days', 'CLABSI Observed Cases', 'CLABSI Predicted Cases']
for f in features:
    clabsi_df[f] = clabsi_df[f].astype(str)
    clabsi_df[f] = pd.to_numeric(clabsi_df[f], errors='coerce')

clabsi_df['Start Date'] = pd.to_datetime(clabsi_df['Start Date'])
clabsi_df.style.format({'Start Date': lambda t: t.strftime("%Y-%m-%d")})
clabsi_df['End Date'] = pd.to_datetime(clabsi_df['End Date'])
clabsi_df.style.format({'End Date': lambda t: t.strftime("%Y-%m-%d")})
clabsi_df = clabsi_df[(clabsi_df['Start Date'].isin(hac_df['Start Date'].unique())) | (clabsi_df['End Date'].isin(hac_df['End Date'].unique()))]

clabsi_df.head()

Unnamed: 0,Facility ID,CLABSI Device Days,CLABSI Observed Cases,CLABSI Predicted Cases,Start Date,End Date
2,36007F,,,,2019-01-01,2019-12-31
21,140127,1883.0,0.0,1.692,2019-01-01,2019-12-31
37,171339,234.0,0.0,0.064,2019-01-01,2019-12-31
50,020018,24.0,0.0,0.013,2019-01-01,2019-12-31
66,051323,15.0,0.0,0.005,2019-01-01,2019-12-31


In [6]:

##############################   MRSA   ###############################################

mrsa_df = pd.read_pickle(mydir + "data/preprocessed_HAI_data/MRSA_Data.pkl")
mrsa_df = mrsa_df.filter(items=['Facility ID', 'MRSA patient days', 
                                'MRSA Observed Cases', 'MRSA Predicted Cases', 
                                'Start Date', 'End Date'], axis=1)
mrsa_df['Start Date'] = pd.to_datetime(mrsa_df['Start Date'])
mrsa_df.style.format({'Start Date': lambda t: t.strftime("%Y-%m-%d")})
mrsa_df['End Date'] = pd.to_datetime(mrsa_df['End Date'])
mrsa_df.style.format({'End Date': lambda t: t.strftime("%Y-%m-%d")})

mrsa_df = mrsa_df[(mrsa_df['Start Date'].isin(hac_df['Start Date'].unique())) | (mrsa_df['End Date'].isin(hac_df['End Date'].unique()))]

##############################   CDI   ###############################################

cdi_df = pd.read_pickle(mydir + "data/preprocessed_HAI_data/CDI_Data.pkl")
cdi_df = cdi_df.filter(items=['Facility ID', 'CDIFF patient days', 
                              'CDIFF Observed Cases', 'CDIFF Predicted Cases', 
                              'Start Date', 'End Date'], axis=1)
cdi_df['Start Date'] = pd.to_datetime(cdi_df['Start Date'])
cdi_df.style.format({'Start Date': lambda t: t.strftime("%Y-%m-%d")})
cdi_df['End Date'] = pd.to_datetime(cdi_df['End Date'])
cdi_df.style.format({'End Date': lambda t: t.strftime("%Y-%m-%d")})

cdi_df = cdi_df[(cdi_df['Start Date'].isin(hac_df['Start Date'].unique())) | (cdi_df['End Date'].isin(hac_df['End Date'].unique()))]

###################  Merge CAUTI, CLABSI, MRSA, and CDI  ####################################

hai_df = cauti_df.merge(clabsi_df, on=['Facility ID', 'Start Date', 'End Date'], how='outer')
hai_df = hai_df.merge(mrsa_df, on=['Facility ID', 'Start Date', 'End Date'], how='outer')
hai_df = hai_df.merge(cdi_df, on=['Facility ID', 'Start Date', 'End Date'], how='outer')


####################  Drop duplicate rows resulting from merger #############################

hai_df.drop_duplicates(inplace=True)


########################  Conversions to numeric  #############################################

features = ['CAUTI Urinary Catheter Days', 'CLABSI Device Days',
            'MRSA patient days', 'CDIFF patient days',
            'CLABSI Observed Cases', 'CLABSI Predicted Cases', 
            'MRSA Observed Cases', 'MRSA Predicted Cases', 
            'CDIFF Observed Cases', 'CDIFF Predicted Cases', 
            'CAUTI Observed Cases', 'CAUTI Predicted Cases', 
           ]

for f in features:
    hai_df[f] = hai_df[f].astype(str)
    hai_df[f] = hai_df[f].str.replace('*', '')
    hai_df[f] = hai_df[f].str.replace(' ', '')
    hai_df[f] = pd.to_numeric(hai_df[f], errors='coerce')
    
hai_df['Total device days'] = hai_df['CLABSI Device Days'] + hai_df['CAUTI Urinary Catheter Days']


########################  Reorder columns  #############################################

col_to_move = hai_df.pop('CAUTI Urinary Catheter Days')
hai_df.insert(hai_df.shape[1] - 2, 'CAUTI Urinary Catheter Days', col_to_move)

print(hai_df.shape)
hai_df.head()

(5266, 16)


Unnamed: 0,Facility ID,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date,CLABSI Device Days,CLABSI Observed Cases,CLABSI Predicted Cases,MRSA patient days,MRSA Observed Cases,MRSA Predicted Cases,CDIFF patient days,CDIFF Observed Cases,CAUTI Urinary Catheter Days,CDIFF Predicted Cases,Total device days
0,030117,0.0,0.862,2019-01-01,2019-12-31,657.0,0.0,0.411,7022.0,0.0,0.153,6534.0,2.0,1658.0,1.713,2315.0
256,361311,0.0,0.021,2019-01-01,2019-12-31,6.0,0.0,0.002,1148.0,0.0,0.023,1148.0,1.0,31.0,0.254,37.0
512,520100,1.0,3.23,2019-01-01,2019-12-31,3241.0,0.0,2.219,17118.0,0.0,0.729,16219.0,2.0,4335.0,7.399,7576.0
768,520045,2.0,4.877,2019-01-01,2019-12-31,3849.0,2.0,3.451,34930.0,1.0,1.602,32193.0,9.0,4616.0,12.947,8465.0
1024,15004F,,,2019-01-01,2019-12-31,,,,,,,,,,,


## Filter HAI data on start dates and end dates that match those in the HAC file.

In [7]:
#########  Filter on start dates and end dates that match those in the HAC file  #######

hai_df = hai_df[(hai_df['Start Date'].isin(['2019-01-01'])) & (hai_df['End Date'].isin(['2019-12-31']))]

print(hai_df.shape)
print(hai_df['Start Date'].unique())
print(hai_df['End Date'].unique())

(5266, 16)
['2019-01-01T00:00:00.000000000']
['2019-12-31T00:00:00.000000000']


## Drop hospitals from HAI data that are not contained in HAC data.

In [8]:
# drop hospitals in HAI data that are not in HAC data
hai_df = hai_df[hai_df['Facility ID'].isin(hac_df['Facility ID'].unique())]

In [9]:
# hospitals in the hac data but not in the hai data ...
#tdf = hac_df.copy(deep=True) 
tdf = hac_df[~hac_df['Facility ID'].isin(hai_df['Facility ID'].unique())]
hac_df = hac_df[hac_df['Facility ID'].isin(hai_df['Facility ID'].unique())]

print(len(tdf['Facility ID'].unique()), 'hospitals in HAC dataset but not in HAI dataset\n')


tdf.head()

15 hospitals in HAC dataset but not in HAI dataset



Unnamed: 0,CAUTI Footnote,CAUTI W Z Score,CDI Footnote,CDI W Z Score,CLABSI Footnote,CLABSI W Z Score,Facility ID,Facility Name,Fiscal Year,End Date,Start Date,MRSA Footnote,MRSA W Z Score,PSI-90 End Date,PSI-90 Footnote,PSI-90 Start Date,PSI-90 W Z Score,Payment Reduction,Payment Reduction Footnote,SSI Footnote,SSI W Z Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year
1989,5.0,,5.0,,5.0,,330411,UNITY SPECIALTY HOSPITAL,2022,2019-12-31,2019-01-01,5.0,,12/31/2019,5.0,07/01/2018,,No,,5.0,,NY,5.0,,4,2022
1721,5.0,,5.0,,5.0,,280139,GRAND ISLAND REGIONAL MEDICAL CENTER,2022,2019-12-31,2019-01-01,5.0,,12/31/2019,5.0,07/01/2018,,No,,5.0,,NE,5.0,,4,2022
1851,5.0,,5.0,,5.0,,320091,THREE CROSSES REGIONAL HOSPITAL LLC,2022,2019-12-31,2019-01-01,5.0,,12/31/2019,5.0,07/01/2018,,No,,5.0,,NM,5.0,,4,2022
3091,5.0,,5.0,,5.0,,520213,FROEDTERT COMMUNITY HOSPITAL,2022,2019-12-31,2019-01-01,5.0,,12/31/2019,5.0,07/01/2018,,No,,5.0,,WI,5.0,,4,2022
3166,5.0,,5.0,,5.0,,670267,THE WOODLANDS SPECIALTY HOSPITAL,2022,2019-12-31,2019-01-01,5.0,,12/31/2019,5.0,07/01/2018,,No,,5.0,,TX,5.0,,4,2022


## Correct HAI file for non-duplicate rows having duplicate dates

**Problem:** Some rows for the same provider have duplicate measurement dates but different values for observed cases, predicted cases, etc. This results from each year having multiple (quarterly) files, the data within which can vary among files. Additionally, the most recent file for each year is not always the right file to use. 

**Need:** Since only one row can be used, we need to figure out which row should be used.

**Solution:** Select the row with the greatest totals for predicted cases for each HAI.

In [10]:
##############  Label rows that have duplicate dates (per provider) ####################
##############  For each provider with rows having duplicate dates,  ###################
###########  keep the last row (will have greatest number of total device days)  #######

hai_df['duplicated dates'] = hai_df.duplicated(subset=['Facility ID', 'Start Date', 'End Date'], keep=False)

hai_df.sort_values(by=['Facility ID', 'Start Date', 'End Date', 
                       'MRSA Predicted Cases', 'CAUTI Predicted Cases',  
                       'MRSA Observed Cases', 'CAUTI Observed Cases', 
                       'CDIFF Predicted Cases', 'CLABSI Predicted Cases',
                       'CDIFF Observed Cases', 'CLABSI Observed Cases',
                       ], inplace=True, ascending=False)

hai_df.drop_duplicates(subset=['Facility ID', 'Start Date', 'End Date'], inplace=True, keep='first')

print(hai_df.shape)
hai_df.head()

(3155, 17)


Unnamed: 0,Facility ID,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date,CLABSI Device Days,CLABSI Observed Cases,CLABSI Predicted Cases,MRSA patient days,MRSA Observed Cases,MRSA Predicted Cases,CDIFF patient days,CDIFF Observed Cases,CAUTI Urinary Catheter Days,CDIFF Predicted Cases,Total device days,duplicated dates
1065528,670266,,,2019-01-01,2019-12-31,,,,,,,,,,,,False
1209734,670260,,,2019-01-01,2019-12-31,,,,,,,,,,,,False
1169889,670259,,,2019-01-01,2019-12-31,,,,,,,,,,,,False
720258,670143,,,2019-01-01,2019-12-31,,,,,,,,,,,,False
249925,670136,,,2019-01-01,2019-12-31,,,,,,,,,,,,False


## Aggregate annual HAI data into biennial data

Purpose: Match the biennial measurement periods of HAC data


In [11]:
start_dates = hac_df['Start Date'].tolist()
end_dates = hac_df['End Date'].tolist()
prvdrs = hac_df['Facility ID'].tolist()

total_device_days = []

cauti_days = []
clabsi_days = []
mrsa_days = []
cdi_days = []

cauti_pred = []
clabsi_pred = []
mrsa_pred = []
cdi_pred = []

cauti_obs = []
clabsi_obs = []
mrsa_obs = []
cdi_obs = []

for i, start in enumerate(start_dates):
    end = end_dates[i]
    prvdr = prvdrs[i]
    
    tdf = hai_df[hai_df['Facility ID'] == prvdr]
    tdf = tdf[(tdf['Start Date'] == start) | (tdf['End Date'] == end)]
    
    if tdf.shape[0] == 1:
        pass
        #print('tdf.shape[0]:', 1)
        #print('hospital:', prvdr)
        #print(tdf['Start Date'].unique())
        #print(tdf['End Date'].unique())
        
    if tdf.shape[0] > 2:
        print('tdf.shape[0] = ', tdf.shape[0])
        for date_ in ['Start Date', 'End Date']:
            tdf['duplicated dates'] = tdf.duplicated(subset=[date_], keep=False)
            tdf.sort_values(by=[
                                'CAUTI Observed Cases', 'CLABSI Observed Cases',
                                'MRSA Observed Cases', 'CDIFF Observed Cases',
                                ], inplace=True, ascending=True)

            tdf.drop_duplicates(subset=[date_], inplace=True, keep='last')

        if tdf.shape[0] > 2:
            print('Error:')
            print("tdf.shape[0] > 2:", tdf.shape[0])
            print(start)
            print(end)
            print(tdf.head())
            break
        
    total_device_days.append(np.nansum(tdf['Total device days']))
    cauti_days.append(np.nansum(tdf['CAUTI Urinary Catheter Days']))
    clabsi_days.append(np.nansum(tdf['CLABSI Device Days']))
    mrsa_days.append(np.nansum(tdf['MRSA patient days']))
    cdi_days.append(np.nansum(tdf['CDIFF patient days']))
        
    cauti_pred.append(np.nansum(tdf['CAUTI Predicted Cases']))
    clabsi_pred.append(np.nansum(tdf['CLABSI Predicted Cases']))
    mrsa_pred.append(np.nansum(tdf['MRSA Predicted Cases']))
    cdi_pred.append(np.nansum(tdf['CDIFF Predicted Cases']))

    cauti_obs.append(np.nansum(tdf['CAUTI Observed Cases']))
    clabsi_obs.append(np.nansum(tdf['CLABSI Observed Cases']))
    mrsa_obs.append(np.nansum(tdf['MRSA Observed Cases']))
    cdi_obs.append(np.nansum(tdf['CDIFF Observed Cases']))
    

## Add HAI data to the HAC dataframe and save

In [12]:
hac_df['Total device days'] = total_device_days
hac_df['CAUTI Urinary Catheter Days'] = cauti_days
hac_df['CLABSI Device Days'] = clabsi_days
hac_df['MRSA patient days'] = mrsa_days
hac_df['CDI patient days'] = cdi_days

hac_df['CAUTI Observed Cases'] = cauti_obs
hac_df['CLABSI Observed Cases'] = clabsi_obs
hac_df['MRSA Observed Cases'] = mrsa_obs
hac_df['CDI Observed Cases'] = cdi_obs

hac_df['CAUTI Predicted Cases'] = cauti_pred
hac_df['CLABSI Predicted Cases'] = clabsi_pred
hac_df['MRSA Predicted Cases'] = mrsa_pred
hac_df['CDI Predicted Cases'] = cdi_pred

hac_df['CAUTI derived SIR'] = np.round(hac_df['CAUTI Observed Cases'] / hac_df['CAUTI Predicted Cases'],4)
hac_df['CLABSI derived SIR'] = np.round(hac_df['CLABSI Observed Cases'] / hac_df['CLABSI Predicted Cases'],4)
hac_df['MRSA derived SIR'] = np.round(hac_df['MRSA Observed Cases'] / hac_df['MRSA Predicted Cases'],4)
hac_df['CDI derived SIR'] = np.round(hac_df['CDI Observed Cases'] / hac_df['CDI Predicted Cases'],4)

print('hac_df.shape:', hac_df.shape)
print(len(hac_df['Facility ID'].unique()), 'hospitals in 2022 HACRP')
#hac_df.to_pickle('~/GitHub/HACRP-HAIs/data/merged_HAC_HAI/merged_HAI_HAC_2022.pkl', protocol=5)
hac_df.head()



hac_df.shape: (3155, 43)
3155 hospitals in 2022 HACRP


Unnamed: 0,CAUTI Footnote,CAUTI W Z Score,CDI Footnote,CDI W Z Score,CLABSI Footnote,CLABSI W Z Score,Facility ID,Facility Name,Fiscal Year,End Date,Start Date,MRSA Footnote,MRSA W Z Score,PSI-90 End Date,PSI-90 Footnote,PSI-90 Start Date,PSI-90 W Z Score,Payment Reduction,Payment Reduction Footnote,SSI Footnote,SSI W Z Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,MRSA patient days,CDI patient days,CAUTI Observed Cases,CLABSI Observed Cases,MRSA Observed Cases,CDI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,MRSA Predicted Cases,CDI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,MRSA derived SIR,CDI derived SIR
0,,-0.7654,,-0.012,,0.1695,10001,SOUTHEAST HEALTH MEDICAL CENTER,2022,2019-12-31,2019-01-01,,-0.4193,12/31/2019,,07/01/2018,-1.3379,No,,,-0.5757,AL,,-0.4901,4,2022,21114.0,13268.0,7846.0,102765.0,102765.0,6.0,6.0,4.0,41.0,18.498,8.086,7.305,76.294,0.3244,0.742,0.5476,0.5374
2107,5.0,,,1.0354,5.0,,360044,WAYNE HOSPITAL,2022,2019-12-31,2019-01-01,5.0,,12/31/2019,,07/01/2018,-0.358,Yes,,5.0,,OH,,0.3387,4,2022,1204.0,869.0,335.0,4494.0,3379.0,0.0,0.0,0.0,2.0,0.456,0.212,0.117,2.278,0.0,0.0,0.0,0.878
2108,5.0,,,2.0828,5.0,,360046,MCCULLOUGH-HYDE MEMORIAL HOSPITAL,2022,2019-12-31,2019-01-01,5.0,,12/31/2019,,07/01/2018,-0.549,Yes,,5.0,,OH,,0.7669,4,2022,1300.0,946.0,354.0,6945.0,6233.0,1.0,0.0,0.0,2.0,0.477,0.214,0.134,1.628,2.0964,0.0,0.0,1.2285
2109,,-0.3532,,0.0709,,-0.9803,360048,UNIVERSITY OF TOLEDO MEDICAL CENTER,2022,2019-12-31,2019-01-01,,0.5904,12/31/2019,,07/01/2018,1.2259,No,,,-0.8261,OH,,-0.0454,4,2022,11157.0,5269.0,5888.0,47657.0,47657.0,4.0,1.0,4.0,19.0,7.564,6.348,3.6,33.68,0.5288,0.1575,1.1111,0.5641
2110,,-0.7191,,0.1845,,-0.4172,360051,MIAMI VALLEY HOSPITAL,2022,2019-12-31,2019-01-01,,0.2299,12/31/2019,,07/01/2018,0.9032,No,,,-0.2254,OH,,-0.0073,4,2022,57497.0,25738.0,31759.0,251924.0,231286.0,14.0,15.0,20.0,106.0,40.344,33.815,21.977,176.518,0.347,0.4436,0.91,0.6005


## Generate Winsorized z-scores

In [13]:
hais = ['CAUTI', 'CLABSI', 'MRSA', 'CDI']

df_yr = hac_df.copy(deep=True)
    
for i, hai in enumerate(hais):
    tdf2 = df_yr[~df_yr[hai + ' Footnote'].isin([18, '18', '18 ', ' 18', 
                                              5, '5', ' 5', '5 ',
                                              4, '4', ' 4', '4 ',
                                              ])]
      
    reported_winZ = tdf2[hai + ' W Z Score'].tolist()
    sirs = tdf2[hai + ' derived SIR'].tolist()
    tdf2[hai + ' derived Winsorized SIR'] = Winsorize_it(sirs, reported_winZ)
    tdf2[hai + ' derived W Z Score'] = ZScore_it(tdf2[hai + ' derived Winsorized SIR'], reported_winZ)
    
    # Assign maximum WinZ scores to hospitals with HAI footnote 18 
    maxWinZ = np.nanmax(tdf2[hai + ' derived W Z Score'])
    tdf3 = df_yr[df_yr[hai + ' Footnote'].isin([18, '18', '18 ', ' 18'])]
    tdf3[hai + ' derived Winsorized SIR'] = [np.nan]*tdf3.shape[0]
    tdf3[hai + ' derived W Z Score'] = [maxWinZ]*tdf3.shape[0]
    
    tdf4 = df_yr[df_yr[hai + ' Footnote'].isin([5, '5', ' 5', '5 ', 4, '4', ' 4', '4 '])]
    tdf4[hai + ' derived Winsorized SIR'] = [np.nan]*tdf4.shape[0]
    tdf4[hai + ' derived W Z Score'] = [np.nan]*tdf4.shape[0]
    
    df_yr = pd.concat([tdf2, tdf3, tdf4], axis=0)

del tdf2
del tdf3

display_df = df_yr.copy(deep=True)
items = ['file_year', 'HAI Measures End Date', 'HAI Measures Start Date',
         'CAUTI Footnote', 'CAUTI W Z Score', 'CAUTI derived W Z Score', 'CAUTI derived Winsorized SIR',
         'CDI Footnote', 'CDI W Z Score', 'CDI derived W Z Score',
         'CLABSI Footnote', 'CLABSI W Z Score', 'CLABSI derived W Z Score',
         'MRSA Footnote', 'MRSA W Z Score', 'MRSA derived W Z Score',
         #'Total HAC Footnote', 'Total HAC Score',
         ]


display_df = display_df.filter(items=items)
#display_df = display_df[display_df['CAUTI W Z Score'].isin([np.nan, float("NaN")])]
#display_df.sort_values(by='CAUTI derived W Z Score', inplace=True, ascending=False)
display_df.head(20)

Unnamed: 0,file_year,CAUTI Footnote,CAUTI W Z Score,CAUTI derived W Z Score,CAUTI derived Winsorized SIR,CDI Footnote,CDI W Z Score,CDI derived W Z Score,CLABSI Footnote,CLABSI W Z Score,CLABSI derived W Z Score,MRSA Footnote,MRSA W Z Score,MRSA derived W Z Score
0,2022,,-0.7654,-0.765325,0.3244,,-0.012,-0.013854,,0.1695,0.169634,,-0.4193,-0.420642
2109,2022,,-0.3532,-0.354223,0.5288,,0.0709,0.068137,,-0.9803,-0.982387,,0.5904,0.59086
2110,2022,,-0.7191,-0.719871,0.347,,0.1845,0.179916,,-0.4172,-0.418498,,0.2299,0.229878
2111,2022,,-0.689,-0.6895,0.3621,,0.5501,0.547497,,-1.2914,-1.292812,,0.1366,0.135818
2112,2022,,0.4894,0.4891,0.9481,,-0.3622,-0.364852,,-0.0707,-0.071216,,-0.6417,-0.641791
2113,2022,,-0.1983,-0.199154,0.6059,,0.5593,0.554867,,-0.8346,-0.835157,,-0.57,-0.56981
2115,2022,,-1.246,-1.245816,0.0855,,-0.4236,-0.426883,,0.0514,0.051968,,0.8504,0.851499
2106,2022,,-0.6809,-0.682461,0.3656,,0.6453,0.643307,,1.7525,1.754673,,-0.6345,-0.635149
2116,2022,,-0.6649,-0.664963,0.3743,,0.0002,-0.002492,,-0.1061,-0.107087,,-0.6363,-0.637483
2118,2022,,-1.1254,-1.126548,0.1448,,-0.9427,-0.945243,,-0.9311,-0.931339,,-0.3709,-0.371638


In [14]:

features = ['CAUTI derived SIR', 'CAUTI W Z Score', 'CAUTI derived W Z Score',
            'CDI derived SIR', 'CDI W Z Score', 'CDI derived W Z Score',
            'CLABSI derived SIR', 'CLABSI W Z Score', 'CLABSI derived W Z Score',
            'MRSA derived SIR', 'MRSA W Z Score', 'MRSA derived W Z Score', 
            'SSI W Z Score', 'PSI-90 W Z Score', 'Total HAC Score']

for f in features:
    df_yr[f] = df_yr[f].astype(str)
    df_yr[f] = pd.to_numeric(df_yr[f], errors='coerce')
    

for hai in hais:
    print(hai)
    
    ls1 = df_yr[hai + ' W Z Score'].tolist()
    ls2 = df_yr[hai + ' derived W Z Score'].tolist()
    final_ls = []
    
    for i, val in enumerate(ls1):
        if np.isnan(val) == False and np.isnan(ls2[i]) == True:
            final_ls.append(ls2[i])
        elif np.isnan(val) == False and np.isnan(ls2[i]) == False:
            final_ls.append(ls2[i])
        elif np.isnan(val) == True and np.isnan(ls2[i]) == False:
            final_ls.append(ls2[i])
        elif np.isnan(val) == True and np.isnan(ls2[i]) == True:
            final_ls.append(ls2[i])
        else:
            final_ls.append(ls2[i])
            
    df_yr[hai + ' derived W Z Score'] = final_ls


CAUTI
CLABSI
MRSA
CDI


## Attempt to reproduce HAC scores for 2022

In [15]:
print('Results from attempting to reproduce Yes/No penalty assignments:\n')
print('Excluded from results below:')
print('1. MD hospitals')
print('2. Hospitals with payment reduction values other than Yes or No\n')

df_yr.dropna(how='all', axis=1, inplace=True)

holdout_df = df_yr[(df_yr['State'] == 'MD') | ~df_yr['Payment Reduction'].isin(['Yes', 'No']) | (df_yr['Total HAC Score'].isin([float("NaN"), np.nan]))]
df_yr = df_yr[(df_yr['State'] != 'MD') & (df_yr['Payment Reduction'].isin(['Yes', 'No'])) & (~df_yr['Total HAC Score'].isin([float("NaN"), np.nan]))]

hac_scores = []
ct1 = 0
ct2 = 0


for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    d = 0
        
    w_ls = []
    sum_ls = []

    # Use original scores for all HAIs to test whether can penalties be reproduced when 
    # using data from the HACRP files
    #m_ls = ['CDI W Z Score', 'CAUTI W Z Score', 'CLABSI W Z Score', 'MRSA W Z Score', 'SSI W Z Score']

    # Use original SSI scores but derived scores for CDI, CAUTI, CLABSI, and MRSA for actual results
    m_ls = ['PSI-90 W Z Score', 'CDI derived W Z Score', 'CAUTI derived W Z Score', 'CLABSI derived W Z Score', 'MRSA derived W Z Score', 'SSI W Z Score']
                
    s = 0
    w = 0
    for m in m_ls:
        v = tdf[m].tolist()
        if len(list(set(v))) > 1:
            print('len(list(set(v))) > 1')
            sys.exit()

        v = tdf[m].iloc[0]

        if np.isnan(v) == False: 
            s += v
            w += 1

    if w == 0:
        d = np.nan
    else:
        d = s/w
        
    hac_scores.append(d)
    state = tdf['State'].iloc[0]
    
        
df_yr['Total HAC Score (derived)'] = hac_scores
print(df_yr.shape[0], 'hospitals in hac_df')


Results from attempting to reproduce Yes/No penalty assignments:

Excluded from results below:
1. MD hospitals
2. Hospitals with payment reduction values other than Yes or No

3060 hospitals in hac_df


In [16]:
p75 = np.nanpercentile(df_yr['Total HAC Score'], 75)
p75 = 0.2998
print('p75:', p75)

pr = []
for hosp in df_yr['Facility ID'].tolist():
    tdf = df_yr[df_yr['Facility ID'] == hosp]

    score = tdf['Total HAC Score'].iloc[0]

    if np.isnan(score) == True:
        pr.append('No')
    elif score <= p75:
        pr.append('No')
    elif score > p75:
        pr.append('Yes')
    else:
        print('This score is an error:', score)
        sys.exit()

df_yr['Payment Reduction (derived)'] = pr
    
o_list = df_yr['Payment Reduction'].tolist()
d_list = df_yr['Payment Reduction (derived)'].tolist()

same = 0
diff = 0
res_ls = []
for i, o in enumerate(o_list):
    if o == d_list[i]:
        same += 1
        res_ls.append(1)
    else:
        diff += 1
        res_ls.append(0)
            
df_yr['Payment Reduction Reproduced?'] = res_ls
    
print(same, "Penalty assignments were reproduced")
print(diff, "Penalty assignments were not reproduced")
print(str(np.round(100 * same/(same+diff), 2)) + '% penalty assignments were reproduced\n')

df_yr['HAC delta'] = df_yr['Total HAC Score'] - df_yr['Total HAC Score (derived)']
df_yr['CDI delta'] = df_yr['CDI W Z Score'] - df_yr['CDI derived W Z Score']
df_yr['MRSA delta'] = df_yr['MRSA W Z Score'] - df_yr['MRSA derived W Z Score']
df_yr['CAUTI delta'] = df_yr['CAUTI W Z Score'] - df_yr['CAUTI derived W Z Score']
df_yr['CLABSI delta'] = df_yr['CLABSI W Z Score'] - df_yr['CLABSI derived W Z Score']


p75: 0.2998
3060 Penalty assignments were reproduced
0 Penalty assignments were not reproduced
100.0% penalty assignments were reproduced



In [17]:
ls1 = list(df_yr)
ls2 = list(holdout_df)
ls = list(filter(lambda x:x in ls1, ls2))
print(df_yr.shape)
print(holdout_df.shape)
df_yr = df_yr.merge(holdout_df, how='outer', on=ls)
print(df_yr.shape)


(3060, 59)
(95, 51)
(3155, 59)


In [18]:
tdf1 = df_yr[df_yr['Payment Reduction'] == 'Yes']
tdf2 = df_yr[df_yr['Payment Reduction'].isin(['Yes', 'No'])]
print(tdf1.shape[0]/tdf2.shape[0])

tdf1 = df_yr[df_yr['Payment Reduction'] == 'No']
print(tdf1.shape[0]/tdf2.shape[0])

print(df_yr.shape[0])
print(len(df_yr['Facility ID'].unique()))

df_yr.to_pickle('~/GitHub/HACRP-HAIs/data/merged_HAC_HAI/HAI_HAC_2022.pkl', protocol=5)


0.24573817947893214
0.7542618205210678
3155
3155
