In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import pandas as pd
import warnings
import sys
import numpy as np
import scipy as sc
import random
from scipy import stats
from numpy import log10, sqrt

# Root of the local project checkout; every pd.read_pickle call below builds its path from it.
# NOTE(review): absolute, user-specific path -- the notebook cannot run on another machine
# without editing this line.
mydir = '/Users/kenlocey/GitHub/HACRP-HAIs/'
# Suppress every warning for the rest of the session (this also hides pandas/numpy warnings).
warnings.filterwarnings('ignore')
# Remove the display limits so DataFrames are shown with all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def Winsorize_it(x, WinZs):
    """Winsorize ``x`` at its 5th and 95th percentiles.

    An entry of ``x`` takes part only when the entry of ``WinZs`` at the same
    *position* is not NaN; every other entry is treated as missing. The
    percentiles are computed over the participating, non-NaN values, and
    those values are then clamped to the range [p5, p95].

    Parameters
    ----------
    x : iterable of numbers
        Raw scores (list, array or pandas Series). Each element is passed
        through ``float``.
    WinZs : iterable of numbers
        Reference scores, at least as long as ``x``. Elements are paired with
        ``x`` by position, never by index label, so a pandas Series with a
        filtered (non 0..n-1) index is handled correctly.

    Returns
    -------
    list of float
        One value per element of ``x``: NaN where the reference score or the
        raw score is NaN, otherwise the raw score clamped to [p5, p95].

    Raises
    ------
    IndexError
        If ``WinZs`` has fewer elements than ``x``.
    """
    values = np.array([float(v) for v in x], dtype=float)
    # Iterating (rather than indexing with WinZs[i]) keeps the pairing positional.
    reference = np.array([float(w) for w in WinZs], dtype=float)
    if reference.size < values.size:
        raise IndexError('WinZs has fewer elements than x')

    # Blank out every score whose reference score is missing.
    masked = np.where(np.isnan(reference[:values.size]), np.nan, values)

    # Nothing to winsorize: skip the percentile call (it would only warn and return NaN).
    if masked.size == 0 or np.isnan(masked).all():
        return masked.tolist()

    p5 = np.nanpercentile(masked, 5)
    p95 = np.nanpercentile(masked, 95)

    # np.clip leaves NaN entries as NaN and clamps everything else.
    return np.clip(masked, p5, p95).tolist()
        

def ZScore_it(x, WinZs):
    """Convert ``x`` to z-scores, ignoring entries without a reference score.

    An entry of ``x`` takes part only when the entry of ``WinZs`` at the same
    *position* is not NaN. The mean and standard deviation (``np.nanmean`` and
    ``np.nanstd`` with their defaults) are computed over the participating,
    non-NaN values.

    Parameters
    ----------
    x : iterable of numbers
        Scores to standardize (list, array or pandas Series). Each element is
        passed through ``float``.
    WinZs : iterable of numbers
        Reference scores, at least as long as ``x``. Elements are paired with
        ``x`` by position, never by index label.

    Returns
    -------
    numpy.ndarray
        ``(value - mean) / std`` per element; NaN where the reference score or
        the value itself is NaN. When the standard deviation is zero the
        division is left to numpy (NaN or +/-inf).

    Raises
    ------
    IndexError
        If ``WinZs`` has fewer elements than ``x``.
    """
    values = np.array([float(v) for v in x], dtype=float)
    # Iterating (rather than indexing with WinZs[i]) keeps the pairing positional.
    reference = np.array([float(w) for w in WinZs], dtype=float)
    if reference.size < values.size:
        raise IndexError('WinZs has fewer elements than x')

    # Blank out every score whose reference score is missing.
    masked = np.where(np.isnan(reference[:values.size]), np.nan, values)

    avg = np.nanmean(masked)
    std = np.nanstd(masked)
    return (masked - avg) / std


# Month of the HACRP release file to analyse, as a string; it is compared against the
# 'file_month' column when the HAC file is filtered.
hac_mo = '10'

## Load HAC file

In [2]:
# Load the combined HACRP facility file.
# NOTE(review): pickle files can execute code when loaded -- only read files produced locally.
hac_df = pd.read_pickle(mydir + "data/CareCompare_data/CombinedFiles_HACRP/Facility.pkl")
features = ['Total HAC Score', 'Domain 1 Score', 'Domain 2 Score', 'CAUTI W Z Score', 'CLABSI W Z Score', 
            'MRSA W Z Score', 'CDI W Z Score', 'SSI W Z Score', 'PSI-90 W Z Score']

# Strip '*' markers and blanks as literal text, then coerce to numbers (unparseable -> NaN).
# regex=False is explicit because '*' is a regex metacharacter and the default of the
# `regex` argument differs between pandas versions.
for f in features:
    cleaned = hac_df[f].astype(str)
    cleaned = cleaned.str.replace('*', '', regex=False)
    cleaned = cleaned.str.replace(' ', '', regex=False)
    hac_df[f] = pd.to_numeric(cleaned, errors='coerce')

# Keep only the rows from the 2018 file of month `hac_mo`, then drop columns that are entirely empty.
hac_df = hac_df[(hac_df['file_year'] == '2018') & (hac_df['file_month'] == hac_mo)]
hac_df = hac_df.dropna(how='all', axis=1)

print('Max HAC score:', np.max(hac_df['Total HAC Score']))
print('Months:', hac_df['file_month'].unique())
print(len(hac_df['Facility ID'].unique()), 'hospitals in 2018 HACRP')

hac_df.head()

Max HAC score: 2.1496
Months: ['10']
3305 hospitals in 2018 HACRP


Unnamed: 0,CAUTI Footnote,CAUTI W Z Score,CDI Footnote,CDI W Z Score,CLABSI Footnote,CLABSI W Z Score,Domain 1 End Date,Domain 1 Footnote,Domain 1 Score,Domain 1 Start Date,Domain 2 End Date,Domain 2 Footnote,Domain 2 Score,Domain 2 Start Date,Facility ID,Facility Name,Fiscal Year,MRSA Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,Payment Reduction,Payment Reduction Footnote,SSI Footnote,SSI W Z Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year
0,,1.3847,,-1.0689,,1.628,09/30/2015,,-1.2795,07/01/2014,12/31/2016,,0.4826,01/01/2015,10001,SOUTHEAST ALABAMA MEDICAL CENTER,2018,,0.3071,,-1.2795,No,,,0.1619,AL,,0.2182,10,2018
1,,0.4575,,-1.406,,0.779,09/30/2015,,-0.0362,07/01/2014,12/31/2016,,-0.1814,01/01/2015,10005,MARSHALL MEDICAL CENTERS,2018,,0.3862,,-0.0362,No,,,-1.1238,AL,,-0.1596,10,2018
2,,0.5594,,-1.0153,,0.0433,09/30/2015,,0.034,07/01/2014,12/31/2016,,0.0409,01/01/2015,10006,ELIZA COFFEE MEMORIAL HOSPITAL,2018,,0.7678,,0.034,No,,,-0.1508,AL,,0.0398,10,2018
3,,-1.5254,,-1.9602,5.0,,09/30/2015,,-0.4543,07/01/2014,12/31/2016,,-1.7428,01/01/2015,10007,MIZELL MEMORIAL HOSPITAL,2018,5.0,,,-0.4543,No,,5.0,,AL,,-1.5495,10,2018
4,5.0,,5.0,,5.0,,09/30/2015,,-0.0259,07/01/2014,12/31/2016,5.0,,01/01/2015,10008,CRENSHAW COMMUNITY HOSPITAL,2018,5.0,,,-0.0259,No,,5.0,,AL,,-0.0259,10,2018


## Format dates within the HAC file 

In [3]:

# Parse the Domain 2 measurement window into datetimes so it can be matched against the
# HAI files. (The former `hac_df.style.format(...)` calls were dropped: they built a
# display-only Styler that was immediately discarded and never changed hac_df.)
for date_col in ['Domain 2 Start Date', 'Domain 2 End Date']:
    hac_df[date_col] = pd.to_datetime(hac_df[date_col])

# Use the same column names as the HAI files.
hac_df = hac_df.rename(columns={'Domain 2 Start Date': 'Start Date', 'Domain 2 End Date': 'End Date'})

print('rows in hac_df, after reformatting dates:', hac_df.shape[0])
print(hac_df['Start Date'].unique())
print(hac_df['End Date'].unique())


rows in hac_df, after reformatting dates: 3305
['2015-01-01T00:00:00.000000000']
['2016-12-31T00:00:00.000000000']


## Load and merge HAI files

In [4]:
##############################   CAUTI   ################################################

# NOTE(review): pickle files can execute code when loaded -- only read files produced locally.
cauti_df = pd.read_pickle(mydir + "data/preprocessed_HAI_data/CAUTI_Data.pkl")
cauti_df = cauti_df.filter(items=['Facility ID', 'CAUTI Urinary Catheter Days (ICUs + select wards)', 
                                  'CAUTI Observed Cases (ICUs + select wards)', 
                                  'CAUTI Predicted Cases (ICUs + select wards)', 
                                  'Start Date', 'End Date'], axis=1)

# Shorten the measure names by dropping the "(ICUs + select wards)" suffix.
cauti_df = cauti_df.rename(columns={'CAUTI Urinary Catheter Days (ICUs + select wards)': 'CAUTI Urinary Catheter Days', 
                                    'CAUTI Observed Cases (ICUs + select wards)': 'CAUTI Observed Cases', 
                                    'CAUTI Predicted Cases (ICUs + select wards)': 'CAUTI Predicted Cases', 
                                    })

# Coerce the measures to numbers; anything unparseable becomes NaN.
features = ['CAUTI Urinary Catheter Days', 'CAUTI Observed Cases', 'CAUTI Predicted Cases']
for f in features:
    cauti_df[f] = pd.to_numeric(cauti_df[f].astype(str), errors='coerce')

# Parse the measurement window. (The former `style.format` calls were dropped: they built a
# display-only Styler that was discarded and had no effect on the data.)
for date_col in ['Start Date', 'End Date']:
    cauti_df[date_col] = pd.to_datetime(cauti_df[date_col])

# Keep rows that share a start date OR an end date with the HAC measurement window.
hac_starts = hac_df['Start Date'].unique()
hac_ends = hac_df['End Date'].unique()
cauti_df = cauti_df[cauti_df['Start Date'].isin(hac_starts) | cauti_df['End Date'].isin(hac_ends)]

cauti_df.head()


Unnamed: 0,Facility ID,CAUTI Urinary Catheter Days,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date
1,260179,,4.0,8.658,2015-01-01,2015-06-30
14,361328,,0.0,0.572,2015-01-01,2015-09-30
20,450651,14923.0,20.0,31.259,2016-01-01,2016-12-31
21,450845,,,,2015-01-01,2015-09-30
27,61302,,,,2015-01-01,2015-09-30


In [5]:
##############################   CLABSI   ###############################################

# NOTE(review): pickle files can execute code when loaded -- only read files produced locally.
clabsi_df = pd.read_pickle(mydir + "data/preprocessed_HAI_data/CLABSI_Data.pkl")
clabsi_df = clabsi_df.filter(items=['Facility ID', 'CLABSI Device Days (ICUs + select wards)', 
                                    'CLABSI Observed Cases (ICUs + select wards)', 
                                    'CLABSI Predicted Cases (ICUs + select wards)', 
                                    'Start Date', 'End Date'], axis=1)

# Shorten the measure names by dropping the "(ICUs + select wards)" suffix.
clabsi_df = clabsi_df.rename(columns={'CLABSI Device Days (ICUs + select wards)': 'CLABSI Device Days', 
                                      'CLABSI Observed Cases (ICUs + select wards)': 'CLABSI Observed Cases', 
                                      'CLABSI Predicted Cases (ICUs + select wards)': 'CLABSI Predicted Cases', 
                                      })

# Coerce the measures to numbers; anything unparseable becomes NaN.
features = ['CLABSI Device Days', 'CLABSI Observed Cases', 'CLABSI Predicted Cases']
for f in features:
    clabsi_df[f] = pd.to_numeric(clabsi_df[f].astype(str), errors='coerce')

# Parse the measurement window. (The former `style.format` calls were dropped: they built a
# display-only Styler that was discarded and had no effect on the data.)
for date_col in ['Start Date', 'End Date']:
    clabsi_df[date_col] = pd.to_datetime(clabsi_df[date_col])

# Keep rows that share a start date OR an end date with the HAC measurement window.
hac_starts = hac_df['Start Date'].unique()
hac_ends = hac_df['End Date'].unique()
clabsi_df = clabsi_df[clabsi_df['Start Date'].isin(hac_starts) | clabsi_df['End Date'].isin(hac_ends)]

clabsi_df.head()

Unnamed: 0,Facility ID,CLABSI Device Days,CLABSI Observed Cases,CLABSI Predicted Cases,Start Date,End Date
10,370047,,1.0,2.031,2015-01-01,2015-06-30
14,110111,387.0,0.0,0.225,2016-01-01,2016-12-31
26,360141,,3.0,3.1,2015-01-01,2015-09-30
28,450716,,3.0,3.413,2015-01-01,2015-09-30
31,180009,4651.0,4.0,6.549,2015-01-01,2015-12-31


In [6]:

##############################   MRSA   ###############################################

# Dates of the HAC measurement window and the key shared by all four HAI tables.
hac_starts = hac_df['Start Date'].unique()
hac_ends = hac_df['End Date'].unique()
merge_keys = ['Facility ID', 'Start Date', 'End Date']

# NOTE(review): pickle files can execute code when loaded -- only read files produced locally.
mrsa_df = pd.read_pickle(mydir + "data/preprocessed_HAI_data/MRSA_Data.pkl")
mrsa_df = mrsa_df.filter(items=['Facility ID', 'MRSA patient days', 
                                'MRSA Observed Cases', 'MRSA Predicted Cases', 
                                'Start Date', 'End Date'], axis=1)
# (The former `style.format` calls were dropped here and below: they built a display-only
# Styler that was discarded and had no effect on the data.)
for date_col in ['Start Date', 'End Date']:
    mrsa_df[date_col] = pd.to_datetime(mrsa_df[date_col])

# Keep rows that share a start date OR an end date with the HAC measurement window.
mrsa_df = mrsa_df[mrsa_df['Start Date'].isin(hac_starts) | mrsa_df['End Date'].isin(hac_ends)]

##############################   CDI   ###############################################

cdi_df = pd.read_pickle(mydir + "data/preprocessed_HAI_data/CDI_Data.pkl")
cdi_df = cdi_df.filter(items=['Facility ID', 'CDIFF patient days', 
                              'CDIFF Observed Cases', 'CDIFF Predicted Cases', 
                              'Start Date', 'End Date'], axis=1)
for date_col in ['Start Date', 'End Date']:
    cdi_df[date_col] = pd.to_datetime(cdi_df[date_col])

cdi_df = cdi_df[cdi_df['Start Date'].isin(hac_starts) | cdi_df['End Date'].isin(hac_ends)]

###################  Merge CAUTI, CLABSI, MRSA, and CDI  ####################################

# Outer merges keep a facility/period even when only some of the four measures report it.
hai_df = cauti_df.merge(clabsi_df, on=merge_keys, how='outer')
hai_df = hai_df.merge(mrsa_df, on=merge_keys, how='outer')
hai_df = hai_df.merge(cdi_df, on=merge_keys, how='outer')


####################  Drop duplicate rows resulting from merger #############################

hai_df = hai_df.drop_duplicates()


########################  Conversions to numeric  #############################################

features = ['CAUTI Urinary Catheter Days', 'CLABSI Device Days',
            'MRSA patient days', 'CDIFF patient days',
            'CLABSI Observed Cases', 'CLABSI Predicted Cases', 
            'MRSA Observed Cases', 'MRSA Predicted Cases', 
            'CDIFF Observed Cases', 'CDIFF Predicted Cases', 
            'CAUTI Observed Cases', 'CAUTI Predicted Cases', 
           ]

# Strip '*' markers and blanks as literal text, then coerce to numbers (unparseable -> NaN).
# regex=False is explicit because '*' is a regex metacharacter and the default of the
# `regex` argument differs between pandas versions.
for f in features:
    cleaned = hai_df[f].astype(str)
    cleaned = cleaned.str.replace('*', '', regex=False)
    cleaned = cleaned.str.replace(' ', '', regex=False)
    hai_df[f] = pd.to_numeric(cleaned, errors='coerce')

# NaN in either device-day column makes the total NaN as well.
hai_df['Total device days'] = hai_df['CLABSI Device Days'] + hai_df['CAUTI Urinary Catheter Days']


########################  Reorder columns  #############################################

# Move the catheter-days column so that it sits third from the end.
col_to_move = hai_df.pop('CAUTI Urinary Catheter Days')
hai_df.insert(hai_df.shape[1] - 2, 'CAUTI Urinary Catheter Days', col_to_move)

print(hai_df.shape)
hai_df.head()

(75637, 16)


Unnamed: 0,Facility ID,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date,CLABSI Device Days,CLABSI Observed Cases,CLABSI Predicted Cases,MRSA patient days,MRSA Observed Cases,MRSA Predicted Cases,CDIFF patient days,CDIFF Observed Cases,CAUTI Urinary Catheter Days,CDIFF Predicted Cases,Total device days
0,260179,4.0,8.658,2015-01-01,2015-06-30,,1.0,6.933,,,,,,,,
1,361328,0.0,0.572,2015-01-01,2015-09-30,,0.0,0.069,,,,,,,,
2,450651,20.0,31.259,2016-01-01,2016-12-31,15342.0,15.0,17.42,117372.0,5.0,6.179,103727.0,101.0,14923.0,78.891,30265.0
3,450845,,,2015-01-01,2015-09-30,,,,,,,,,,,
4,61302,,,2015-01-01,2015-09-30,,,,,,,,,,,


## Filter HAI data on start dates and end dates that match those in the HAC file.

In [7]:
#########  Keep only calendar-year rows for 2015 and 2016  #############################

# The dates are listed explicitly here (they are not read from hac_df). They are parsed to
# datetimes up front so the comparison with the datetime columns does not rely on pandas
# coercing string literals implicitly.
annual_starts = pd.to_datetime(['2015-01-01', '2016-01-01'])
annual_ends = pd.to_datetime(['2015-12-31', '2016-12-31'])

hai_df = hai_df[hai_df['Start Date'].isin(annual_starts) & hai_df['End Date'].isin(annual_ends)]

print(hai_df.shape)
print(hai_df['Start Date'].unique())
print(hai_df['End Date'].unique())

(61728, 16)
['2016-01-01T00:00:00.000000000' '2015-01-01T00:00:00.000000000']
['2016-12-31T00:00:00.000000000' '2015-12-31T00:00:00.000000000']


## Drop hospitals from HAI data that are not contained in HAC data.

In [8]:
# Restrict the HAI table to facilities that also appear in the HAC table.
hac_facility_ids = hac_df['Facility ID'].unique()
in_hac = hai_df['Facility ID'].isin(hac_facility_ids)
hai_df = hai_df[in_hac]

In [9]:
# Split the HAC table by whether each facility has any HAI rows:
# `tdf` collects the facilities without HAI data, `hac_df` keeps the rest.
has_hai_rows = hac_df['Facility ID'].isin(hai_df['Facility ID'].unique())
tdf = hac_df[~has_hai_rows]
hac_df = hac_df[has_hai_rows]

n_without_hai = len(tdf['Facility ID'].unique())
print(n_without_hai, 'hospitals in HAC dataset but not in HAI dataset\n')


tdf.head()

5 hospitals in HAC dataset but not in HAI dataset



Unnamed: 0,CAUTI Footnote,CAUTI W Z Score,CDI Footnote,CDI W Z Score,CLABSI Footnote,CLABSI W Z Score,Domain 1 End Date,Domain 1 Footnote,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Footnote,Domain 2 Score,Start Date,Facility ID,Facility Name,Fiscal Year,MRSA Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,Payment Reduction,Payment Reduction Footnote,SSI Footnote,SSI W Z Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year
498,5.0,,5.0,,5.0,,09/30/2015,5.0,,07/01/2014,2016-12-31,5.0,,2015-01-01,50783,HEALTHSOUTH REHABILITATION HOSPITAL OF MODESTO,2018,5.0,,5.0,,No,,5.0,,CA,5.0,,10,2018
759,5.0,,5.0,,5.0,,09/30/2015,5.0,,07/01/2014,2016-12-31,5.0,,2015-01-01,100329,OVIEDO MEDICAL CENTER,2018,5.0,,5.0,,No,,5.0,,FL,5.0,,10,2018
2280,5.0,,5.0,,5.0,,09/30/2015,5.0,,07/01/2014,2016-12-31,5.0,,2015-01-01,360365,AVITA ONTARIO,2018,5.0,,5.0,,No,,5.0,,OH,5.0,,10,2018
2365,5.0,,5.0,,5.0,,09/30/2015,5.0,,07/01/2014,2016-12-31,5.0,,2015-01-01,370237,SAINT FRANCIS HOSPITAL VINITA,2018,5.0,,5.0,,No,,5.0,,OK,5.0,,10,2018
3297,5.0,,5.0,,5.0,,09/30/2015,5.0,,07/01/2014,2016-12-31,5.0,,2015-01-01,670113,"HUMBLE SURGICAL HOSPITAL, LLC",2018,5.0,,5.0,,No,,5.0,,TX,5.0,,10,2018


## Correct HAI file for non-duplicate rows having duplicate dates

**Problem:** Some rows for the same provider have duplicate measurement dates but different values for observed cases, predicted cases, etc. This results from each year having multiple (quarterly) files, the data within which can vary among files. Additionally, the most recent file for each year is not always the right file to use. 

**Need:** Since only one row can be used, we need to figure out which row should be used.

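**Solution:** For each provider and measurement period, keep the single row that ranks highest when rows are sorted in descending order by predicted and then observed cases (MRSA and CAUTI first, then CDI and CLABSI).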

In [10]:
##############  Label rows that have duplicate dates (per provider) ####################
##############  For each provider with rows having duplicate dates,  ###################
###########  keep the FIRST row after the descending sort below  #######################

# Flag every row whose (Facility ID, Start Date, End Date) key occurs more than once;
# keep=False marks all copies, not just the later ones.
hai_df['duplicated dates'] = hai_df.duplicated(subset=['Facility ID', 'Start Date', 'End Date'], keep=False)

# Sort everything in descending order. Within one facility/period the rows are ranked by
# MRSA and CAUTI predicted cases first, then MRSA/CAUTI observed cases, then the CDI and
# CLABSI columns. pandas places NaN last by default, so rows with values rank above empty rows.
hai_df.sort_values(by=['Facility ID', 'Start Date', 'End Date', 
                       'MRSA Predicted Cases', 'CAUTI Predicted Cases',  
                       'MRSA Observed Cases', 'CAUTI Observed Cases', 
                       'CDIFF Predicted Cases', 'CLABSI Predicted Cases',
                       'CDIFF Observed Cases', 'CLABSI Observed Cases',
                       ], inplace=True, ascending=False)

# keep='first' retains the top-ranked row of each facility/period and drops the others.
hai_df.drop_duplicates(subset=['Facility ID', 'Start Date', 'End Date'], inplace=True, keep='first')

print(hai_df.shape)
hai_df.head()

(6579, 17)


Unnamed: 0,Facility ID,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date,CLABSI Device Days,CLABSI Observed Cases,CLABSI Predicted Cases,MRSA patient days,MRSA Observed Cases,MRSA Predicted Cases,CDIFF patient days,CDIFF Observed Cases,CAUTI Urinary Catheter Days,CDIFF Predicted Cases,Total device days,duplicated dates
35263,670120,,,2016-01-01,2016-12-31,,,,,,,,,,,,False
64237,670119,,,2016-01-01,2016-12-31,,,,,,,,,,,,False
94014,670118,0.0,0.008,2016-01-01,2016-12-31,9.0,0.0,0.002,726.0,0.0,0.015,726.0,0.0,13.0,0.33,22.0,False
12693,670117,,,2016-01-01,2016-12-31,,,,,,,,,,,,False
45892,670116,0.0,0.058,2016-01-01,2016-12-31,23.0,0.0,0.013,256.0,0.0,0.005,539.0,0.0,120.0,0.116,143.0,False


## Aggregate annual HAI data into biennial data

Purpose: Match the biennial measurement periods of HAC data


In [11]:
# One entry per HAC row, in hac_df row order; the lists built below are later assigned
# to hac_df as columns, so they must end up with exactly one value per HAC row.
start_dates = hac_df['Start Date'].tolist()
end_dates = hac_df['End Date'].tolist()
prvdrs = hac_df['Facility ID'].tolist()

total_device_days = []

# Exposure (denominator) totals per hospital.
cauti_days = []
clabsi_days = []
mrsa_days = []
cdi_days = []

# Predicted-case totals per hospital.
cauti_pred = []
clabsi_pred = []
mrsa_pred = []
cdi_pred = []

# Observed-case totals per hospital.
cauti_obs = []
clabsi_obs = []
mrsa_obs = []
cdi_obs = []

for i, start in enumerate(start_dates):
    end = end_dates[i]
    prvdr = prvdrs[i]
    
    # HAI rows of this hospital whose start date OR end date matches the HAC window.
    tdf = hai_df[hai_df['Facility ID'] == prvdr]
    tdf = tdf[(tdf['Start Date'] == start) | (tdf['End Date'] == end)]
    
    # Placeholder kept for debugging hospitals that have a single matching row.
    if tdf.shape[0] == 1:
        pass
        #print('tdf.shape[0]:', 1)
        #print('hospital:', prvdr)
        #print(tdf['Start Date'].unique())
        #print(tdf['End Date'].unique())
        
    # More than two matching rows: reduce to one row per start date, then one per end date.
    if tdf.shape[0] > 2:
        print('tdf.shape[0] = ', tdf.shape[0])
        for date_ in ['Start Date', 'End Date']:
            tdf['duplicated dates'] = tdf.duplicated(subset=[date_], keep=False)
            # Ascending sort + keep='last' keeps the row that sorts last on observed cases.
            # NOTE(review): pandas sorts NaN last by default, so a row with NaN observed
            # cases would be the one kept here -- confirm that this is intended.
            tdf.sort_values(by=[
                                'CAUTI Observed Cases', 'CLABSI Observed Cases',
                                'MRSA Observed Cases', 'CDIFF Observed Cases',
                                ], inplace=True, ascending=True)

            tdf.drop_duplicates(subset=[date_], inplace=True, keep='last')

        if tdf.shape[0] > 2:
            print('Error:')
            print("tdf.shape[0] > 2:", tdf.shape[0])
            print(start)
            print(end)
            print(tdf.head())
            # NOTE(review): breaking out leaves every result list shorter than hac_df,
            # so the column assignments that follow would fail on a length mismatch.
            break
        
    # Sum the remaining rows. np.nansum ignores NaN, so a hospital with no usable
    # values gets 0.0 rather than NaN.
    total_device_days.append(np.nansum(tdf['Total device days']))
    cauti_days.append(np.nansum(tdf['CAUTI Urinary Catheter Days']))
    clabsi_days.append(np.nansum(tdf['CLABSI Device Days']))
    mrsa_days.append(np.nansum(tdf['MRSA patient days']))
    cdi_days.append(np.nansum(tdf['CDIFF patient days']))
        
    cauti_pred.append(np.nansum(tdf['CAUTI Predicted Cases']))
    clabsi_pred.append(np.nansum(tdf['CLABSI Predicted Cases']))
    mrsa_pred.append(np.nansum(tdf['MRSA Predicted Cases']))
    cdi_pred.append(np.nansum(tdf['CDIFF Predicted Cases']))

    cauti_obs.append(np.nansum(tdf['CAUTI Observed Cases']))
    clabsi_obs.append(np.nansum(tdf['CLABSI Observed Cases']))
    mrsa_obs.append(np.nansum(tdf['MRSA Observed Cases']))
    cdi_obs.append(np.nansum(tdf['CDIFF Observed Cases']))
    

## Add HAI data to the HAC dataframe and save

In [12]:
# Attach the per-hospital totals to the HAC table. The dict's insertion order fixes the
# order in which the new columns are appended.
aggregated_columns = {
    'Total device days': total_device_days,
    'CAUTI Urinary Catheter Days': cauti_days,
    'CLABSI Device Days': clabsi_days,
    'MRSA patient days': mrsa_days,
    'CDI patient days': cdi_days,
    'CAUTI Observed Cases': cauti_obs,
    'CLABSI Observed Cases': clabsi_obs,
    'MRSA Observed Cases': mrsa_obs,
    'CDI Observed Cases': cdi_obs,
    'CAUTI Predicted Cases': cauti_pred,
    'CLABSI Predicted Cases': clabsi_pred,
    'MRSA Predicted Cases': mrsa_pred,
    'CDI Predicted Cases': cdi_pred,
}
for column_name, column_values in aggregated_columns.items():
    hac_df[column_name] = column_values

# Derived SIR = observed cases / predicted cases, rounded to 4 decimals.
for measure in ['CAUTI', 'CLABSI', 'MRSA', 'CDI']:
    observed_over_predicted = hac_df[measure + ' Observed Cases'] / hac_df[measure + ' Predicted Cases']
    hac_df[measure + ' derived SIR'] = np.round(observed_over_predicted, 4)

print('hac_df.shape:', hac_df.shape)
print(len(hac_df['Facility ID'].unique()), 'hospitals in 2018 HACRP')
#hac_df.to_pickle('~/GitHub/HACRP-HAIs/data/merged_HAC_HAI/merged_HAI_HAC_2018.pkl', protocol=5)
hac_df.head()


hac_df.shape: (3300, 47)
3300 hospitals in 2018 HACRP


Unnamed: 0,CAUTI Footnote,CAUTI W Z Score,CDI Footnote,CDI W Z Score,CLABSI Footnote,CLABSI W Z Score,Domain 1 End Date,Domain 1 Footnote,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Footnote,Domain 2 Score,Start Date,Facility ID,Facility Name,Fiscal Year,MRSA Footnote,MRSA W Z Score,PSI-90 Footnote,PSI-90 W Z Score,Payment Reduction,Payment Reduction Footnote,SSI Footnote,SSI W Z Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,MRSA patient days,CDI patient days,CAUTI Observed Cases,CLABSI Observed Cases,MRSA Observed Cases,CDI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,MRSA Predicted Cases,CDI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,MRSA derived SIR,CDI derived SIR
0,,1.3847,,-1.0689,,1.628,09/30/2015,,-1.2795,07/01/2014,2016-12-31,,0.4826,2015-01-01,10001,SOUTHEAST ALABAMA MEDICAL CENTER,2018,,0.3071,,-1.2795,No,,,0.1619,AL,,0.2182,10,2018,40996.0,26729.0,14267.0,181535.0,178892.0,40.0,21.0,11.0,62.0,31.194,14.917,12.076,130.103,1.2823,1.4078,0.9109,0.4765
1,,0.4575,,-1.406,,0.779,09/30/2015,,-0.0362,07/01/2014,2016-12-31,,-0.1814,2015-01-01,10005,MARSHALL MEDICAL CENTERS,2018,,0.3862,,-0.0362,No,,,-1.1238,AL,,-0.1596,10,2018,20081.0,14408.0,5673.0,77385.0,75450.0,10.0,5.0,3.0,8.0,15.086,5.637,2.874,29.504,0.6629,0.887,1.0438,0.2711
2,,0.5594,,-1.0153,,0.0433,09/30/2015,,0.034,07/01/2014,2016-12-31,,0.0409,2015-01-01,10006,ELIZA COFFEE MEMORIAL HOSPITAL,2018,,0.7678,,0.034,No,,,-0.1508,AL,,0.0398,10,2018,44570.0,24329.0,20241.0,128518.0,122589.0,28.0,15.0,10.0,36.0,33.623,24.905,6.979,72.24,0.8328,0.6023,1.4329,0.4983
3,,-1.5254,,-1.9602,5.0,,09/30/2015,,-0.4543,07/01/2014,2016-12-31,,-1.7428,2015-01-01,10007,MIZELL MEMORIAL HOSPITAL,2018,5.0,,,-0.4543,No,,5.0,,AL,,-1.5495,10,2018,2172.0,1790.0,382.0,9273.0,8430.0,0.0,0.0,0.0,0.0,2.194,0.311,0.253,4.571,0.0,0.0,0.0,0.0
4,5.0,,5.0,,5.0,,09/30/2015,,-0.0259,07/01/2014,2016-12-31,5.0,,2015-01-01,10008,CRENSHAW COMMUNITY HOSPITAL,2018,5.0,,,-0.0259,No,,5.0,,AL,,-0.0259,10,2018,547.0,454.0,93.0,3923.0,3925.0,2.0,0.0,0.0,1.0,0.538,0.079,0.141,1.711,3.7175,0.0,0.0,0.5845


## Generate Winsorized z-scores

In [13]:
hais = ['CAUTI', 'CLABSI', 'MRSA', 'CDI']

# Footnote codes as they can appear in the '<HAI> Footnote' columns (numbers or padded strings).
footnote_18 = [18, '18', '18 ', ' 18']
footnote_4_or_5 = [5, '5', ' 5', '5 ', 4, '4', ' 4', '4 ']

df_2018 = hac_df.copy(deep=True)

for hai in hais:
    footnote = df_2018[hai + ' Footnote']

    # Hospitals without footnote 18, 5 or 4: derive Winsorized SIRs and z-scores.
    # Positions where the reported W Z Score is NaN are excluded inside the helpers.
    # .copy() makes each subset an independent frame before new columns are added to it.
    tdf2 = df_2018[~footnote.isin(footnote_18 + footnote_4_or_5)].copy()

    reported_winZ = tdf2[hai + ' W Z Score'].tolist()
    sirs = tdf2[hai + ' derived SIR'].tolist()
    tdf2[hai + ' derived Winsorized SIR'] = Winsorize_it(sirs, reported_winZ)
    tdf2[hai + ' derived W Z Score'] = ZScore_it(tdf2[hai + ' derived Winsorized SIR'], reported_winZ)

    # Assign maximum WinZ scores to hospitals with HAI footnote 18 
    maxWinZ = np.nanmax(tdf2[hai + ' derived W Z Score'])
    tdf3 = df_2018[footnote.isin(footnote_18)].copy()
    tdf3[hai + ' derived Winsorized SIR'] = [np.nan]*tdf3.shape[0]
    tdf3[hai + ' derived W Z Score'] = [maxWinZ]*tdf3.shape[0]

    # Hospitals with footnote 5 or 4 get no derived score for this HAI.
    tdf4 = df_2018[footnote.isin(footnote_4_or_5)].copy()
    tdf4[hai + ' derived Winsorized SIR'] = [np.nan]*tdf4.shape[0]
    tdf4[hai + ' derived W Z Score'] = [np.nan]*tdf4.shape[0]

    # Re-assemble the full table; note that this reorders the rows by subset.
    df_2018 = pd.concat([tdf2, tdf3, tdf4], axis=0)

del tdf2
del tdf3
del tdf4

# Side-by-side view of reported and derived scores. filter() silently skips any
# listed column that does not exist in the frame.
display_df = df_2018.copy(deep=True)
items = ['file_year', 'HAI Measures End Date', 'HAI Measures Start Date',
         'CAUTI Footnote', 'CAUTI W Z Score', 'CAUTI derived W Z Score', 'CAUTI derived Winsorized SIR',
         'CDI Footnote', 'CDI W Z Score', 'CDI derived W Z Score',
         'CLABSI Footnote', 'CLABSI W Z Score', 'CLABSI derived W Z Score',
         'MRSA Footnote', 'MRSA W Z Score', 'MRSA derived W Z Score',
         ]


display_df = display_df.filter(items=items)
display_df.head(20)

Unnamed: 0,file_year,CAUTI Footnote,CAUTI W Z Score,CAUTI derived W Z Score,CAUTI derived Winsorized SIR,CDI Footnote,CDI W Z Score,CDI derived W Z Score,CLABSI Footnote,CLABSI W Z Score,CLABSI derived W Z Score,MRSA Footnote,MRSA W Z Score,MRSA derived W Z Score
0,2018,,1.3847,1.702953,1.2823,,-1.0689,-0.94504,,1.628,1.919262,,0.3071,0.063911
1,2018,,0.4575,0.134225,0.6629,,-1.406,-1.500907,,0.779,0.637849,,0.3862,0.288942
2,2018,,0.5594,0.564524,0.8328,,-1.0153,-0.886044,,0.0433,-0.062647,,0.7678,0.947779
5,2018,,-0.3677,-0.13297,0.5574,,-0.5019,-0.479293,,0.4188,0.659993,,-0.7663,-0.72649
6,2018,,-1.5254,-1.544674,0.0,,-1.2323,-1.116888,,-1.511,-1.544588,,-1.4551,-1.478455
7,2018,,-0.1906,-0.254538,0.5094,,-0.6884,-0.588084,,-0.1275,-0.223316,,0.0294,0.142477
9,2018,,-0.7486,-0.878585,0.263,,-1.3039,-1.257614,,1.1037,0.838377,,2.2236,1.692464
12,2018,,0.2738,0.615937,0.8531,,0.8849,0.815113,,0.9921,1.010856,,0.5149,0.672798
13,2018,,0.2671,0.559458,0.8308,,0.5171,0.665186,,2.1946,2.121857,,1.9451,2.191341
14,2018,,0.1167,0.18969,0.6848,,-0.6654,-0.611088,,0.2006,0.373102,,0.6406,0.809781


In [14]:

features = ['CAUTI derived SIR', 'CAUTI W Z Score', 'CAUTI derived W Z Score',
            'CDI derived SIR', 'CDI W Z Score', 'CDI derived W Z Score',
            'CLABSI derived SIR', 'CLABSI W Z Score', 'CLABSI derived W Z Score',
            'MRSA derived SIR', 'MRSA W Z Score', 'MRSA derived W Z Score', 
            'SSI W Z Score', 'PSI-90 W Z Score', 'Total HAC Score']

# Make sure every score column is numeric; anything unparseable becomes NaN.
for f in features:
    df_2018[f] = pd.to_numeric(df_2018[f].astype(str), errors='coerce')


# The derived W Z Score of each HAI is used as-is, whether or not a reported W Z Score
# exists for the hospital. The entry-by-entry loop that used to follow here appended the
# derived value in every one of its branches, i.e. it rewrote each column with itself.
# NOTE(review): if falling back to the reported score was intended for some hospitals,
# that rule still has to be implemented here.
for hai in hais:
    print(hai)


CAUTI
CLABSI
MRSA
CDI


## Attempt to reproduce HAC scores for 2018

In [15]:
print('Results from attempting to reproduce Yes/No penalty assignments:\n')
print('Excluded from results below:')
print('1. MD hospitals')
print('2. Hospitals with payment reduction values other than Yes or No\n')

# Split the table with two complementary masks: holdout_df collects MD hospitals, hospitals
# whose Payment Reduction is neither 'Yes' nor 'No', and hospitals without a Total HAC Score;
# df_2018 keeps everything else.
holdout_df = df_2018[(df_2018['State'] == 'MD') | ~df_2018['Payment Reduction'].isin(['Yes', 'No']) | (df_2018['Total HAC Score'].isin([float("NaN"), np.nan]))]
df_2018 = df_2018[(df_2018['State'] != 'MD') & (df_2018['Payment Reduction'].isin(['Yes', 'No'])) & (~df_2018['Total HAC Score'].isin([float("NaN"), np.nan]))]

hac_scores = []
# ct1: Domain 1 present, derived Domain 2 missing, reported Domain 2 used instead.
# ct2: derived Domain 2 differs from the reported Domain 2. Neither counter is printed in this cell.
ct1 = 0
ct2 = 0


# One derived total score per row of df_2018, appended in row order.
for hosp in df_2018['Facility ID'].tolist():
    tdf = df_2018[df_2018['Facility ID'] == hosp]

    # d1 and d2 are overwritten further down; w_ls and sum_ls are never used.
    d1 = 0
    d2 = 0
        
    w_ls = []
    sum_ls = []

    # Use original scores for all HAIs to test whether can penalties be reproduced when 
    # using data from the HACRP files
    #m_ls = ['CDI W Z Score', 'CAUTI W Z Score', 'CLABSI W Z Score', 'MRSA W Z Score', 'SSI W Z Score']

    # Use original SSI scores but derived scores for CDI, CAUTI, CLABSI, and MRSA for actual results
    m_ls = ['CDI derived W Z Score', 'CAUTI derived W Z Score', 'CLABSI derived W Z Score', 'MRSA derived W Z Score', 'SSI W Z Score']
                
    # s = sum and w = count of the non-NaN measure scores for this hospital.
    s = 0
    w = 0
    for m in m_ls:
        # Abort the whole run if the hospital's rows disagree on this measure.
        v = tdf[m].tolist()
        if len(list(set(v))) > 1:
            print('len(list(set(v))) > 1')
            sys.exit()

        v = tdf[m].iloc[0]

        if np.isnan(v) == False: 
            s += v
            w += 1

    # Derived Domain 2 score = unweighted mean of the available measure scores.
    if w == 0:
        d2 = np.nan
    else:
        d2 = s/w
        
    # The PSI-90 W Z Score is used as the Domain 1 score; d2_o is the reported Domain 2 score.
    d1 = float(tdf['PSI-90 W Z Score'].iloc[0])
    d2_o = float(tdf['Domain 2 Score'].iloc[0])
    state = tdf['State'].iloc[0]

    if np.isnan(d1) == True: 
        # if no score for Domain 1, then total HAC score will be based entirely on Domain 2
        if np.isnan(d2) == True and np.isnan(d2_o) == True:
            hac_scores.append(d2)

        elif np.isnan(d2) == True and np.isnan(d2_o) == False:
            hac_scores.append(d2_o)

        elif np.isnan(d2) == False and np.isnan(d2_o) == False:
            hac_scores.append(d2)

        # Derived score exists but the reported one is NaN: the (NaN) reported value is used.
        elif np.isnan(d2) == False and np.isnan(d2_o) == True:
            hac_scores.append(d2_o)

    elif np.isnan(d1) == False:
        # if there is a score for Domain 1 ...

        # MD rows were moved to holdout_df above, so this check cannot fire for df_2018.
        if state == 'MD':
            # states in MD should not have scores for domain 1
            print('Error:')
            print('Domain 1:', d1)
            print('State:', state)
            print('Hospitals in this state should not have domain 1 scores.\n')
            
        # Domain 2:

        # If the derived score is NaN and the original score is NaN ...
        if np.isnan(d2) == True and np.isnan(d2_o) == True:
            hac_scores.append(d1)
        
        # If the derived score is a float but the original score is NaN ...
        elif np.isnan(d2) == False and np.isnan(d2_o) == True:
            hac_scores.append(d1)
            
        # If the derived score is NaN but the original score is a float ...
        elif np.isnan(d2) == True and np.isnan(d2_o) == False:
            ct1 += 1
            # Weighted total: 15% Domain 1, 85% (reported) Domain 2.
            hac_scores.append(0.15*d1 + 0.85*d2_o)

        # If the derived score is a float and the original score is a float ...
        elif np.isnan(d2) == False and np.isnan(d2_o) == False:
            if d2 != d2_o:
                ct2 += 1
                #print(d2, d2_o)
            # Weighted total: 15% Domain 1, 85% (derived) Domain 2.
            hac_scores.append(0.15*d1 + 0.85*d2)

    else:
        # NOTE(review): np.isnan(d1) is either True or False for a float, so this branch looks
        # unreachable; if it ever ran, the break would leave hac_scores shorter than df_2018.
        print(d1, ',', d2, ',', d2_o)
        break

        
df_2018['Total HAC Score (derived)'] = hac_scores
# The label says hac_df, but the count printed is the number of rows in df_2018.
print(df_2018.shape[0], 'hospitals in hac_df')


Results from attempting to reproduce Yes/No penalty assignments:

Excluded from results below:
1. MD hospitals
2. Hospitals with payment reduction values other than Yes or No

3170 hospitals in hac_df


In [16]:
# Penalty cut-off: a hospital is penalized when its Total HAC Score is above this value.
# NOTE(review): the value is hard-coded -- presumably the published FY2018 75th-percentile
# threshold; confirm the source. The sample percentile of df_2018['Total HAC Score'] that
# used to be computed first was overwritten by this literal before it was ever read, so it
# is no longer computed.
p75 = 0.3712
print('p75:', p75)

# Score of the first row of each facility, looked up once instead of filtering the whole
# frame for every hospital.
# NOTE(review): this uses the REPORTED 'Total HAC Score', not 'Total HAC Score (derived)',
# so the comparison below checks the threshold against CMS's own scores -- confirm intended.
first_score = (df_2018.drop_duplicates(subset='Facility ID', keep='first')
                      .set_index('Facility ID')['Total HAC Score'])
scores = df_2018['Facility ID'].map(first_score).tolist()

# A NaN score never compares greater than the threshold, so it yields 'No'.
pr = ['Yes' if score > p75 else 'No' for score in scores]
df_2018['Payment Reduction (derived)'] = pr

# 1 where the reported and the derived penalty agree, 0 otherwise.
o_list = df_2018['Payment Reduction'].tolist()
res_ls = [1 if reported == derived else 0 for reported, derived in zip(o_list, pr)]
same = sum(res_ls)
diff = len(res_ls) - same

df_2018['Payment Reduction Reproduced?'] = res_ls

print(same, "Penalty assignments were reproduced")
print(diff, "Penalty assignments were not reproduced")
print(str(np.round(100 * same/(same+diff),2)) + '% penalty assignments were reproduced\n')

# Reported minus derived, for the total score and for each HAI z-score.
df_2018['HAC delta'] = df_2018['Total HAC Score'] - df_2018['Total HAC Score (derived)']
for measure in ['CDI', 'MRSA', 'CAUTI', 'CLABSI']:
    df_2018[measure + ' delta'] = df_2018[measure + ' W Z Score'] - df_2018[measure + ' derived W Z Score']

df_2018.to_pickle('~/GitHub/HACRP-HAIs/data/merged_HAC_HAI/HAI_HAC_2018.pkl', protocol=5)


p75: 0.3712
3170 Penalty assignments were reproduced
0 Penalty assignments were not reproduced
100.0% penalty assignments were reproduced



In [17]:
# Add the held-out hospitals back. The merge key is every column the two frames have in
# common, taken in holdout_df's column order; columns that exist only in df_2018 are
# left empty for the held-out rows.
main_columns = list(df_2018)
shared_columns = [col for col in list(holdout_df) if col in main_columns]

print(df_2018.shape)
print(holdout_df.shape)
df_2018 = df_2018.merge(holdout_df, how='outer', on=shared_columns)
print(df_2018.shape)

# Overwrites the file written at the end of the previous cell.
df_2018.to_pickle('~/GitHub/HACRP-HAIs/data/merged_HAC_HAI/HAI_HAC_2018.pkl', protocol=5)


(3170, 63)
(130, 55)
(3300, 63)


In [18]:
# Hospitals whose reported penalty was not reproduced, ordered by how far the derived
# total score is from the reported one. filter() skips listed columns that do not exist.
not_reproduced = df_2018['Payment Reduction Reproduced?'] == 0
items = ['file_year', 'HAI Measures End Date', 'HAI Measures Start Date',
         'CAUTI Footnote', 'CAUTI W Z Score', 'CAUTI derived W Z Score',
         'CDI Footnote', 'CDI W Z Score', 'CDI derived W Z Score',
         'CLABSI Footnote', 'CLABSI W Z Score', 'CLABSI derived W Z Score',
         'MRSA Footnote', 'MRSA W Z Score', 'MRSA derived W Z Score',
         'SSI Footnote', 'SSI W Z Score', 'SSI SIR W Z Score',
         'PSI-90 Footnote', 'PSI-90 W Z Score', 'PSI-90 SIR W Z Score',
         'Total HAC Footnote', 'Total HAC Score', 'Total HAC Score (derived)',
         'Payment Reduction Footnote', 'Payment Reduction', 
         'Payment Reduction (derived)', 'Payment Reduction Reproduced?',
         ]
display_df = df_2018[not_reproduced].filter(items=items)

# Magnitude of the gap between reported and derived total score.
score_gap = display_df['Total HAC Score'] - display_df['Total HAC Score (derived)']
display_df['delta'] = (score_gap**2)**0.5

display_df = display_df.round(4)
display_df = display_df.sort_values(by=['delta'], ascending=True)

display_df.tail(10)

Unnamed: 0,file_year,CAUTI Footnote,CAUTI W Z Score,CAUTI derived W Z Score,CDI Footnote,CDI W Z Score,CDI derived W Z Score,CLABSI Footnote,CLABSI W Z Score,CLABSI derived W Z Score,MRSA Footnote,MRSA W Z Score,MRSA derived W Z Score,SSI Footnote,SSI W Z Score,PSI-90 Footnote,PSI-90 W Z Score,Total HAC Footnote,Total HAC Score,Total HAC Score (derived),Payment Reduction Footnote,Payment Reduction,Payment Reduction (derived),Payment Reduction Reproduced?,delta


In [19]:
# Share of penalized ('Yes') and non-penalized ('No') hospitals among those with a
# Yes/No payment-reduction value.
payment = df_2018['Payment Reduction']
tdf2 = df_2018[payment.isin(['Yes', 'No'])]
n_yes_or_no = tdf2.shape[0]

for answer in ['Yes', 'No']:
    tdf1 = df_2018[payment == answer]
    print(tdf1.shape[0]/n_yes_or_no)

# Row count and number of distinct facilities.
print(df_2018.shape[0])
print(len(df_2018['Facility ID'].unique()))

df_2018.to_pickle('~/GitHub/HACRP-HAIs/data/merged_HAC_HAI/P1_HAI_HAC_2018.pkl', protocol=5)


0.23024900092222564
0.7697509990777743
3300
3300


#        