In [1]:
import pandas as pd
import warnings
import sys
import numpy as np
import scipy as sc
import random
from scipy import stats
from numpy import log10, sqrt

# Root of the local project checkout; the input pickles below are read from paths
# built on this prefix.
mydir = '/Users/kenlocey/GitHub/HACRP-HAIs/'
# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')
# Remove pandas' display truncation so frames are shown with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def def_display_df_misses(df):
    """Return the rows whose payment reduction was NOT reproduced, trimmed for review.

    Rows are kept when 'Payment Reduction Reproduced?' equals 0. The result is then
    restricted to the report columns listed below; columns that are absent from
    ``df`` are silently skipped, and the surviving columns follow the order of
    that list.
    """
    id_columns = ('file_year', 'HAI Measures End Date', 'HAI Measures Start Date')
    measure_columns = (
        'CAUTI Footnote', 'CAUTI W Z Score', 'CAUTI SIR W Z Score',
        'CDI Footnote', 'CDI W Z Score', 'CDI SIR W Z Score',
        'CLABSI Footnote', 'CLABSI W Z Score', 'CLABSI SIR W Z Score',
        'MRSA Footnote', 'MRSA W Z Score', 'MRSA SIR W Z Score',
        'SSI Footnote', 'SSI W Z Score', 'SSI SIR W Z Score',
        'PSI-90 Footnote', 'PSI-90 W Z Score', 'PSI-90 SIR W Z Score',
    )
    outcome_columns = (
        'Total HAC Footnote', 'Total HAC Score', 'Total HAC Score (derived)',
        'Payment Reduction Footnote', 'Payment Reduction',
        'Payment Reduction (derived)', 'Payment Reduction Reproduced?',
    )
    report_columns = [*id_columns, *measure_columns, *outcome_columns]

    not_reproduced = df['Payment Reduction Reproduced?'] == 0
    return df[not_reproduced].filter(items=report_columns)

# Month of the HAC release to analyse, as the string stored in the 'file_month' column.
hac_mo = '11'

## Load HAC file

In [2]:
hac_df = pd.read_pickle(mydir + "data/CareCompare_data/CombinedFiles_HACRP/Facility.pkl")

# Score columns may carry a trailing '*'; strip it (literal match, not a regex) and
# coerce whatever is left to a number, turning unparsable entries into NaN.
features = ['CAUTI Score', 'CLABSI Score', 'Total HAC Score', 'Domain 1 Score', 'AHRQ PSI-90 Score', 'Domain 2 Score']
for f in features:
    as_text = hac_df[f].astype(str).str.replace('*', '', regex=False)
    hac_df[f] = pd.to_numeric(as_text, errors='coerce')

#hac_df = hac_df[hac_df['SSI Score'].isin([np.nan, float("NaN")])]

# Keep only the selected 2016 release, then drop columns that are empty for it.
in_release = (hac_df['file_year'] == '2016') & (hac_df['file_month'] == hac_mo)
hac_df = hac_df[in_release].dropna(how='all', axis=1)

print(hac_df.shape)
for release_col in ('file_year', 'file_month'):
    print(sorted(hac_df[release_col].unique()))
hac_df.head()

(3352, 24)
['2016']
['11']


Unnamed: 0,AHRQ PSI-90 Footnote,AHRQ PSI-90 Score,CAUTI Footnote,CAUTI Score,CLABSI Footnote,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Score Footnote,Domain 1 Start Date,Domain 2 End Date,Domain 2 Score,Domain 2 Score Footnote,Domain 2 Start Date,Facility ID,Facility Name,Fiscal Year,SSI Footnote,SSI Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year
0,,1.0,,10.0,,10.0,6302014,1.0,,7012012,12312014,8.0,,1012013,10001,SOUTHEAST ALABAMA MEDICAL CENTER,2016,,4.0,AL,,6.25,11,2016
1,,3.0,,3.0,,10.0,6302014,3.0,,7012012,12312014,5.3333,,1012013,10005,MARSHALL MEDICAL CENTERS,2016,,3.0,AL,,4.75,11,2016
2,,7.0,,3.0,,5.0,6302014,7.0,,7012012,12312014,6.0,,1012013,10006,ELIZA COFFEE MEMORIAL HOSPITAL,2016,,10.0,AL,,6.25,11,2016
3,,3.0,,1.0,,,6302014,3.0,,7012012,12312014,1.0,,1012013,10007,MIZELL MEMORIAL HOSPITAL,2016,,,AL,,1.5,11,2016
4,,6.0,,,,,6302014,6.0,,7012012,12312014,,,1012013,10008,CRENSHAW COMMUNITY HOSPITAL,2016,,,AL,,6.0,11,2016


## Add penalty assignments

In [3]:
# The 75th-percentile cut-off is computed without Maryland ('MD') hospitals,
# but the resulting label is assigned to every row of hac_df.
tdf = hac_df[hac_df['State'] != 'MD']
p75 = np.nanpercentile(tdf['Total HAC Score'].tolist(), 75, axis=0)

# 'Yes' only when the score is strictly above the cut-off. A missing (NaN) score
# compares False and therefore falls through to 'No'.
pr = ['Yes' if total_score > p75 else 'No'
      for total_score in hac_df['Total HAC Score'].tolist()]

hac_df['Payment Reduction'] = pr

## Format dates within the HAC file 

In [4]:
## Fix Nov 2016 date format and convert to default datetime
# The Domain 2 dates are stored as MMDDYYYY with the leading zero of the month lost
# (e.g. 1012013 for 2013-01-01). Left-pad to eight characters instead of always
# prepending '0', so values that already have a two-digit month are not corrupted.
# Missing entries are kept missing and become NaT.
for date_col in ('Domain 2 Start Date', 'Domain 2 End Date'):
    raw_dates = hac_df[date_col]
    padded = raw_dates.astype(str).str.zfill(8).where(raw_dates.notna())
    hac_df[date_col] = pd.to_datetime(padded, format='%m%d%Y')

print(hac_df['Domain 2 Start Date'].unique(), '\n')
print(hac_df['Domain 2 End Date'].unique())

# Use the same column names as the HAI files so the two sources can be compared.
hac_df = hac_df.rename(columns={'Domain 2 Start Date': 'Start Date', 'Domain 2 End Date': 'End Date'})
hac_df.head()

['2013-01-01T00:00:00.000000000'] 

['2014-12-31T00:00:00.000000000']


Unnamed: 0,AHRQ PSI-90 Footnote,AHRQ PSI-90 Score,CAUTI Footnote,CAUTI Score,CLABSI Footnote,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Score Footnote,Domain 1 Start Date,End Date,Domain 2 Score,Domain 2 Score Footnote,Start Date,Facility ID,Facility Name,Fiscal Year,SSI Footnote,SSI Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year,Payment Reduction
0,,1.0,,10.0,,10.0,6302014,1.0,,7012012,2014-12-31,8.0,,2013-01-01,10001,SOUTHEAST ALABAMA MEDICAL CENTER,2016,,4.0,AL,,6.25,11,2016,No
1,,3.0,,3.0,,10.0,6302014,3.0,,7012012,2014-12-31,5.3333,,2013-01-01,10005,MARSHALL MEDICAL CENTERS,2016,,3.0,AL,,4.75,11,2016,No
2,,7.0,,3.0,,5.0,6302014,7.0,,7012012,2014-12-31,6.0,,2013-01-01,10006,ELIZA COFFEE MEMORIAL HOSPITAL,2016,,10.0,AL,,6.25,11,2016,No
3,,3.0,,1.0,,,6302014,3.0,,7012012,2014-12-31,1.0,,2013-01-01,10007,MIZELL MEMORIAL HOSPITAL,2016,,,AL,,1.5,11,2016,No
4,,6.0,,,,,6302014,6.0,,7012012,2014-12-31,,,2013-01-01,10008,CRENSHAW COMMUNITY HOSPITAL,2016,,,AL,,6.0,11,2016,No


## Load and merge HAI files

In [5]:
##############################   CAUTI   ################################################

# Source column -> short name used in the rest of the notebook. The dict order also
# determines the column order of the loaded frame.
cauti_renames = {
    'CAUTI Urinary Catheter Days (ICUs only)': 'CAUTI Urinary Catheter Days',
    'CAUTI Observed Cases (ICUs only)': 'CAUTI Observed Cases',
    'CAUTI Predicted Cases (ICUs only)': 'CAUTI Predicted Cases',
}

cauti_df = pd.read_pickle(mydir + "1_preprocess_CareCompare_data/preprocessed_HAI_data/CAUTI_Data.pkl")
cauti_df = cauti_df.filter(items=['Facility ID', *cauti_renames,
                                  'Start Date', 'End Date', 'file_year', 'file_month'], axis=1)
cauti_df = cauti_df.rename(columns=cauti_renames)

# Anything that does not parse as a number becomes NaN.
features = list(cauti_renames.values())
for f in features:
    cauti_df[f] = pd.to_numeric(cauti_df[f].astype(str), errors='coerce')

# Parse the measurement dates. No Styler formatting is applied here: a Styler that is
# built and discarded does not change the data.
for date_col in ('Start Date', 'End Date'):
    cauti_df[date_col] = pd.to_datetime(cauti_df[date_col])

# Keep rows that share a start date OR an end date with the HAC measurement window.
# .copy() gives an independent frame that later cells can modify safely.
in_hac_window = (cauti_df['Start Date'].isin(hac_df['Start Date'].unique())
                 | cauti_df['End Date'].isin(hac_df['End Date'].unique()))
cauti_df = cauti_df[in_hac_window].copy()

cauti_df.head()

Unnamed: 0,Facility ID,CAUTI Urinary Catheter Days,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date,file_year,file_month
8,370056,4206.0,9.0,5.646,2014-01-01,2014-12-31,2015,10
19,31301,,,,2013-01-01,2013-12-31,2015,1
23,100093,5016.0,18.0,10.983,2013-01-01,2013-12-31,2015,1
50,501327,,,,2013-01-01,2013-12-31,2014,12
54,230108,344.0,0.0,0.447,2013-01-01,2013-12-31,2014,12


In [6]:
##############################   CLABSI   ###############################################

# Source column -> short name used in the rest of the notebook. The dict order also
# determines the column order of the loaded frame.
clabsi_renames = {
    'CLABSI Device Days (ICUs only)': 'CLABSI Device Days',
    'CLABSI Observed Cases (ICUs only)': 'CLABSI Observed Cases',
    'CLABSI Predicted Cases (ICUs only)': 'CLABSI Predicted Cases',
}

clabsi_df = pd.read_pickle(mydir + "1_preprocess_CareCompare_data/preprocessed_HAI_data/CLABSI_Data.pkl")
clabsi_df = clabsi_df.filter(items=['Facility ID', *clabsi_renames,
                                    'Start Date', 'End Date', 'file_year', 'file_month'], axis=1)
clabsi_df = clabsi_df.rename(columns=clabsi_renames)

# Anything that does not parse as a number becomes NaN.
features = list(clabsi_renames.values())
for f in features:
    clabsi_df[f] = pd.to_numeric(clabsi_df[f].astype(str), errors='coerce')

# Parse the measurement dates. No Styler formatting is applied here: a Styler that is
# built and discarded does not change the data.
for date_col in ('Start Date', 'End Date'):
    clabsi_df[date_col] = pd.to_datetime(clabsi_df[date_col])

# Keep rows that share a start date OR an end date with the HAC measurement window.
# .copy() gives an independent frame that later cells can modify safely.
in_hac_window = (clabsi_df['Start Date'].isin(hac_df['Start Date'].unique())
                 | clabsi_df['End Date'].isin(hac_df['End Date'].unique()))
clabsi_df = clabsi_df[in_hac_window].copy()

clabsi_df.head()

Unnamed: 0,Facility ID,CLABSI Device Days,CLABSI Observed Cases,CLABSI Predicted Cases,Start Date,End Date,file_year,file_month
13,330396,2555.0,7.0,5.957,2014-01-01,2014-12-31,2015,10
17,310039,3790.0,3.0,6.488,2013-01-01,2013-12-31,2014,12
39,400007,1055.0,0.0,1.583,2013-01-01,2013-12-31,2015,1
49,420002,2350.0,2.0,3.525,2013-01-01,2013-12-31,2015,1
54,381322,,,,2013-01-01,2013-12-31,2015,1


In [7]:
###################  Merge CAUTI and CLABSI data ####################################
# Outer join keeps hospital/period/file combinations present in only one of the two
# sources; exact duplicate rows produced by the join are then removed.

merge_keys = ['Facility ID', 'Start Date', 'End Date', 'file_year', 'file_month']
hai_df = cauti_df.merge(clabsi_df, on=merge_keys, how='outer').drop_duplicates()

########################  Conversions to numeric  #############################################

features = ['CAUTI Urinary Catheter Days', 'CLABSI Device Days',
            'CLABSI Observed Cases', 'CLABSI Predicted Cases',
            'CAUTI Observed Cases', 'CAUTI Predicted Cases',
           ]

for f in features:
    # Strip a literal '*' (not a regex) before coercing to a number.
    without_marker = hai_df[f].astype(str).str.replace('*', '', regex=False)
    hai_df[f] = pd.to_numeric(without_marker, errors='coerce')

# NaN in either device-day column makes the total NaN as well.
hai_df['Total device days'] = hai_df['CLABSI Device Days'] + hai_df['CAUTI Urinary Catheter Days']


########################  Reorder columns  #############################################

# Take the catheter-days column out and put it back two positions before the end
# (the position is computed after the column has been removed).
col_to_move = hai_df.pop('CAUTI Urinary Catheter Days')
insert_at = hai_df.shape[1] - 2
hai_df.insert(insert_at, 'CAUTI Urinary Catheter Days', col_to_move)

hai_df.head()

Unnamed: 0,Facility ID,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date,file_year,file_month,CLABSI Device Days,CLABSI Observed Cases,CAUTI Urinary Catheter Days,CLABSI Predicted Cases,Total device days
0,370056,9.0,5.646,2014-01-01,2014-12-31,2015,10,3058.0,4.0,4206.0,4.501,7264.0
1,31301,,,2013-01-01,2013-12-31,2015,1,,,,,
2,100093,18.0,10.983,2013-01-01,2013-12-31,2015,1,3198.0,3.0,5016.0,5.913,8214.0
3,501327,,,2013-01-01,2013-12-31,2014,12,,,,,
4,230108,0.0,0.447,2013-01-01,2013-12-31,2014,12,117.0,0.0,344.0,0.177,461.0


## Filter HAI data on start dates and end dates that match those in the HAC file.

In [8]:
#########  Filter on start dates and end dates that match those in the HAC file  #######

# Compare datetime columns against real timestamps. Passing plain strings to isin()
# on a datetime64 column depends on implicit string casting, which pandas has deprecated.
annual_starts = pd.to_datetime(['2013-01-01', '2014-01-01'])
annual_ends = pd.to_datetime(['2013-12-31', '2014-12-31'])

# A row is kept only when BOTH its start date and its end date are in the lists above.
hai_df = hai_df[hai_df['Start Date'].isin(annual_starts) & hai_df['End Date'].isin(annual_ends)]
print(hai_df['Start Date'].unique())
print(hai_df['End Date'].unique())


['2014-01-01T00:00:00.000000000' '2013-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000' '2013-12-31T00:00:00.000000000']


## Then, drop hospitals from HAI data that are not contained in HAC data.

In [9]:
# Hospitals that appear in the HAC data but have no row in the HAI data.
hai_ids = hai_df['Facility ID'].unique()
tdf = hac_df[~hac_df['Facility ID'].isin(hai_ids)]
missing_ids = tdf['Facility ID'].unique()
print(len(missing_ids), 'hospitals in HAC dataset but not in HAI dataset\n')

# Show which hospitals those are and which reported scores they carry.
print(sorted(missing_ids))
for score_col in ('AHRQ PSI-90 Score', 'CAUTI Score', 'CLABSI Score'):
    print(tdf[score_col].unique())

# Conversely, discard HAI rows for hospitals that the HAC data does not contain.
hac_ids = hac_df['Facility ID'].unique()
hai_df = hai_df[hai_df['Facility ID'].isin(hac_ids)]

tdf.head()

45 hospitals in HAC dataset but not in HAI dataset

['030136', '030137', '050545', '050546', '050547', '050548', '050778', '060126', '070038', '100134', '100298', '100324', '100325', '110235', '170201', '190300', '190302', '210058', '210064', '210065', '250018', '250152', '290042', '330387', '330408', '330409', '340168', '340188', '360247', '360363', '420107', '490104', '490129', '510091', '670072', '670093', '670096', '670099', '670100', '670101', '670102', '670103', '670105', '670106', '670107']
[nan  7.  9.  6.]
[nan 10.]
[nan 10.]


Unnamed: 0,AHRQ PSI-90 Footnote,AHRQ PSI-90 Score,CAUTI Footnote,CAUTI Score,CLABSI Footnote,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Score Footnote,Domain 1 Start Date,End Date,Domain 2 Score,Domain 2 Score Footnote,Start Date,Facility ID,Facility Name,Fiscal Year,SSI Footnote,SSI Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year,Payment Reduction
154,,,,,,,6302014,,,7012012,2014-12-31,,,2013-01-01,30136,DIGNITY HEALTH - ARIZONA GENERAL HOSPITAL,2016,,,AZ,,,11,2016,No
155,,,,,,,6302014,,,7012012,2014-12-31,,,2013-01-01,30137,GREEN VALLEY HOSPITAL,2016,,,AZ,,,11,2016,No
411,,7.0,,,,,6302014,7.0,,7012012,2014-12-31,10.0,18.0,2013-01-01,50545,LANTERMAN DEVELOPMENTAL CENTER,2016,18.0,10.0,CA,,9.25,11,2016,Yes
412,,7.0,18.0,10.0,18.0,10.0,6302014,7.0,,7012012,2014-12-31,10.0,18.0,2013-01-01,50546,PORTERVILLE DEVELOPMENTAL CENTER,2016,18.0,10.0,CA,,9.25,11,2016,Yes
413,,9.0,,,,,6302014,9.0,,7012012,2014-12-31,,,2013-01-01,50547,SONOMA DEVELOPMENTAL CENTER,2016,,,CA,,9.0,11,2016,Yes


## Correct HAI file for non-duplicate rows having duplicate dates

**Problem:** Some rows for the same provider have duplicate measurement dates but different values for observed cases, predicted cases, etc. This results from each year having multiple (quarterly) files, the data within which can vary among files. Additionally, the most recent file for each year is not always the right file to use. 

**Need:** Since only one row can be used, we need to figure out which row should be used.

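**Solution:** Keep a single row per measurement date for each hospital. Note that the code below ranks the candidate rows by observed CAUTI and CLABSI cases (ascending) and keeps the last row, rather than ranking by predicted cases.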

## Aggregate annual HAI data into biennial data

Purpose: Match the biennial measurement periods of HAC data


In [10]:
# For every row of hac_df, collect that hospital's HAI rows that fall in the HAC
# measurement window and sum them into one value per measure. The result lists are
# built in hac_df row order.
start_dates = hac_df['Start Date'].tolist()
end_dates = hac_df['End Date'].tolist()
prvdrs = hac_df['Facility ID'].tolist()

# One accumulator per output column; each receives one value per HAC row.
total_device_days = []

cauti_days = []
clabsi_days = []

cauti_pred = []
clabsi_pred = []

cauti_obs = []
clabsi_obs = []

for i, start in enumerate(start_dates):
    end = end_dates[i]
    prvdr = prvdrs[i]
    
    # HAI rows of this hospital that share the window's start date OR its end date.
    tdf = hai_df[hai_df['Facility ID'] == prvdr]
    tdf = tdf[(tdf['Start Date'] == start) | (tdf['End Date'] == end)]
    
    # Report hospitals for which exactly one HAI row matched.
    if tdf.shape[0] == 1:
        print('tdf.shape[0]:', 1)
        print('hospital:', prvdr)
        print(tdf['Start Date'].unique())
        print(tdf['End Date'].unique())
        
    # More than two matching rows: reduce to one row per start date, then one per end date.
    if tdf.shape[0] > 2:
        for date_ in ['Start Date', 'End Date']:
            # Flags rows whose date occurs more than once; the column is not read again in this cell.
            tdf['duplicated dates'] = tdf.duplicated(subset=[date_], keep=False)
            # Ascending sort followed by keep='last' retains, for each date, the row that
            # sorts last on observed CAUTI cases, then observed CLABSI cases.
            # NOTE(review): sort_values puts NaN last by default, so a row with missing
            # observed cases may be the one that is kept -- confirm this is intended.
            tdf.sort_values(by=[
                                'CAUTI Observed Cases', 'CLABSI Observed Cases',
                                ], inplace=True, ascending=True)

            tdf.drop_duplicates(subset=[date_], inplace=True, keep='last')

        # Still more than two rows after de-duplication: print diagnostics and stop the loop.
        # The accumulator lists then hold fewer entries than hac_df has rows.
        if tdf.shape[0] > 2:
            print('Error:')
            print("tdf.shape[0] > 2:", tdf.shape[0])
            print(start)
            print(end)
            print(tdf.head())
            break
        
    # np.nansum ignores NaN entries; an empty selection sums to 0.0.
    total_device_days.append(np.nansum(tdf['Total device days']))
    cauti_days.append(np.nansum(tdf['CAUTI Urinary Catheter Days']))
    clabsi_days.append(np.nansum(tdf['CLABSI Device Days']))
        
    cauti_pred.append(np.nansum(tdf['CAUTI Predicted Cases']))
    clabsi_pred.append(np.nansum(tdf['CLABSI Predicted Cases']))
    
    cauti_obs.append(np.nansum(tdf['CAUTI Observed Cases']))
    clabsi_obs.append(np.nansum(tdf['CLABSI Observed Cases']))
    

tdf.shape[0]: 1
hospital: 050777
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 190312
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 190313
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 230144
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 250127
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 280134
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 290002
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 310130
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 390302
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 440147
['2014-01-01T00:00:00.000000000']
['2014-12-31T00:00:00.00

## Add HAI data to the HAC dataframe and save

In [11]:
# Attach the per-hospital totals (one value per hac_df row, in row order).
# The dict order is the order in which the new columns are appended.
aggregated_columns = {
    'Total device days': total_device_days,
    'CAUTI Urinary Catheter Days': cauti_days,
    'CLABSI Device Days': clabsi_days,
    'CAUTI Observed Cases': cauti_obs,
    'CLABSI Observed Cases': clabsi_obs,
    'CAUTI Predicted Cases': cauti_pred,
    'CLABSI Predicted Cases': clabsi_pred,
}
for column_name, column_values in aggregated_columns.items():
    hac_df[column_name] = column_values

# Standardized infection ratio = observed / predicted, with predicted cases rounded
# to 4 decimals first. 0 / 0 yields NaN.
cauti_predicted = hac_df['CAUTI Predicted Cases'].round(4)
clabsi_predicted = hac_df['CLABSI Predicted Cases'].round(4)
hac_df['CAUTI derived SIR'] = hac_df['CAUTI Observed Cases'] / cauti_predicted
hac_df['CLABSI derived SIR'] = hac_df['CLABSI Observed Cases'] / clabsi_predicted

print('hac_df.shape:', hac_df.shape)
print(hac_df['CAUTI Score'].unique())

hac_df.head()


hac_df.shape: (3352, 34)
[10.  3.  1. nan  4.  5.  8.  9.  6.  2.  7.]


Unnamed: 0,AHRQ PSI-90 Footnote,AHRQ PSI-90 Score,CAUTI Footnote,CAUTI Score,CLABSI Footnote,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Score Footnote,Domain 1 Start Date,End Date,Domain 2 Score,Domain 2 Score Footnote,Start Date,Facility ID,Facility Name,Fiscal Year,SSI Footnote,SSI Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR
0,,1.0,,10.0,,10.0,6302014,1.0,,7012012,2014-12-31,8.0,,2013-01-01,10001,SOUTHEAST ALABAMA MEDICAL CENTER,2016,,4.0,AL,,6.25,11,2016,No,19927.0,14812.0,5115.0,56.0,17.0,17.783,7.675,3.149075,2.214984
1,,3.0,,3.0,,10.0,6302014,3.0,,7012012,2014-12-31,5.3333,,2013-01-01,10005,MARSHALL MEDICAL CENTERS,2016,,3.0,AL,,4.75,11,2016,No,7366.0,5640.0,1726.0,3.0,4.0,7.334,2.591,0.409054,1.543805
2,,7.0,,3.0,,5.0,6302014,7.0,,7012012,2014-12-31,6.0,,2013-01-01,10006,ELIZA COFFEE MEMORIAL HOSPITAL,2016,,10.0,AL,,6.25,11,2016,No,23537.0,12769.0,10768.0,7.0,6.0,21.293,17.884,0.328747,0.335495
3,,3.0,,1.0,,,6302014,3.0,,7012012,2014-12-31,1.0,,2013-01-01,10007,MIZELL MEMORIAL HOSPITAL,2016,,,AL,,1.5,11,2016,No,540.0,477.0,63.0,0.0,0.0,1.241,0.146,0.0,0.0
4,,6.0,,,,,6302014,6.0,,7012012,2014-12-31,,,2013-01-01,10008,CRENSHAW COMMUNITY HOSPITAL,2016,,,AL,,6.0,11,2016,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


#        

## Reproduce scores for 2016

In [12]:
# Work on the 2016 rows only, without columns that are entirely empty for them.
df_2016 = hac_df[hac_df['file_year'] == '2016'].dropna(how='all', axis=1)

# Percentiles 0, 10, ..., 90 of the derived SIRs (NaN ignored).
decile_levels = np.arange(0, 100, 10)
cauti_deciles = np.nanpercentile(df_2016['CAUTI derived SIR'], decile_levels)
clabsi_deciles = np.nanpercentile(df_2016['CLABSI derived SIR'], decile_levels)

df_2016.head()

Unnamed: 0,AHRQ PSI-90 Footnote,AHRQ PSI-90 Score,CAUTI Footnote,CAUTI Score,CLABSI Footnote,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Score Footnote,Domain 1 Start Date,End Date,Domain 2 Score,Domain 2 Score Footnote,Start Date,Facility ID,Facility Name,Fiscal Year,SSI Footnote,SSI Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR
0,,1.0,,10.0,,10.0,6302014,1.0,,7012012,2014-12-31,8.0,,2013-01-01,10001,SOUTHEAST ALABAMA MEDICAL CENTER,2016,,4.0,AL,,6.25,11,2016,No,19927.0,14812.0,5115.0,56.0,17.0,17.783,7.675,3.149075,2.214984
1,,3.0,,3.0,,10.0,6302014,3.0,,7012012,2014-12-31,5.3333,,2013-01-01,10005,MARSHALL MEDICAL CENTERS,2016,,3.0,AL,,4.75,11,2016,No,7366.0,5640.0,1726.0,3.0,4.0,7.334,2.591,0.409054,1.543805
2,,7.0,,3.0,,5.0,6302014,7.0,,7012012,2014-12-31,6.0,,2013-01-01,10006,ELIZA COFFEE MEMORIAL HOSPITAL,2016,,10.0,AL,,6.25,11,2016,No,23537.0,12769.0,10768.0,7.0,6.0,21.293,17.884,0.328747,0.335495
3,,3.0,,1.0,,,6302014,3.0,,7012012,2014-12-31,1.0,,2013-01-01,10007,MIZELL MEMORIAL HOSPITAL,2016,,,AL,,1.5,11,2016,No,540.0,477.0,63.0,0.0,0.0,1.241,0.146,0.0,0.0
4,,6.0,,,,,6302014,6.0,,7012012,2014-12-31,,,2013-01-01,10008,CRENSHAW COMMUNITY HOSPITAL,2016,,,AL,,6.0,11,2016,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [13]:

# Upper SIR bounds for scores 2..9, per HAI. A SIR of exactly 0 scores 1 and a SIR
# above the last bound scores 10.
SIR_CUTOFFS = {
    'CAUTI': (0.279, 0.469, 0.656, 0.847, 1.065, 1.298, 1.575, 2.060),
    'CLABSI': (0.101, 0.239, 0.318, 0.410, 0.496, 0.604, 0.748, 0.995),
}

# Spellings of footnote 18 that are matched in the footnote columns.
FOOTNOTE_18 = [18, '18', '18 ', ' 18']


def _sir_to_score(sir, reported_score, cutoffs):
    """Convert one derived SIR to a 1-10 score.

    Returns NaN when the SIR or the reported score is missing, so a derived score
    exists only where a reported score exists. Otherwise returns 1 for a SIR of
    exactly 0, and 2 plus the number of cut-offs strictly below the SIR for any
    other value (i.e. the first bound with ``sir <= bound`` decides the score).
    """
    sir = float(sir)
    if np.isnan(sir) or np.isnan(reported_score):
        return np.nan
    if sir == 0.0:
        return 1
    return 2 + sum(sir > bound for bound in cutoffs)


hais = ['CAUTI', 'CLABSI']
start_dates = df_2016['Start Date'].unique()

# Only the first measurement window is scored.
tdf1 = df_2016[df_2016['Start Date'] == start_dates[0]]

for hai in hais:
    score_col = hai + ' derived score'
    has_footnote_18 = tdf1[hai + ' Footnote'].isin(FOOTNOTE_18)

    # Hospitals without footnote 18: score from the derived SIR.
    # .copy() so the new column is written to an independent frame, not a slice.
    tdf2 = tdf1[~has_footnote_18].copy()
    if tdf2.shape[0] > 0:
        derived_sirs = tdf2[hai + ' derived SIR'].tolist()
        reported_scores = tdf2[hai + ' Score'].tolist()
        derived_scores = [_sir_to_score(sir, reported, SIR_CUTOFFS[hai])
                          for sir, reported in zip(derived_sirs, reported_scores)]

        tdf2[score_col] = derived_scores
        print('len:', len(derived_scores), len(reported_scores))
    else:
        # Ensure the column exists even when every hospital carries footnote 18.
        tdf2[score_col] = np.nan

    # Assign maximum scores to hospitals with HAI footnote 18
    tdf3 = tdf1[has_footnote_18].copy()
    if tdf3.shape[0] > 0:
        tdf3[score_col] = 10
        tdf1 = pd.concat([tdf2, tdf3], axis=0)
    else:
        tdf1 = tdf2

features = ['CAUTI derived SIR', 'CAUTI Score', 'CAUTI derived score',
            'CLABSI derived SIR', 'CLABSI Score', 'CLABSI derived score',
            'SSI Score', 'AHRQ PSI-90 Score']

for f in features:
    tdf1[f] = pd.to_numeric(tdf1[f], errors='coerce')

df_2016 = tdf1.copy(deep=True)
df_2016.head()

len: 3342 3342
len: 3342 3342


Unnamed: 0,AHRQ PSI-90 Footnote,AHRQ PSI-90 Score,CAUTI Footnote,CAUTI Score,CLABSI Footnote,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Score Footnote,Domain 1 Start Date,End Date,Domain 2 Score,Domain 2 Score Footnote,Start Date,Facility ID,Facility Name,Fiscal Year,SSI Footnote,SSI Score,State,Total HAC Footnote,Total HAC Score,file_month,file_year,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR,CAUTI derived score,CLABSI derived score
0,,1.0,,10.0,,10.0,6302014,1.0,,7012012,2014-12-31,8.0,,2013-01-01,10001,SOUTHEAST ALABAMA MEDICAL CENTER,2016,,4.0,AL,,6.25,11,2016,No,19927.0,14812.0,5115.0,56.0,17.0,17.783,7.675,3.149075,2.214984,10.0,10.0
1,,3.0,,3.0,,10.0,6302014,3.0,,7012012,2014-12-31,5.3333,,2013-01-01,10005,MARSHALL MEDICAL CENTERS,2016,,3.0,AL,,4.75,11,2016,No,7366.0,5640.0,1726.0,3.0,4.0,7.334,2.591,0.409054,1.543805,3.0,10.0
2,,7.0,,3.0,,5.0,6302014,7.0,,7012012,2014-12-31,6.0,,2013-01-01,10006,ELIZA COFFEE MEMORIAL HOSPITAL,2016,,10.0,AL,,6.25,11,2016,No,23537.0,12769.0,10768.0,7.0,6.0,21.293,17.884,0.328747,0.335495,3.0,5.0
3,,3.0,,1.0,,,6302014,3.0,,7012012,2014-12-31,1.0,,2013-01-01,10007,MIZELL MEMORIAL HOSPITAL,2016,,,AL,,1.5,11,2016,No,540.0,477.0,63.0,0.0,0.0,1.241,0.146,0.0,0.0,1.0,
4,,6.0,,,,,6302014,6.0,,7012012,2014-12-31,,,2013-01-01,10008,CRENSHAW COMMUNITY HOSPITAL,2016,,,AL,,6.0,11,2016,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,


In [14]:
# Split off the rows that are not scored: Maryland hospitals, rows without a
# Yes/No payment decision, and rows without a reported Total HAC Score.
# holdout_df is the exact complement of the scored rows.
is_scored = ((df_2016['State'] != 'MD')
             & df_2016['Payment Reduction'].isin(['Yes', 'No'])
             & df_2016['Total HAC Score'].notna())
holdout_df = df_2016[~is_scored].copy()
df_2016 = df_2016[is_scored].copy()

# The calculation below is row-wise, so each hospital must occupy exactly one row.
if not df_2016['Facility ID'].is_unique:
    raise ValueError('Facility ID is not unique in df_2016; cannot derive one score per hospital')

# Use derived scores for CAUTI and CLABSI together with the original SSI score for actual results
m_ls = ['CAUTI derived score', 'CLABSI derived score', 'SSI Score']

# Use original scores for all HAIs for a specific type of testing, i.e., can penalties be reproduced when using data directly from the HAC files
#m_ls = ['CAUTI Score', 'CLABSI Score', 'SSI Score']

# Domain 2 = mean of the available (non-missing) measure scores; NaN when none is available.
measure_scores = df_2016[m_ls]
n_available = measure_scores.notna().sum(axis=1)
d2 = measure_scores.sum(axis=1, skipna=True) / n_available.where(n_available > 0)

# Domain 1 = the PSI-90 score.
d1 = df_2016['AHRQ PSI-90 Score'].astype(float)

# Both domains present -> 25% / 75% weighting; only one present -> that domain alone;
# neither present -> NaN.
weighted = 0.25*d1 + 0.75*d2
df_2016['Total HAC Score (derived)'] = weighted.where(d1.notna() & d2.notna(), d1.fillna(d2))
print(df_2016.shape[0], 'hospitals in hac_df')

# Penalty threshold: 75th percentile of the derived totals that are not missing.
derived_total = df_2016['Total HAC Score (derived)']
p75 = np.percentile(derived_total.dropna(), 75, method='linear')

# Penalised only when strictly above the threshold; a missing derived score gives 'No'.
df_2016['Payment Reduction (derived)'] = np.where(derived_total > p75, 'Yes', 'No')

# 1 where the derived decision equals the original decision, else 0.
reproduced = df_2016['Payment Reduction'] == df_2016['Payment Reduction (derived)']
df_2016['Payment Reduction Reproduced?'] = reproduced.astype('int64')

same = int(reproduced.sum())
diff = int((~reproduced).sum())

print(same, "Penalty assignments were reproduced")
print(diff, "Penalty assignments were not reproduced")
if same + diff > 0:
    print(str(np.round(100 * same/(same+diff),2)) + '% penalty assignments were reproduced\n')

# Differences between reported and derived values (reported minus derived).
df_2016['HAC delta'] = df_2016['Total HAC Score'] - df_2016['Total HAC Score (derived)']
df_2016['CAUTI delta'] = df_2016['CAUTI Score'] - df_2016['CAUTI derived score']
df_2016['CLABSI delta'] = df_2016['CLABSI Score'] - df_2016['CLABSI derived score']


3201 hospitals in hac_df
3201 Penalty assignments were reproduced
0 Penalty assignments were not reproduced
100.0% penalty assignments were reproduced



In [15]:
# Re-attach the held-out rows. The join keys are all columns the two frames share,
# taken in holdout_df's column order.
scored_columns = list(df_2016.columns)
shared_columns = [col for col in holdout_df.columns if col in scored_columns]

print(df_2016.shape)
print(holdout_df.shape)
df_2016 = df_2016.merge(holdout_df, how='outer', on=shared_columns)
print(df_2016.shape)


(3201, 42)
(151, 36)
(3352, 42)


In [16]:
# Columns to show; names that do not exist in df_2016 are skipped by filter().
items = ['file_year', 'HAI Measures End Date', 'HAI Measures Start Date',
             'CAUTI Footnote', 'CAUTI Score', 'CAUTI derived score',
             'CLABSI Footnote', 'CLABSI Score', 'CLABSI derived score',
             'SSI Footnote', 'SSI Score',
             'AHRQ PSI-90 Footnote', 'AHRQ PSI-90 Score',
             'Total HAC Footnote', 'Total HAC Score', 'Total HAC Score (derived)',
             'Payment Reduction Footnote', 'Payment Reduction', 
             'Payment Reduction (derived)', 'Payment Reduction Reproduced?',
             ]

# Reproduced decisions that have both a reported and a derived total score.
was_reproduced = df_2016['Payment Reduction Reproduced?'] == 1
display_df = (
    df_2016[was_reproduced]
    .filter(items=items)
    .dropna(subset=['Total HAC Score', 'Total HAC Score (derived)'])
)

# Absolute gap between reported and derived totals, largest first.
score_gap = display_df['Total HAC Score'] - display_df['Total HAC Score (derived)']
display_df['delta'] = score_gap.abs()
display_df = display_df.round(4).sort_values(by=['delta'], ascending=False)
display_df.head()




Unnamed: 0,file_year,CAUTI Footnote,CAUTI Score,CAUTI derived score,CLABSI Footnote,CLABSI Score,CLABSI derived score,SSI Footnote,SSI Score,AHRQ PSI-90 Footnote,AHRQ PSI-90 Score,Total HAC Footnote,Total HAC Score,Total HAC Score (derived),Payment Reduction,Payment Reduction (derived),Payment Reduction Reproduced?,delta
2693,2016,,9.0,10.0,,8.0,8.0,,8.0,,9.0,,8.5,8.75,Yes,Yes,1.0,0.25
589,2016,,4.0,5.0,,5.0,5.0,,2.0,,2.0,,3.25,3.5,No,No,1.0,0.25
2153,2016,,7.0,7.0,,3.0,4.0,,4.0,,10.0,,6.0,6.25,No,No,1.0,0.25
1133,2016,,5.0,6.0,,1.0,1.0,,7.0,,2.0,,3.75,4.0,No,No,1.0,0.25
974,2016,,4.0,5.0,,5.0,5.0,,7.0,,1.0,,4.25,4.5,No,No,1.0,0.25


In [17]:
# Share of 'Yes' and of 'No' among hospitals that have a Yes/No payment decision.
tdf2 = df_2016[df_2016['Payment Reduction'].isin(['Yes', 'No'])]
n_decided = tdf2.shape[0]
for outcome in ('Yes', 'No'):
    tdf1 = df_2016[df_2016['Payment Reduction'] == outcome]
    print(tdf1.shape[0]/n_decided)

# Row count next to the number of distinct hospitals.
print(df_2016.shape[0])
print(len(df_2016['Facility ID'].unique()))

# Save the merged HAC/HAI table.
df_2016.to_pickle('~/GitHub/HACRP-HAIs/data/merged_HAC_HAI/HAI_HAC_2016.pkl', protocol=5)

0.22583532219570407
0.774164677804296
3352
3352
