In [1]:
import pandas as pd
import warnings
import sys
import numpy as np
import scipy as sc
import random
from scipy import stats
from numpy import log10, sqrt

mydir = '/Users/kenlocey/GitHub/HACRP-HAIs/'
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def def_display_df_misses(df):
    df = df[df['Payment Reduction Reproduced?'] == 0]
    items = ['file_year', 'HAI Measures End Date', 'HAI Measures Start Date',
             'CAUTI Footnote', 'CAUTI W Z Score', 'CAUTI SIR W Z Score',
             'CDI Footnote', 'CDI W Z Score', 'CDI SIR W Z Score',
             'CLABSI Footnote', 'CLABSI W Z Score', 'CLABSI SIR W Z Score',
             'MRSA Footnote', 'MRSA W Z Score', 'MRSA SIR W Z Score',
             'SSI Footnote', 'SSI W Z Score', 'SSI SIR W Z Score',
             'PSI-90 Footnote', 'PSI-90 W Z Score', 'PSI-90 SIR W Z Score',
             'Total HAC Footnote', 'Total HAC Score', 'Total HAC Score (derived)',
             'Payment Reduction Footnote', 'Payment Reduction', 
             'Payment Reduction (derived)', 'Payment Reduction Reproduced?',
             ]
    return df.filter(items=items)

hac_mo = '10'

## Load HAC file

In [2]:
hac_df = pd.read_pickle(mydir + "data/CareCompare_data/CombinedFiles_HACRP/Facility.pkl")
features = ['CAUTI Score', 'CLABSI Score', 'Total HAC Score', 'Domain 1 Score', 'AHRQ PSI-90 Score', 'Domain 2 Score']
for f in features:
    hac_df[f] = hac_df[f].astype(str)
    hac_df[f] = hac_df[f].str.replace('*', '')
    hac_df[f] = pd.to_numeric(hac_df[f], errors='coerce')

hac_df = hac_df[hac_df['SSI Score'].isin([np.nan, float("NaN")])]
hac_df = hac_df[hac_df['file_year'] == '2015']
hac_df = hac_df[hac_df['file_month'] == hac_mo]
hac_df.dropna(how='all', axis=1, inplace=True)

print(hac_df.shape)
print(sorted(hac_df['file_month'].unique()))
hac_df.head()

(3336, 16)
['10']


Unnamed: 0,AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,Domain 2 End Date,Domain 2 Score,Domain 2 Start Date,Facility ID,Facility Name,Fiscal Year,State,Total HAC Score,file_month,file_year
0,1.0,9.0,7.0,30-JUN-13,1.0,01-JUL-11,31-DEC-13,8.0,01-JAN-12,110064,MIDTOWN MEDICAL CENTER,2015,GA,5.55,10,2015
1,2.0,2.0,4.0,30-JUN-13,2.0,01-JUL-11,31-DEC-13,3.0,01-JAN-12,110069,HOUSTON MEDICAL CENTER,2015,GA,2.65,10,2015
2,4.0,1.0,,30-JUN-13,4.0,01-JUL-11,31-DEC-13,1.0,01-JAN-12,110071,APPLING HOSPITAL,2015,GA,2.05,10,2015
3,5.0,1.0,,30-JUN-13,5.0,01-JUL-11,31-DEC-13,1.0,01-JAN-12,110073,DORMINY MEDICAL CENTER,2015,GA,2.4,10,2015
4,9.0,9.0,7.0,30-JUN-13,9.0,01-JUL-11,31-DEC-13,8.0,01-JAN-12,110074,ATHENS REGIONAL MEDICAL CENTER,2015,GA,8.35,10,2015


## Add penalty assignments

In [3]:
tdf = hac_df[hac_df['State'] != 'MD']
p75 = np.nanpercentile(tdf['Total HAC Score'].tolist(), 75, axis=0)
pr = []
hacs = hac_df['Total HAC Score'].tolist()

for i in hacs:
    if i > p75:
        pr.append('Yes')
    elif i <= p75:
        pr.append('No')
    else:
        pr.append('No')

hac_df['Payment Reduction'] = pr

## Format dates within the HAC file 

In [4]:
hac_df['Domain 2 Start Date'] = pd.to_datetime(hac_df['Domain 2 Start Date'])
hac_df.style.format({'Domain 2 Start Date': lambda t: t.strftime("%Y-%m-%d")})
hac_df['Domain 2 End Date'] = pd.to_datetime(hac_df['Domain 2 End Date'])
hac_df.style.format({'Domain 2 End Date': lambda t: t.strftime("%Y-%m-%d")})

hac_df['Domain 1 Start Date'] = pd.to_datetime(hac_df['Domain 1 Start Date'])
hac_df.style.format({'Domain 2 Start Date': lambda t: t.strftime("%Y-%m-%d")})
hac_df['Domain 1 End Date'] = pd.to_datetime(hac_df['Domain 1 End Date'])
hac_df.style.format({'Domain 2 End Date': lambda t: t.strftime("%Y-%m-%d")})
hac_df.sort_values(by='Domain 2 End Date', inplace=True)

print(hac_df['Domain 2 Start Date'].unique(), '\n')
print(hac_df['Domain 2 End Date'].unique())

hac_df.rename(columns={'Domain 2 Start Date': 'Start Date', 
                       'Domain 2 End Date': 'End Date'}, 
              inplace=True)

['2012-01-01T00:00:00.000000000'] 

['2013-12-31T00:00:00.000000000']


## Load and merge HAI files

In [5]:
##############################   CAUTI   ################################################

cauti_df = pd.read_pickle(mydir + "1_preprocess_CareCompare_data/preprocessed_HAI_data/CAUTI_Data.pkl")
cauti_df = cauti_df.filter(items=['Facility ID', 'CAUTI Urinary Catheter Days (ICUs only)', 
                                  'CAUTI Observed Cases (ICUs only)', 'CAUTI Predicted Cases (ICUs only)', 
                                  'Start Date', 'End Date', 'file_year', 'file_month'], axis=1)

cauti_df.rename(columns={'CAUTI Urinary Catheter Days (ICUs only)': 'CAUTI Urinary Catheter Days', 
                         'CAUTI Observed Cases (ICUs only)': 'CAUTI Observed Cases', 
                         'CAUTI Predicted Cases (ICUs only)': 'CAUTI Predicted Cases', 
                        }, inplace=True)

features = ['CAUTI Urinary Catheter Days', 'CAUTI Observed Cases', 'CAUTI Predicted Cases']
for f in features:
    cauti_df[f] = cauti_df[f].astype(str)
    cauti_df[f] = pd.to_numeric(cauti_df[f], errors='coerce')

cauti_df['Start Date'] = pd.to_datetime(cauti_df['Start Date'])
cauti_df.style.format({'Start Date': lambda t: t.strftime("%Y-%m-%d")})
cauti_df['End Date'] = pd.to_datetime(cauti_df['End Date'])
cauti_df.style.format({'End Date': lambda t: t.strftime("%Y-%m-%d")})
cauti_df = cauti_df[(cauti_df['Start Date'].isin(hac_df['Start Date'].unique())) | (cauti_df['End Date'].isin(hac_df['End Date'].unique()))]

cauti_df.head()

Unnamed: 0,Facility ID,CAUTI Urinary Catheter Days,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date,file_year,file_month
12,400105,608.0,1.0,1.216,2012-01-01,2012-09-30,2013,7
19,31301,,,,2013-01-01,2013-12-31,2015,1
23,100093,5016.0,18.0,10.983,2013-01-01,2013-12-31,2015,1
50,501327,,,,2013-01-01,2013-12-31,2014,12
54,230108,344.0,0.0,0.447,2013-01-01,2013-12-31,2014,12


In [6]:
##############################   CLABSI   ###############################################

clabsi_df = pd.read_pickle(mydir + "1_preprocess_CareCompare_data/preprocessed_HAI_data/CLABSI_Data.pkl")
clabsi_df = clabsi_df.filter(items=['Facility ID', 'CLABSI Device Days (ICUs only)', 
                                    'CLABSI Observed Cases (ICUs only)', 'CLABSI Predicted Cases (ICUs only)', 
                                    'Start Date', 'End Date', 'file_year', 'file_month'], axis=1)

clabsi_df.rename(columns={'CLABSI Device Days (ICUs only)': 'CLABSI Device Days', 
                         'CLABSI Observed Cases (ICUs only)': 'CLABSI Observed Cases', 
                         'CLABSI Predicted Cases (ICUs only)': 'CLABSI Predicted Cases', 
                        }, inplace=True)


features = ['CLABSI Device Days', 'CLABSI Observed Cases', 'CLABSI Predicted Cases']
for f in features:
    clabsi_df[f] = clabsi_df[f].astype(str)
    clabsi_df[f] = pd.to_numeric(clabsi_df[f], errors='coerce')

clabsi_df['Start Date'] = pd.to_datetime(clabsi_df['Start Date'])
clabsi_df.style.format({'Start Date': lambda t: t.strftime("%Y-%m-%d")})
clabsi_df['End Date'] = pd.to_datetime(clabsi_df['End Date'])
clabsi_df.style.format({'End Date': lambda t: t.strftime("%Y-%m-%d")})
clabsi_df = clabsi_df[(clabsi_df['Start Date'].isin(hac_df['Start Date'].unique())) | (clabsi_df['End Date'].isin(hac_df['End Date'].unique()))]

clabsi_df.head()

Unnamed: 0,Facility ID,CLABSI Device Days,CLABSI Observed Cases,CLABSI Predicted Cases,Start Date,End Date,file_year,file_month
17,310039,3790.0,3.0,6.488,2013-01-01,2013-12-31,2014,12
20,490069,4285.0,3.0,6.488,2012-01-01,2012-12-31,2013,10
39,400007,1055.0,0.0,1.583,2013-01-01,2013-12-31,2015,1
49,420002,2350.0,2.0,3.525,2013-01-01,2013-12-31,2015,1
54,381322,,,,2013-01-01,2013-12-31,2015,1


In [7]:

###################  Merge CAUTI and CLABSI data ####################################

hai_df = cauti_df.merge(clabsi_df, on=['Facility ID', 'Start Date', 'End Date', 'file_year', 'file_month'], how='outer')

####################  Drop duplicate rows resulting from merger #############################

hai_df.drop_duplicates(inplace=True)

########################  Conversions to numeric  #############################################

features = ['CAUTI Urinary Catheter Days', 'CLABSI Device Days',
            'CLABSI Observed Cases', 'CLABSI Predicted Cases',
            'CAUTI Observed Cases', 'CAUTI Predicted Cases', 
           ]

for f in features:
    hai_df[f] = hai_df[f].astype(str)
    hai_df[f] = hai_df[f].str.replace('*', '')
    hai_df[f] = pd.to_numeric(hai_df[f], errors='coerce')
    
hai_df['Total device days'] = hai_df['CLABSI Device Days'] + hai_df['CAUTI Urinary Catheter Days']


########################  Reorder columns  #############################################

col_to_move = hai_df.pop('CAUTI Urinary Catheter Days')
hai_df.insert(hai_df.shape[1] - 2, 'CAUTI Urinary Catheter Days', col_to_move)

hai_df.head()

Unnamed: 0,Facility ID,CAUTI Observed Cases,CAUTI Predicted Cases,Start Date,End Date,file_year,file_month,CLABSI Device Days,CLABSI Observed Cases,CAUTI Urinary Catheter Days,CLABSI Predicted Cases,Total device days
0,400105,1.0,1.216,2012-01-01,2012-09-30,2013,7,,,608.0,,
1,31301,,,2013-01-01,2013-12-31,2015,1,,,,,
2,100093,18.0,10.983,2013-01-01,2013-12-31,2015,1,3198.0,3.0,5016.0,5.913,8214.0
3,501327,,,2013-01-01,2013-12-31,2014,12,,,,,
4,230108,0.0,0.447,2013-01-01,2013-12-31,2014,12,117.0,0.0,344.0,0.177,461.0


## Filter HAI data on start dates and end dates that match those in the HAC file.

In [8]:
#########  Filter on start dates and end dates that match those in the HAC file  #######

hai_df = hai_df[(hai_df['Start Date'].isin(['2012-01-01', '2013-01-01'])) & (hai_df['End Date'].isin(['2012-12-31', '2013-12-31']))]
print(hai_df['Start Date'].unique())
print(hai_df['End Date'].unique())


['2013-01-01T00:00:00.000000000' '2012-01-01T00:00:00.000000000']
['2013-12-31T00:00:00.000000000' '2012-12-31T00:00:00.000000000']


## Then, drop hospitals from HAI data that are not contained in HAC data.

In [9]:
# hospitals in the hac data but not in the hai data ...
tdf = hac_df[~hac_df['Facility ID'].isin(hai_df['Facility ID'].unique())]
print(len(tdf['Facility ID'].unique()), 'hospitals in HAC dataset but not in HAI dataset\n')
    
print(sorted(tdf['Facility ID'].unique()))
print(tdf['AHRQ PSI-90 Score'].unique())
print(tdf['CAUTI Score'].unique())
print(tdf['CLABSI Score'].unique())

# drop hospitals in HAI data that are not in HAC data
hai_df = hai_df[hai_df['Facility ID'].isin(hac_df['Facility ID'].unique())]

tdf.head()

31 hospitals in HAC dataset but not in HAI dataset

['050545', '050546', '050548', '070038', '100134', '100298', '170180', '170201', '190300', '190302', '210058', '210064', '230144', '250018', '250127', '250152', '280119', '280134', '290042', '310130', '330387', '330408', '330409', '340168', '360247', '390302', '490104', '490129', '490135', '490144', '670091']
[nan  7.  2.]
[nan 10.]
[nan 10.]


Unnamed: 0,AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility ID,Facility Name,Fiscal Year,State,Total HAC Score,file_month,file_year,Payment Reduction
2131,,,,2013-06-30,,2011-07-01,2013-12-31,,2012-01-01,490104,HIRAM W DAVIS MEDICAL CENTER,2015,VA,,10,2015,No
2468,,,,2013-06-30,,2011-07-01,2013-12-31,,2012-01-01,170201,"BLUE VALLEY HOSPITAL, INC",2015,KS,,10,2015,No
2397,,,,2013-06-30,,2011-07-01,2013-12-31,,2012-01-01,190302,"OMEGA HOSPITAL, LLC",2015,LA,,10,2015,No
2337,,,,2013-06-30,,2011-07-01,2013-12-31,,2012-01-01,490129,CAPITAL HOSPICE,2015,VA,,10,2015,No
2378,7.0,,,2013-06-30,7.0,2011-07-01,2013-12-31,,2012-01-01,190300,ST CHARLES SURGICAL HOSPITAL LLC,2015,LA,7.0,10,2015,No


## Correct HAI file for non-duplicate rows having duplicate dates

**Problem:** Some rows for the same provider have duplicate measurement dates but different values for observed cases, predicted cases, etc. This results from each year having multiple (quarterly) files, the data within which can vary among files. Additionally, the most recent file for each year is not always the right file to use. 

**Need:** Since only one row can be used, we need to figure out which row should be used.

**Solution:** Select the row with the greatest totals for predicted cases for each HAI.

## Aggregate annual HAI data into biennial data

Purpose: Match the biennial measurement periods of HAC data


In [10]:
start_dates = hac_df['Start Date'].tolist()
end_dates = hac_df['End Date'].tolist()
prvdrs = hac_df['Facility ID'].tolist()

total_device_days = []

cauti_days = []
clabsi_days = []

cauti_pred = []
clabsi_pred = []

cauti_obs = []
clabsi_obs = []

for i, start in enumerate(start_dates):
    end = end_dates[i]
    prvdr = prvdrs[i]
    
    tdf = hai_df[hai_df['Facility ID'] == prvdr]
    tdf = tdf[(tdf['Start Date'] == start) | (tdf['End Date'] == end)]
    
    if tdf.shape[0] == 1:
        print('tdf.shape[0]:', 1)
        print('hospital:', prvdr)
        print(tdf['Start Date'].unique())
        print(tdf['End Date'].unique())
        
    if tdf.shape[0] > 2:
        for date_ in ['Start Date', 'End Date']:
            tdf['duplicated dates'] = tdf.duplicated(subset=[date_], keep=False)
            tdf.sort_values(by=[
                                'CAUTI Observed Cases', 'CLABSI Observed Cases',
                                ], inplace=True, ascending=True)

            tdf.drop_duplicates(subset=[date_], inplace=True, keep='last')

        if tdf.shape[0] > 2:
            print('Error:')
            print("tdf.shape[0] > 2:", tdf.shape[0])
            print(start)
            print(end)
            print(tdf.head())
            break
        
    total_device_days.append(np.nansum(tdf['Total device days']))
    cauti_days.append(np.nansum(tdf['CAUTI Urinary Catheter Days']))
    clabsi_days.append(np.nansum(tdf['CLABSI Device Days']))
        
    cauti_pred.append(np.nansum(tdf['CAUTI Predicted Cases']))
    clabsi_pred.append(np.nansum(tdf['CLABSI Predicted Cases']))
    
    cauti_obs.append(np.nansum(tdf['CAUTI Observed Cases']))
    clabsi_obs.append(np.nansum(tdf['CLABSI Observed Cases']))
    

tdf.shape[0]: 1
hospital: 670021
['2012-01-01T00:00:00.000000000']
['2012-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 290002
['2012-01-01T00:00:00.000000000']
['2012-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 670072
['2012-01-01T00:00:00.000000000']
['2012-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 440147
['2012-01-01T00:00:00.000000000']
['2012-12-31T00:00:00.000000000']
tdf.shape[0]: 1
hospital: 050547
['2012-01-01T00:00:00.000000000']
['2012-12-31T00:00:00.000000000']


## Add HAI data to the HAC dataframe and save

In [11]:
hac_df['Total device days'] = total_device_days
hac_df['CAUTI Urinary Catheter Days'] = cauti_days
hac_df['CLABSI Device Days'] = clabsi_days

hac_df['CAUTI Observed Cases'] = cauti_obs
hac_df['CLABSI Observed Cases'] = clabsi_obs

hac_df['CAUTI Predicted Cases'] = cauti_pred
hac_df['CLABSI Predicted Cases'] = clabsi_pred

hac_df['CAUTI derived SIR'] = hac_df['CAUTI Observed Cases'] / np.round(hac_df['CAUTI Predicted Cases'], 4)
hac_df['CLABSI derived SIR'] = hac_df['CLABSI Observed Cases'] / np.round(hac_df['CLABSI Predicted Cases'], 4)

print('hac_df.shape:', hac_df.shape)
print(hac_df['CAUTI Score'].unique())

hac_df.head()


hac_df.shape: (3336, 26)
[ 9. nan  4.  1.  6.  5.  3.  2.  8.  7. 10.]


Unnamed: 0,AHRQ PSI-90 Score,CAUTI Score,CLABSI Score,Domain 1 End Date,Domain 1 Score,Domain 1 Start Date,End Date,Domain 2 Score,Start Date,Facility ID,Facility Name,Fiscal Year,State,Total HAC Score,file_month,file_year,Payment Reduction,Total device days,CAUTI Urinary Catheter Days,CLABSI Device Days,CAUTI Observed Cases,CLABSI Observed Cases,CAUTI Predicted Cases,CLABSI Predicted Cases,CAUTI derived SIR,CLABSI derived SIR
0,1.0,9.0,7.0,2013-06-30,1.0,2011-07-01,2013-12-31,8.0,2012-01-01,110064,MIDTOWN MEDICAL CENTER,2015,GA,5.55,10,2015,No,27213.0,13259.0,13954.0,40.0,19.0,22.953,30.015,1.742692,0.633017
2217,5.0,,,2013-06-30,5.0,2011-07-01,2013-12-31,,2012-01-01,390323,ADVANCED SURGICAL HOSPITAL,2015,PA,5.0,10,2015,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2218,8.0,4.0,7.0,2013-06-30,8.0,2011-07-01,2013-12-31,5.5,2012-01-01,490141,SPOTSYLVANIA REGIONAL MEDICAL CENTER,2015,VA,6.375,10,2015,No,2751.0,1600.0,1151.0,1.0,1.0,2.081,1.728,0.480538,0.578704
2219,4.0,,,2013-06-30,4.0,2011-07-01,2013-12-31,,2012-01-01,670066,"BASIN HEALTHCARE CENTER, LLC",2015,TX,4.0,10,2015,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2220,2.0,,,2013-06-30,2.0,2011-07-01,2013-12-31,,2012-01-01,670067,BAYLOR ORTHOPEDIC AND SPINE HOSPITAL AT ARLINGTON,2015,TX,2.0,10,2015,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


#        

## Reproduce scores for 2015

In [12]:
df_2015 = hac_df[hac_df['file_year'] == '2015']

hais = ['CAUTI', 'CLABSI']
cauti_deciles = np.nanpercentile(df_2015['CAUTI derived SIR'], np.arange(0, 100, 10))
clabsi_deciles = np.nanpercentile(df_2015['CLABSI derived SIR'], np.arange(0, 100, 10))

print(df_2015.shape)

(3336, 26)


In [13]:
for hai in hais:
    derived_sirs = df_2015[hai + ' derived SIR'].tolist()
    reported_scores = df_2015[hai + ' Score'].tolist()
    derived_scores = []

    for ii, sir in enumerate(derived_sirs):
        sir = float(sir)
        if np.isnan(sir) == True or np.isnan(reported_scores[ii]) == True:
            derived_scores.append(np.nan)
        elif hai == 'CLABSI':
            score = np.nan
            if sir == 0.0:
                score = 1
            elif sir <= 0.138:  #clabsi_deciles[1]:
                score = 2
            elif sir <= 0.266:  #clabsi_deciles[2]:
                score = 3
            elif sir <= 0.370:  #clabsi_deciles[3]:
                score = 4
            elif sir <= 0.456:  #clabsi_deciles[4]:
                score = 5
            elif sir <= 0.549:  #clabsi_deciles[5]:
                score = 6
            elif sir <= 0.677:  #clabsi_deciles[6]:
                score = 7
            elif sir <= 0.856:  #clabsi_deciles[7]:
                score = 8
            elif sir <= 1.138:  #clabsi_deciles[8]:
                score = 9
            elif sir > 1.138:  #clabsi_deciles[9]:
                score = 10
            else:
                score = np.nan
                            
            derived_scores.append(score)
                
        elif hai == 'CAUTI':
            score = np.nan
            if sir == 0.0:
                score = 1
            elif sir <= 0.251:  #cauti_deciles[1]:
                score = 2
            elif sir <= 0.444:  #cauti_deciles[2]:
                score = 3
            elif sir <= 0.618:  #cauti_deciles[3]:
                score = 4
            elif sir <= 0.810:  #cauti_deciles[4]:
                score = 5
            elif sir <= 0.999:  #cauti_deciles[5]:
                score = 6
            elif sir <= 1.243:  #cauti_deciles[6]:
                score = 7
            elif sir <= 1.564:  #cauti_deciles[7]:
                score = 8
            elif sir <= 2.013:  #cauti_deciles[8]:
                score = 9
            elif sir > 2.013: #cauti_deciles[9]:
                score = 10
            else:
                score = np.nan
                            
            derived_scores.append(score)
            
    df_2015[hai + ' derived score'] = derived_scores
    print('len:', len(derived_scores), len(reported_scores))
    
features = ['CAUTI derived SIR', 'CAUTI Score', 'CAUTI derived score',
            'CLABSI derived SIR', 'CLABSI Score', 'CLABSI derived score',
            'AHRQ PSI-90 Score']

for f in features:
    df_2015[f] = pd.to_numeric(df_2015[f], errors='coerce')


len: 3336 3336
len: 3336 3336


In [14]:
holdout_df = df_2015[(df_2015['State'] == 'MD') | ~df_2015['Payment Reduction'].isin(['Yes', 'No']) | (df_2015['Total HAC Score'].isin([float("NaN"), np.nan]))]
df_2015 = df_2015[(df_2015['State'] != 'MD') & (df_2015['Payment Reduction'].isin(['Yes', 'No'])) & (~df_2015['Total HAC Score'].isin([float("NaN"), np.nan]))]

hac_scores = []
ct1 = 0
ct2 = 0

for hosp in df_2015['Facility ID'].tolist():
    tdf = df_2015[df_2015['Facility ID'] == hosp]

    d1 = 0
    d2 = 0
    d2_o = 0
    
    w_ls = []
    sum_ls = []

    # Use derived scores for CAUTI and CLABSI
    m_ls = ['CAUTI derived score', 'CLABSI derived score']
                
    s = 0
    w = 0
    for m in m_ls:
        v = tdf[m].tolist()
        if len(list(set(v))) > 1:
            print('len(list(set(v))) > 1')
            print(len(list(set(v))))
            print(list(set(v)))
            sys.exit()

        v = tdf[m].iloc[0]

        if np.isnan(v) == False: 
            s += v
            w += 1

    if w == 0:
        d2 = np.nan
    else:
        d2 = s/w
        
    d1 = float(tdf['AHRQ PSI-90 Score'].iloc[0])
    d2_o = float(tdf['Domain 2 Score'].iloc[0])
    state = tdf['State'].iloc[0]
    
    if np.isnan(d1) == True: 
        # if no score for Domain 1, then total HAC score will be based entirely on Domain 2
        if np.isnan(d2) == True and np.isnan(d2_o) == True:
            hac_scores.append(d2_o)
        
        elif np.isnan(d2) == True and np.isnan(d2_o) == False:
            hac_scores.append(d2_o)
        
        elif np.isnan(d2) == False and np.isnan(d2_o) == False:
            hac_scores.append(d2_o)
        
        elif np.isnan(d2) == False and np.isnan(d2_o) == True:
            hac_scores.append(d2_o)
        
    elif np.isnan(d1) == False:
        # if there is a score for Domain 1 ...
        
        if state == 'MD':
            # states in MD should not have scores for domain 1
            print('Error:')
            print('Domain 1:', d1)
            print('State:', state)
            print('Hospitals in this state should not have domain 1 scores.\n')
            
        # Domain 2:
        
        # If the derived score is NaN and the original score is NaN ...
        if np.isnan(d2) == True and np.isnan(d2_o) == True:
            hac_scores.append(d1)
        
        # If the derived score is NaN but the original score is a float ...
        elif np.isnan(d2) == True and np.isnan(d2_o) == False:
            ct1 += 1
            hac_scores.append(0.35*d1 + 0.65*d2_o)
        
        # If the derived score is a float and the original score is a float ...
        elif np.isnan(d2) == False and np.isnan(d2_o) == False:
            if d2 != d2_o:
                ct2 += 1
                #print(d2, d2_o)
            hac_scores.append(0.35*d1 + 0.65*d2)
        
        # If the derived score is a float but the original score is NaN ...
        elif np.isnan(d2) == False and np.isnan(d2_o) == True:
            hac_scores.append(d1)

    else:
        print(d1, ',', d2, ',', d2_o)
        break
        
print('ct1:', ct1)
print('ct2:', ct2)
    

ct1: 19
ct2: 26


In [15]:
df_2015['Total HAC Score (derived)'] = hac_scores
print(df_2015.shape[0], 'hospitals in hac_df')

tdf = df_2015[~df_2015['Total HAC Score (derived)'].isin([np.nan, float('NaN')])]
p75 = np.percentile(tdf['Total HAC Score (derived)'], 75, method='linear')

pr = []
for hosp in df_2015['Facility ID'].tolist():
    tdf = df_2015[df_2015['Facility ID'] == hosp]

    p = tdf['Payment Reduction'].iloc[0]
    if p != 'Yes' and p != 'No' and np.isnan(p) == True:
        pr.append(np.nan)

    else:
        score = tdf['Total HAC Score (derived)'].iloc[0]

        if np.isnan(score) == True:
            pr.append('No')
        elif score <= p75:
            pr.append('No')
        elif score > p75:
            pr.append('Yes')
        else:
            print('This score is an error:', score)
            sys.exit()

df_2015['Payment Reduction (derived)'] = pr
            
o_list = df_2015['Payment Reduction'].tolist()
d_list = df_2015['Payment Reduction (derived)'].tolist()

same = 0
diff = 0
res_ls = []
for i, o in enumerate(o_list):
    if o == d_list[i]:
        same += 1
        res_ls.append(1)
    else:
        diff += 1
        res_ls.append(0)
            
df_2015['Payment Reduction Reproduced?'] = res_ls
    
print(same, "Penalty assignments were reproduced")
print(diff, "Penalty assignments were not reproduced")
print(str(np.round(100 * same/(same+diff),2)) + '% penalty assignments were reproduced\n')
    

df_2015['HAC delta'] = df_2015['Total HAC Score'] - df_2015['Total HAC Score (derived)']
df_2015['CAUTI delta'] = df_2015['CAUTI Score'] - df_2015['CAUTI derived score']
df_2015['CLABSI delta'] = df_2015['CLABSI Score'] - df_2015['CLABSI derived score']


3241 hospitals in hac_df
3240 Penalty assignments were reproduced
1 Penalty assignments were not reproduced
99.97% penalty assignments were reproduced



In [16]:
ls1 = list(df_2015)
ls2 = list(holdout_df)
ls = list(filter(lambda x:x in ls1, ls2))
print(df_2015.shape)
print(holdout_df.shape)
df_2015 = df_2015.merge(holdout_df, how='outer', on=ls)
print(df_2015.shape)

df_2015.to_pickle('~/GitHub/HACRP-HAIs/data/merged_HAC_HAI/HAI_HAC_2015.pkl', protocol=5)

(3241, 34)
(95, 28)
(3336, 34)


In [17]:
display_df = df_2015[df_2015['Payment Reduction Reproduced?'] == 0]
items = ['file_year', 'Facility ID', 'HAI Measures End Date', 'HAI Measures Start Date',
             'CAUTI Footnote', 'CAUTI Score', 'CAUTI derived score',
             'CLABSI Footnote', 'CLABSI Score', 'CLABSI derived score',
             'AHRQ PSI-90 Footnote', 'AHRQ PSI-90 Score',
             'Total HAC Footnote', 'Total HAC Score', 'Total HAC Score (derived)',
             'Payment Reduction Footnote', 'Payment Reduction', 
             'Payment Reduction (derived)', 'Payment Reduction Reproduced?',
             ]

display_df = display_df.filter(items=items)
display_df['delta'] = ((display_df['Total HAC Score'] - display_df['Total HAC Score (derived)'])**2)**0.5
display_df = display_df.round(4)
display_df.head()


Unnamed: 0,file_year,Facility ID,CAUTI Score,CAUTI derived score,CLABSI Score,CLABSI derived score,AHRQ PSI-90 Score,Total HAC Score,Total HAC Score (derived),Payment Reduction,Payment Reduction (derived),Payment Reduction Reproduced?,delta
2525,2015,670041,9.0,9.0,3.0,4.0,8.0,6.7,7.025,No,Yes,0.0,0.325


In [21]:
tdf = df_2015[(~df_2015['Total HAC Score'].isin([float("NaN"), np.nan]))]
tdf = tdf[tdf['State'] != 'MD']

tdf1 = tdf[tdf['Payment Reduction'] == 'Yes']
tdf2 = tdf[tdf['Payment Reduction'].isin(['Yes', 'No'])]
print(tdf1.shape[0]/tdf2.shape[0])

tdf1 = tdf[tdf['Payment Reduction'] == 'No']
print(tdf1.shape[0]/tdf2.shape[0])

print(tdf.shape[0])
print(len(tdf['Facility ID'].unique()))


0.2203023758099352
0.7796976241900648
3241
3241
