In [1]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
from functools import reduce

# Silence every Python warning for the rest of the session. Note that this also
# hides pandas deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')
# Lift the display limits so DataFrames are shown with all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Base directories. Later cells build file paths by string concatenation
# (e.g. hos_dir + 'HAI/...'), so the trailing slash is required.
stars_dir = '~/GitHub/stars-data-builder/'
hos_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

In [2]:
def curate(df):
    """Return a cleaned copy of a measure table.

    Cleaning steps, each applied only where it is applicable:

    1. If a ``PROVIDER_ID`` column exists: drop rows whose ID is missing,
       cast the IDs to ``str`` and prepend a single ``'0'`` to every ID that
       is shorter than six characters.
    2. Remove tab characters from every column that supports the pandas
       ``.str`` accessor; other columns are left untouched.
    3. Drop the ``'Unnamed: 0'`` column if it is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned table (original row index labels are kept).
    """
    # Work on a copy so the function never mutates the frame it was handed.
    df = df.copy()

    if 'PROVIDER_ID' in df.columns:
        # `value != np.nan` is True for every value (NaN compares unequal to
        # everything, including itself), so that test never removed anything
        # and missing IDs ended up as the string '0nan'. `notna()` is the
        # correct missing-value test.
        df = df[df['PROVIDER_ID'].notna()].copy()

        ids = df['PROVIDER_ID'].astype(str)
        # Keep IDs that already have >= 6 characters, otherwise add one '0'.
        df['PROVIDER_ID'] = ids.where(ids.str.len() >= 6, '0' + ids)

    for c in list(df):
        try:
            df[c] = df[c].str.replace("\t", "")
        except AttributeError:
            # Raised by pandas when the column does not hold text; there is
            # nothing to strip in that case.
            pass

    if 'Unnamed: 0' in list(df):
        df = df.drop(labels=['Unnamed: 0'], axis=1)
    return df

## MEASURES USED FOR APRIL 2021 File: From PDFs (SAS & CMS)

### Mortality

MORT-30-AMI  
MORT-30-CABG  
MORT-30-COPD  
MORT-30-HF  
MORT-30-PN  
MORT-30-STK  
PSI-4-SURG-COMP  

### Safety of Care

HAI-1  
HAI-2  
HAI-3  
HAI-4  
HAI-5  
HAI-6  
COMP-HIP-KNEE  
PSI-90-Safety  

### Readmission

READM-30-CABG  
READM-30-COPD  
READM-30-Hip-Knee  
READM-30-HOSP-WIDE  
EDAC-30-AMI  
EDAC-30-HF  
EDAC-30-PN  
OP-32  
OP-35 ADM  
OP-35 ED  
OP-36  

### Patient Experience

H-COMP-1  
H-COMP-2  
H-COMP-3  
H-COMP-5  
H-COMP-6  
H-COMP-7  
H-CLEAN-HSP / H-QUIET-HSP  
H-HSP-RATING / H-RECMND  

### Timely and Effective Care

ED-2b  
IMM-3  
OP-10  
OP-13  
OP-18b  
OP-2  
OP-22  
OP-23  
OP-29  
OP-30  
OP-33  
OP-3b  
OP-8  
PC-01  
SEP-1  


In [3]:
# Read the publicly released SAS input file and clean it with `curate`.
sas_path = stars_dir + 'Reproduce_Stars_Input/2021/Input_file/all_data_2021apr.sas7bdat'
sas_input_df = curate(pd.read_sas(sas_path, format='sas7bdat', encoding="utf8"))

# `sas_cols` is the reference column list; `remaining_ls` is a working copy that
# the following cells shrink as each column is rebuilt from the archive files.
sas_cols = list(sas_input_df)
remaining_ls = list(sas_cols)
print('sas_input_df.shape:', sas_input_df.shape)

# Every column except the provider key counts as one measure.
measure_names = list(sas_cols)
measure_names.remove('PROVIDER_ID')
print(len(measure_names), 'individual measures in the publicly-released SAS dataframe\n')
del measure_names

sas_input_df.head()

sas_input_df.shape: (4536, 99)
98 individual measures in the publicly-released SAS dataframe



Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_8_DEN,OP_10,OP_10_DEN,OP_13,OP_13_DEN,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,OP_30,OP_30_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,ED_2B,ED_2B_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_33,OP_33_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,7846.0,13268.0,165.0,216.0,102765.0,102765.0,8.086,18.498,4.615,1.884,7.305,76.294,0.742,0.324,0.65,0.0,0.548,0.537,0.156,0.046,21.6,0.207,7.0,-20.6,0.166,0.157,0.113,0.084,0.119,0.023,4474.0,258.0,1106.0,443.0,620.0,594.0,644.0,554.0,858.0,374.0,586.0,250.0,,,,,0.389,211.0,0.072,2117.0,0.028,211.0,178.0,349.0,0.03,57844.0,,,0.81,64.0,0.98,162.0,170.87,165.0,0.93,4817.0,0.97,0.0,24.0,0.6,102.0,103.0,655.0,21.0,3.0,3.0,3.0,3.0,4.0,3.0,3.5,3.5,507.0,3896.300852,0.047,281.0,0.149,268.0,13.4,606.0,,,10.8,190.0,7.1,190.0,0.8,993.0
1,10005,3088.0,7928.0,74.0,,40143.0,37697.0,1.988,4.66,1.97,,1.548,12.386,0.0,1.073,0.508,,1.938,0.565,0.159,0.039,10.2,0.187,-7.0,22.9,0.171,0.195,0.168,0.099,0.138,0.023,2018.0,221.0,304.0,591.0,52.0,625.0,169.0,637.0,291.0,502.0,80.0,213.0,,,62.0,18.0,0.427,246.0,0.142,1504.0,0.033,273.0,115.0,1408.0,0.02,71631.0,0.71,24.0,0.82,204.0,0.94,413.0,190.88,51.0,1.0,1915.0,0.91,0.03,193.0,0.68,330.0,82.0,1034.0,30.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,568.0,1538.334998,,,,,16.3,1150.0,1.0,12.0,11.1,121.0,6.2,121.0,1.1,483.0
2,10006,5874.0,10270.0,90.0,,64819.0,62418.0,4.781,8.992,2.439,,4.588,28.161,0.0,0.222,0.82,,0.654,0.426,0.152,0.046,-17.8,0.186,-5.5,27.5,0.125,0.184,0.125,0.095,0.157,0.028,3620.0,392.0,771.0,565.0,443.0,763.0,362.0,738.0,659.0,489.0,422.0,363.0,,,,,0.429,119.0,0.145,1386.0,0.026,265.0,152.0,362.0,0.01,41321.0,,,0.81,94.0,0.87,125.0,217.08,109.0,1.07,2456.0,0.93,0.0,35.0,0.33,105.0,110.0,552.0,23.0,3.0,3.0,1.0,2.0,2.0,2.0,3.0,3.0,1136.0,2816.123681,0.041,139.0,0.132,130.0,16.4,1948.0,,,,,,,1.1,454.0
3,10007,,,,,,4783.0,,,,,,1.84,,,,,,0.0,0.161,,29.1,0.194,,16.9,0.153,0.203,0.141,0.09,,,443.0,,83.0,136.0,,200.0,29.0,205.0,78.0,116.0,,,,,,,,,0.091,208.0,,,108.0,1295.0,0.03,1116.0,,,0.13,82.0,0.9,40.0,,,0.92,,,,,0.98,52.0,62.0,525.0,27.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,4.0,179.0,277.574249,,,,,17.7,213.0,,,,,,,1.0,76.0
4,10008,,,,,,,,,,,,,,,,,,,0.148,,,0.194,,-3.6,,0.178,0.12,0.105,,,127.0,,,30.0,,39.0,,43.0,27.0,32.0,,,,,,,,,0.034,148.0,,,91.0,335.0,0.01,7012.0,,,0.59,34.0,0.97,30.0,,,0.99,189.0,0.48,,,0.43,14.0,86.0,396.0,,,,,,,,,,,58.903206,,,,,17.5,77.0,,,,,,,,


## HAIs

In [4]:
# Long-format HAI archive, restricted to the October 2020 release.
hai_long = pd.read_pickle(hos_dir + 'HAI/CombinedFiles_HAI/Facility.pkl')
hai_long = hai_long[(hai_long['file_year'] == '2020') & (hai_long['file_month'] == '10')]

hai_numbers = ['1', '2', '3', '4', '5', '6']
hai_suffixes = ['ELIGCASES', 'DOPC', 'SIR']
# HAI_1_ELIGCASES, HAI_1_DOPC, HAI_1_SIR, HAI_2_ELIGCASES, ...
hai_measures = ['HAI_{}_{}'.format(number, suffix)
                for number in hai_numbers
                for suffix in hai_suffixes]

hai_long = hai_long[hai_long['Measure ID'].isin(hai_measures)]
hai_long = hai_long.filter(items=['Facility ID', 'Measure ID', 'Score'], axis=1)

# Pivot to one row per facility by outer-merging one small frame per measure.
hai_df = pd.DataFrame(columns=['Facility ID'])
for measure in hai_measures:
    rows = hai_long[hai_long['Measure ID'] == measure]
    per_measure = pd.DataFrame({
        'Facility ID': rows['Facility ID'].tolist(),
        measure: rows['Score'].tolist(),
    })
    hai_df = hai_df.merge(per_measure, on='Facility ID', how='outer')

# Translate archive measure ids into the SAS column names.
hai_renames = {'Facility ID': 'PROVIDER_ID'}
for number in hai_numbers:
    hai_renames['HAI_' + number + '_ELIGCASES'] = 'HAI_' + number + '_DEN_PRED'
    hai_renames['HAI_' + number + '_DOPC'] = 'HAI_' + number + '_DEN_VOL'
    hai_renames['HAI_' + number + '_SIR'] = 'HAI_' + number
hai_df = hai_df.rename(columns=hai_renames)

# Anything that is not a number becomes NaN.
for column in [name for name in hai_df if name != 'PROVIDER_ID']:
    hai_df[column] = pd.to_numeric(hai_df[column], errors='coerce')

# Blank both denominators wherever the predicted count is below 1 (or missing).
for number in hai_numbers:
    pred_col = 'HAI_' + number + '_DEN_PRED'
    vol_col = 'HAI_' + number + '_DEN_VOL'
    keep = hai_df[pred_col] >= 1
    hai_df[pred_col] = hai_df[pred_col].where(keep, np.nan)
    hai_df[vol_col] = hai_df[vol_col].where(keep, np.nan)

# Tick off the SAS columns this cell has reproduced.
for column in list(hai_df):
    if column in remaining_ls:
        remaining_ls.remove(column)

print(len(remaining_ls), 'remaining features:', remaining_ls, '\n')

hai_df = curate(hai_df)

80 remaining features: ['READM_30_HOSP_WIDE', 'READM_30_HIP_KNEE', 'EDAC_30_HF', 'READM_30_COPD', 'EDAC_30_AMI', 'EDAC_30_PN', 'MORT_30_STK', 'MORT_30_PN', 'MORT_30_HF', 'MORT_30_COPD', 'MORT_30_AMI', 'COMP_HIP_KNEE', 'READM_30_HOSP_WIDE_DEN', 'READM_30_HIP_KNEE_DEN', 'EDAC_30_HF_DEN', 'READM_30_COPD_DEN', 'EDAC_30_AMI_DEN', 'EDAC_30_PN_DEN', 'MORT_30_STK_DEN', 'MORT_30_PN_DEN', 'MORT_30_HF_DEN', 'MORT_30_COPD_DEN', 'MORT_30_AMI_DEN', 'COMP_HIP_KNEE_DEN', 'OP_2', 'OP_2_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_30', 'OP_30_DEN', 'PSI_4_SURG_COMP', 'PSI_4_SURG_COMP_DEN', 'PSI_90_SAFETY', 'IMM_3_DEN', 'IMM_3', 'PC_01', 'PC_01_DEN', 'SEP_1', 'SEP_1_DEN', 'ED_2B', 'ED_2B_DEN', 'H_RESP_RATE_P', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_R

## Unplanned Hospital Visits


In [5]:
# Long-format unplanned-visits archive, restricted to the October 2020 release.
uhv_long = pd.read_pickle(hos_dir + 'Unplanned_Visits/CombinedFiles_Unplanned_Visits/Facility.pkl')
uhv_long = uhv_long[(uhv_long['file_year'] == '2020') & (uhv_long['file_month'] == '10')]

uhv_measures = [
    'EDAC_30_AMI', 'EDAC_30_HF', 'EDAC_30_PN', 'OP_32', 'OP_35_ADM',
    'OP_35_ED', 'OP_36', 'READM_30_CABG', 'READM_30_COPD',
    'READM_30_HIP_KNEE', 'READM_30_HOSP_WIDE',
]
uhv_long = uhv_long[uhv_long['Measure ID'].isin(uhv_measures)]
uhv_long = uhv_long.filter(items=['Facility ID', 'Denominator', 'Measure ID', 'Score'], axis=1)

# One small frame per measure: facility, score and the matching denominator.
uhv_parts = []
for measure in uhv_measures:
    rows = uhv_long[uhv_long['Measure ID'] == measure]
    uhv_parts.append(pd.DataFrame({
        'Facility ID': rows['Facility ID'].tolist(),
        measure: rows['Score'].tolist(),
        measure + '_DEN': rows['Denominator'].tolist(),
    }))

# Fold the per-measure frames into one wide table, in list order.
uhv_df = reduce(
    lambda wide, part: wide.merge(part, on='Facility ID', how='outer'),
    uhv_parts,
    pd.DataFrame(columns=['Facility ID']),
)
uhv_df = uhv_df.rename(columns={'Facility ID': 'PROVIDER_ID'})

# Tick off the SAS columns this cell has reproduced.
for column in list(uhv_df):
    if column in remaining_ls:
        remaining_ls.remove(column)

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
uhv_df = curate(uhv_df)

58 remaining features: ['COMP_HIP_KNEE', 'COMP_HIP_KNEE_DEN', 'ED_2B', 'ED_2B_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'MORT_30_AMI', 'MORT_30_AMI_DEN', 'MORT_30_CABG', 'MORT_30_CABG_DEN', 'MORT_30_COPD', 'MORT_30_COPD_DEN', 'MORT_30_HF', 'MORT_30_HF_DEN', 'MORT_30_PN', 'MORT_30_PN_DEN', 'MORT_30_STK', 'MORT_30_STK_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_30', 'OP_30_DEN', 'OP_33', 'OP_33_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'PSI_4_SURG_COMP', 'PSI_4_SURG_COMP_DEN', 'PSI_90_SAFETY', 'PSI_90_SAFETY_DEN', 'SEP_1', 'SEP_1_DEN'] 



## COMPLICATIONS AND DEATHS

In [6]:
# Long-format complications-and-deaths archive, restricted to the October 2020 release.
cad_long = pd.read_pickle(hos_dir + 'Complications_and_Deaths/CombinedFiles_ComplicationsAndDeaths/Facility.pkl')
cad_long = cad_long[(cad_long['file_year'] == '2020') & (cad_long['file_month'] == '10')]

cad_measures = [
    'MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF',
    'MORT_30_PN', 'MORT_30_STK', 'PSI_4_SURG_COMP', 'COMP_HIP_KNEE',
    'PSI_90_SAFETY',
]
cad_long = cad_long[cad_long['Measure ID'].isin(cad_measures)]
cad_long = cad_long.filter(items=['Facility ID', 'Denominator', 'Measure ID', 'Score'], axis=1)

# Widen: outer-merge one (facility, score, denominator) frame per measure.
cad_df = pd.DataFrame(columns=['Facility ID'])
for measure in cad_measures:
    rows = cad_long[cad_long['Measure ID'] == measure]
    per_measure = pd.DataFrame({
        'Facility ID': rows['Facility ID'].tolist(),
        measure: rows['Score'].tolist(),
        measure + '_DEN': rows['Denominator'].tolist(),
    })
    cad_df = cad_df.merge(per_measure, on='Facility ID', how='outer')

cad_df = cad_df.rename(columns={'Facility ID': 'PROVIDER_ID'})

# Tick off the SAS columns this cell has reproduced.
for column in list(cad_df):
    if column in remaining_ls:
        remaining_ls.remove(column)

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
cad_df = curate(cad_df)

cad_df.head()

40 remaining features: ['ED_2B', 'ED_2B_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_30', 'OP_30_DEN', 'OP_33', 'OP_33_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'SEP_1', 'SEP_1_DEN'] 



Unnamed: 0,PROVIDER_ID,MORT_30_AMI,MORT_30_AMI_DEN,MORT_30_CABG,MORT_30_CABG_DEN,MORT_30_COPD,MORT_30_COPD_DEN,MORT_30_HF,MORT_30_HF_DEN,MORT_30_PN,MORT_30_PN_DEN,MORT_30_STK,MORT_30_STK_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,COMP_HIP_KNEE,COMP_HIP_KNEE_DEN,PSI_90_SAFETY,PSI_90_SAFETY_DEN
0,10001,11.9,586,4.7,281,8.4,374,11.3,858,15.7,554,16.6,644,170.87,165,2.3,250,0.93,Not Applicable
1,10005,13.8,80,Not Available,Not Available,9.9,502,16.8,291,19.5,637,17.1,169,190.88,51,2.3,213,1.0,Not Applicable
2,10006,15.7,422,4.1,139,9.5,489,12.5,659,18.4,738,12.5,362,217.08,109,2.8,363,1.07,Not Applicable
3,10007,Not Available,Not Available,Not Available,Not Available,9.0,116,14.1,78,20.3,205,15.3,29,Not Available,Not Available,Not Available,Not Available,0.92,Not Applicable
4,10008,Not Available,Not Available,Not Available,Not Available,10.5,32,12.0,27,17.8,43,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0.99,Not Applicable


## TIMELY AND EFFECTIVE CARE

In [7]:
# Long-format timely-and-effective-care archive, restricted to the October 2020 release.
tec_long = pd.read_pickle(hos_dir + 'Timely_and_Effective_Care/CombinedFiles_Timely_and_Effective_Care/Facility.pkl')
tec_long = tec_long[(tec_long['file_year'] == '2020') & (tec_long['file_month'] == '10')]

# Measure ids exactly as spelled in the archive (note the lower-case "b").
tec_measures = [
    'ED_2b', 'IMM_3', 'OP_18b', 'OP_2', 'OP_22', 'OP_23', 'OP_29', 'OP_30',
    'OP_33', 'OP_3b', 'PC_01', 'SEP_1',
]
tec_long = tec_long[tec_long['Measure ID'].isin(tec_measures)]
tec_long = tec_long.filter(items=['Facility ID', 'Sample', 'Measure ID', 'Score'], axis=1)

# Widen: here the "Sample" column supplies each measure's denominator.
tec_df = pd.DataFrame(columns=['Facility ID'])
for measure in tec_measures:
    rows = tec_long[tec_long['Measure ID'] == measure]
    per_measure = pd.DataFrame({
        'Facility ID': rows['Facility ID'].tolist(),
        measure: rows['Score'].tolist(),
        measure + '_DEN': rows['Sample'].tolist(),
    })
    tec_df = tec_df.merge(per_measure, on='Facility ID', how='outer')

# Upper-case the "b" measures so the names match the SAS columns.
tec_renames = {'Facility ID': 'PROVIDER_ID'}
for archive_name, sas_name in [('ED_2b', 'ED_2B'), ('OP_3b', 'OP_3B'), ('OP_18b', 'OP_18B')]:
    tec_renames[archive_name] = sas_name
    tec_renames[archive_name + '_DEN'] = sas_name + '_DEN'
tec_df = tec_df.rename(columns=tec_renames)

# Tick off reproduced SAS columns; report any column that was not pending.
for column in list(tec_df):
    if column in remaining_ls:
        remaining_ls.remove(column)
    else:
        print(column, 'not in remaining')
print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')

tec_df = curate(tec_df)

PROVIDER_ID not in remaining
16 remaining features: ['H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_8', 'OP_8_DEN'] 



## HCAHPS

In [8]:
# HCAHPS star ratings, read from the October 2020 archive CSV.
hcahps_long = pd.read_csv(hos_dir + '2020/hospitals_archive_10_2020/HCAHPS_Hospital.csv')

hcahps_measures = [
    'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING',
    'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING',
    'H_CLEAN_STAR_RATING', 'H_QUIET_STAR_RATING',
    'H_RECMND_STAR_RATING', 'H_HSP_RATING_STAR_RATING',
]
hcahps_long = hcahps_long[hcahps_long['HCAHPS Measure ID'].isin(hcahps_measures)]
hcahps_long = hcahps_long.filter(
    items=['Facility ID', 'HCAHPS Measure ID', 'Patient Survey Star Rating',
           'Number of Completed Surveys', 'Survey Response Rate Percent'],
    axis=1,
)

# Widen to one row per facility by outer-merging one small frame per measure.
HCAHPS_df = pd.DataFrame(columns=['Facility ID'])
for position, measure in enumerate(hcahps_measures):
    rows = hcahps_long[hcahps_long['HCAHPS Measure ID'] == measure]
    per_measure = pd.DataFrame({
        'Facility ID': rows['Facility ID'].tolist(),
        measure: rows['Patient Survey Star Rating'].tolist(),
    })
    if position == 0:
        # Survey count and response rate are taken from the first measure's rows only.
        per_measure['H_NUMB_COMP'] = rows['Number of Completed Surveys'].tolist()
        per_measure['H_RESP_RATE_P'] = rows['Survey Response Rate Percent'].tolist()
    HCAHPS_df = HCAHPS_df.merge(per_measure, on='Facility ID', how='outer')

HCAHPS_df = HCAHPS_df.rename(columns={'Facility ID': 'PROVIDER_ID'})

# Composite ratings are the mean of two component ratings, rounded to 1 decimal.
#
# The previous implementation recoded 'Not Available' as 0 with
# `HCAHPS_df[col].replace(..., inplace=True)` followed by `astype(int)`.
# That had three problems:
#   * in-place methods on a selected column are chained assignment and do not
#     write back to the frame under pandas Copy-on-Write;
#   * `astype(int)` raises when the outer merges left a NaN in the column;
#   * when only one component was available the composite became half of it,
#     because the missing one was averaged in as 0.
# Converting with `to_numeric(errors='coerce')` makes every non-numeric entry
# NaN, and NaN propagates through the mean, so a composite is missing unless
# both components are present.
composite_sources = {
    'H_GLOB_STAR_RATING': ('H_HSP_RATING_STAR_RATING', 'H_RECMND_STAR_RATING'),
    'H_INDI_STAR_RATING': ('H_CLEAN_STAR_RATING', 'H_QUIET_STAR_RATING'),
}
for composite, (first, second) in composite_sources.items():
    first_stars = pd.to_numeric(HCAHPS_df[first], errors='coerce')
    second_stars = pd.to_numeric(HCAHPS_df[second], errors='coerce')
    mean_stars = np.round((first_stars + second_stars) / 2, 1)
    # As before, a composite of exactly 0 is stored as missing.
    HCAHPS_df[composite] = mean_stars.replace(0, np.nan)

# The four components are only needed to build the two composites.
HCAHPS_df = HCAHPS_df.drop(
    labels=['H_CLEAN_STAR_RATING', 'H_QUIET_STAR_RATING', 'H_RECMND_STAR_RATING',
            'H_HSP_RATING_STAR_RATING'],
    axis=1,
)

# Tick off the SAS columns this cell has reproduced.
for column in list(HCAHPS_df):
    if column in remaining_ls:
        remaining_ls.remove(column)

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
HCAHPS_df = curate(HCAHPS_df)


6 remaining features: ['OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_8', 'OP_8_DEN'] 



## Outpatient Imaging Efficiency

In [9]:
# Long-format outpatient-imaging archive, restricted to the October 2020 release.
oie_long = pd.read_pickle(hos_dir + 'Outpatient_Imaging_Efficiency/CombinedFiles_Outpatient_Imaging_Efficiency/Facility.pkl')
oie_long = oie_long[(oie_long['file_year'] == '2020') & (oie_long['file_month'] == '10')]

# Measure ids exactly as spelled in the archive (hyphenated).
oie_measures = ['OP-8', 'OP-10', 'OP-13']
oie_long = oie_long[oie_long['Measure ID'].isin(oie_measures)]
oie_long = oie_long.filter(items=['Facility ID', 'Measure ID', 'Score'], axis=1)

# Widen; the denominator columns are created but filled entirely with NaN.
oie_df = pd.DataFrame(columns=['Facility ID'])
for measure in oie_measures:
    rows = oie_long[oie_long['Measure ID'] == measure]
    facility_ids = rows['Facility ID'].tolist()
    per_measure = pd.DataFrame({
        'Facility ID': facility_ids,
        measure: rows['Score'].tolist(),
        measure + '_DEN': [np.nan] * len(facility_ids),
    })
    oie_df = oie_df.merge(per_measure, on='Facility ID', how='outer')

# Hyphenated archive ids -> underscore SAS column names.
oie_renames = {'Facility ID': 'PROVIDER_ID'}
for measure in oie_measures:
    sas_name = measure.replace('-', '_')
    oie_renames[measure] = sas_name
    oie_renames[measure + '_DEN'] = sas_name + '_DEN'
oie_df = oie_df.rename(columns=oie_renames)

# Tick off the SAS columns this cell has reproduced.
for column in list(oie_df):
    if column in remaining_ls:
        remaining_ls.remove(column)

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
oie_df = curate(oie_df)


0 remaining features: [] 



## MERGE DATAFRAME AND COMPARE TO SAS FILE

In [10]:
# Outer-join every per-topic table on the provider key, starting from tec_df
# and adding the others in this fixed order.
main_df = tec_df
for topic_df in (cad_df, HCAHPS_df, uhv_df, hai_df, oie_df):
    main_df = main_df.merge(topic_df, on='PROVIDER_ID', how='outer')

In [11]:
prvdrs_sas = sorted(sas_input_df['PROVIDER_ID'].tolist())
prvdrs_main = sorted(main_df['PROVIDER_ID'].tolist())

# SAS providers that never show up in the rebuilt table (kept in SAS sort order).
rebuilt_ids = set(prvdrs_main)
ps = [p for p in prvdrs_sas if p not in rebuilt_ids]

print('Hospitals in the SAS file but not in file created here:')
print(ps)

# Keep only providers that the SAS file also contains.
main_df = main_df[main_df['PROVIDER_ID'].isin(prvdrs_sas)]
prvdrs_main1 = sorted(main_df['PROVIDER_ID'].tolist())

# Keep only the columns that exist in the SAS file, in SAS column order,
# and force every measure column to numeric (non-numeric text becomes NaN).
main_df = main_df.filter(sas_cols)
for column in list(main_df):
    if column == 'PROVIDER_ID':
        continue
    main_df[column] = pd.to_numeric(main_df[column], errors='coerce')

# Drop hospitals that have no value at all in any measure column.
measure_cols = list(main_df)
measure_cols.remove('PROVIDER_ID')
main_df = main_df.dropna(how='all', axis=0, subset=measure_cols)
prvdrs_main2 = sorted(main_df['PROVIDER_ID'].tolist())

dropped_ids = np.setdiff1d(prvdrs_main1, prvdrs_main2)

print('Hospitals in the file created here that have zero data for the features included in the SAS file:')
print(dropped_ids, '\n')

# Measures that are rescaled by 0.01 to match the scale of the SAS file.
# NOTE: this step is not idempotent - re-running this cell without first
# rebuilding main_df multiplies these columns by 0.01 a second time.
scaled_measures = [
    'READM_30_HIP_KNEE', 'READM_30_COPD', 'MORT_30_STK', 'MORT_30_PN',
    'MORT_30_HF', 'MORT_30_COPD', 'MORT_30_AMI', 'COMP_HIP_KNEE', 'OP_22',
    'OP_23', 'OP_29', 'OP_30', 'IMM_3', 'PC_01', 'SEP_1', 'MORT_30_CABG',
    'READM_30_CABG', 'OP_33', 'READM_30_HOSP_WIDE', 'OP_2', 'OP_8',
    'OP_10', 'OP_13',
]
for measure in scaled_measures:
    main_df[measure] = main_df[measure] * 0.01


#NO DATA:
#  PSI_90_SAFETY_DEN
#  OP_8_DEN
#  OP_10_DEN
#  OP_13_DEN

print('main_df.shape:', main_df.shape)
print('sas_input_df.shape:', sas_input_df.shape)

Hospitals in the SAS file but not in file created here:
['360367', '451394']
Hospitals in the file created here that have zero data for the features included in the SAS file:
[] 

main_df.shape: (4534, 99)
sas_input_df.shape: (4536, 99)


In [12]:
# Remove columns that contain no data at all and show the shape before and after.
shape_before = main_df.shape
main_df = main_df.dropna(how='all', axis=1)
print('main_df.shape:', shape_before)
print('main_df.shape:', main_df.shape)

main_df.head()

main_df.shape: (4534, 99)
main_df.shape: (4534, 95)


Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_10,OP_13,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,OP_30,OP_30_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,ED_2B,ED_2B_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_33,OP_33_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,7846.0,13268.0,165.0,216.0,102765.0,102765.0,8.086,18.498,4.615,1.884,7.305,76.294,0.742,0.324,0.65,0.0,0.548,0.537,0.156,0.046,21.6,0.207,7.0,-20.6,0.166,0.157,0.113,0.084,0.119,0.023,4474.0,258.0,1106.0,443.0,620.0,594.0,644.0,554.0,858.0,374.0,586.0,250.0,,,,,0.389,0.072,0.028,178.0,349.0,0.03,57844.0,,,0.81,64.0,0.98,162.0,170.87,165.0,0.93,4817.0,0.97,0.0,24.0,0.6,102.0,103.0,655.0,21.0,3.0,3.0,3.0,3.0,4.0,3.0,3.5,3.5,507.0,0.047,281.0,0.149,268.0,13.4,606.0,,,10.8,190.0,7.1,190.0,0.8,993.0
1,10005,3088.0,7928.0,74.0,,40143.0,37697.0,1.988,4.66,1.97,,1.548,12.386,0.0,1.073,0.508,,1.938,0.565,0.159,0.039,10.2,0.187,-7.0,22.9,0.171,0.195,0.168,0.099,0.138,0.023,2018.0,221.0,304.0,591.0,52.0,625.0,169.0,637.0,291.0,502.0,80.0,213.0,,,62.0,18.0,0.427,0.142,0.033,115.0,1408.0,0.02,71631.0,0.71,24.0,0.82,204.0,0.94,413.0,190.88,51.0,1.0,1915.0,0.91,0.03,193.0,0.68,330.0,82.0,1034.0,30.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,568.0,,,,,16.3,1150.0,1.0,12.0,11.1,121.0,6.2,121.0,1.1,483.0
2,10006,5874.0,10270.0,90.0,,64819.0,62418.0,4.781,8.992,2.439,,4.588,28.161,0.0,0.222,0.82,,0.654,0.426,0.152,0.046,-17.8,0.186,-5.5,27.5,0.125,0.184,0.125,0.095,0.157,0.028,3620.0,392.0,771.0,565.0,443.0,763.0,362.0,738.0,659.0,489.0,422.0,363.0,,,,,0.429,0.145,0.026,152.0,362.0,0.01,41321.0,,,0.81,94.0,0.87,125.0,217.08,109.0,1.07,2456.0,0.93,0.0,35.0,0.33,105.0,110.0,552.0,23.0,3.0,3.0,1.0,2.0,2.0,2.0,3.0,3.0,1136.0,0.041,139.0,0.132,130.0,16.4,1948.0,,,,,,,1.1,454.0
3,10007,,,,,,4783.0,,,,,,1.84,,,,,,0.0,0.161,,29.1,0.194,,16.9,0.153,0.203,0.141,0.09,,,443.0,,83.0,136.0,,200.0,29.0,205.0,78.0,116.0,,,,,,,,0.091,,108.0,1295.0,0.03,1116.0,,,0.13,82.0,0.9,40.0,,,0.92,,,,,0.98,52.0,62.0,525.0,27.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,4.0,179.0,,,,,17.7,213.0,,,,,,,1.0,76.0
4,10008,,,,,,,,,,,,,,,,,,,0.148,,,0.194,,-3.6,,0.178,0.12,0.105,,,127.0,,,30.0,,39.0,,43.0,27.0,32.0,,,,,,,,0.034,,91.0,335.0,0.01,7012.0,,,0.59,34.0,0.97,30.0,,,0.99,189.0,0.48,,,0.43,14.0,86.0,396.0,25.0,,,,,,,,,46.0,,,,,17.5,77.0,,,,,,,,


In [13]:
def get_apd(obs, exp):
    """Average symmetric absolute percent difference between two sequences.

    For each pair the difference is ``|obs - exp| / ((|obs| + |exp|) / 2) * 100``.
    A pair in which both values are zero contributes 0. The result is the
    arithmetic mean over all pairs, so it always lies between 0 and 200.

    The denominator uses absolute values. With the signed mean
    ``(obs + exp) / 2`` two negative values produced a negative "absolute"
    percentage, and values of opposite sign that cancel out (e.g. 5 and -5)
    were reported as a perfect match. For non-negative inputs the result is
    unchanged.

    Parameters
    ----------
    obs, exp : sequence of numbers
        Observed and expected values, compared position by position.

    Returns
    -------
    float
        Mean percent difference; ``nan`` when both sequences are empty.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    if len(obs) != len(exp):
        raise ValueError("Both lists must have the same length")

    n = len(obs)
    if n == 0:
        # Nothing to average; avoid dividing by zero below.
        return float('nan')

    apd_values = []
    for o, e in zip(obs, exp):
        scale = (abs(o) + abs(e)) / 2
        if scale == 0:
            # Only reachable when both values are exactly zero: no error.
            apd_values.append(0)
        else:
            apd_values.append(abs(o - e) / scale * 100)

    return sum(apd_values) / n


In [20]:
# Per-measure average percent difference, plus counters of exact value matches.
apd_ls = []
n_perfect = 0
n_miss = 0
n_tot = 0

# Work on copies; restrict the SAS table to the providers present in main_df.
tdf = main_df.copy(deep=True)
prvdrs = tdf['PROVIDER_ID'].unique()
sas_tdf = sas_input_df[sas_input_df['PROVIDER_ID'].isin(prvdrs)]

# Sort both tables by provider so rows can be compared position by position,
# and treat missing values as 0 on both sides.
tdf = tdf.sort_values(by=['PROVIDER_ID']).fillna(0)
sas_tdf = sas_tdf.sort_values(by=['PROVIDER_ID']).fillna(0)

labels = [
    'MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF', 'MORT_30_PN',
    'MORT_30_STK', 'PSI_4_SURG_COMP', 'COMP_HIP_KNEE',
    'HAI_1', 'HAI_2', 'HAI_3', 'HAI_4', 'HAI_5', 'HAI_6', 'PSI_90_SAFETY',
    'EDAC_30_AMI', 'EDAC_30_HF', 'EDAC_30_PN', 'OP_32',
    'READM_30_CABG', 'READM_30_COPD', 'READM_30_HIP_KNEE', 'READM_30_HOSP_WIDE',
    'OP_35_ADM', 'OP_35_ED', 'OP_36',
    'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING',
    'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING',
    'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING',
    'IMM_3', 'OP_22', 'OP_23', 'OP_29', 'OP_30', 'OP_33', 'PC_01', 'SEP_1',
    'OP_3B', 'OP_18B', 'ED_2B', 'OP_8', 'OP_10', 'OP_13', 'OP_2',
]

print(len(labels), 'individual measures used in stars SAS files')
for label in labels:
    obs = tdf[label].astype('float').tolist()
    exp = sas_tdf[label].astype('float').tolist()

    apd_ls.append(get_apd(obs, exp))

    # Count values that are reproduced exactly (float equality).
    exact = sum(1 for o, e in zip(obs, exp) if o == e)
    n_perfect += exact
    n_miss += len(obs) - exact
    n_tot += len(obs)

print('Results for Average Percent Difference:')
print('median:', np.nanmedian(apd_ls))
q1 = np.percentile(apd_ls, 25)
q3 = np.percentile(apd_ls, 75)
print('Q1:', q1)
print('Q3:', q3)

print('% of all measures that were perfectly reproduced:', 100*n_perfect/n_tot)

49 individual measures used in stars SAS files
Results for Average Percent Difference:
median: 0.0
Q1: 0.0
Q3: 3.107022095158506e-16
% of all measures that were perfectly reproduced: 98.39714447755281


In [None]:
# ##############################################################################

# Make sure every measure column is numeric before writing the file.
numeric_cols = [name for name in main_df if name != 'PROVIDER_ID']
for name in numeric_cols:
    main_df[name] = pd.to_numeric(main_df[name], errors='coerce')

# NOTE(review): the directory is spelled "Input_File" here but "Input_file" where
# the SAS file is read; on a case-sensitive filesystem these are different
# folders - confirm which spelling is intended.
out_path = stars_dir + "Reproduce_Stars_Input/2021/Input_File/all_data_2021apr.csv"
main_df.to_csv(out_path, index=False)
