In [1]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
from functools import reduce

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

stars_dir = '~/GitHub/stars-data-builder/'
hos_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

In [2]:
def curate(df):

    try:
        df = df[df['PROVIDER_ID'] != np.nan]
        df['PROVIDER_ID'] = df['PROVIDER_ID'].values.astype(str)
        
        ids = df['PROVIDER_ID'].tolist()
        ids2 = []
        for i in ids:
            if len(i) < 6:
                i = '0' + i
            ids2.append(i)
        df['PROVIDER_ID'] = ids2
        
    except:
        pass
    
    for c in list(df):    
        try:
            df[c] = df[c].str.replace("\t","")
        except:
            pass

    if 'Unnamed: 0' in list(df):
        df.drop(labels=['Unnamed: 0'], axis=1, inplace=True)
    return df

## MEASURES USED FOR JULY 2022 File: From PDFs (SAS & CMS)

### Mortality

MORT-30-AMI  
MORT-30-CABG  
MORT-30-COPD  
MORT-30-HF  
MORT-30-PN  
MORT-30-STK  
PSI-4-SURG-COMP  

### Safety of Care

HAI-1  
HAI-2  
HAI-3  
HAI-4  
HAI-5  
HAI-6  
COMP-HIP-KNEE  
PSI-90-Safety  

### Readmission

READM-30-CABG  
READM-30-COPD  
READM-30-Hip-Knee  
READM-30-HOSP-WIDE  
EDAC-30-AMI  
EDAC-30-HF  
EDAC-30-PN  
OP-32  
OP-35 ADM  
OP-35 ED  
OP-36  

### Patient Experience

H-COMP-1  
H-COMP-2  
H-COMP-3  
H-COMP-5  
H-COMP-6  
H-COMP-7  
H-CLEAN-HSP / H-QUIET-HSP  
H-HSP-RATING / H-RECMND  

### Timely and Effective Care

IMM-3  
OP-10  
OP-13  
OP-18b  
OP-2  
OP-22  
OP-23  
OP-29  
OP-33  
OP-3b  
OP-8  
PC-01  
SEP-1  


## Retired:
ED-2b  
OP-30 

In [3]:
sas_input_df = pd.read_sas(stars_dir + 'Reproduce_Stars_Input/2022/Input_file/all_data_2022jul.sas7bdat', 
                           format = 'sas7bdat', encoding = "utf8")

sas_input_df = curate(sas_input_df)
sas_cols = list(sas_input_df)
remaining_ls = list(sas_cols)
print('sas_input_df.shape:', sas_input_df.shape)

labels = list(sas_input_df)
labels.remove('PROVIDER_ID')
print(len(labels), 'individual measures in the publicly-released SAS dataframe\n')
del labels

sas_input_df.head()

sas_input_df.shape: (4489, 95)
94 individual measures in the publicly-released SAS dataframe



Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_8_DEN,OP_10,OP_10_DEN,OP_13,OP_13_DEN,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_33,OP_33_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,8391.0,13866.0,171.0,229.0,102470.0,102470.0,8.711,19.223,4.938,2.025,7.631,78.514,0.574,0.312,0.608,0.0,0.786,0.56,0.155,0.049,16.1,0.212,6.5,-7.1,0.198,0.145,0.089,0.083,0.116,0.026,1805.0,165.0,904.0,310.0,424.0,503.0,533.0,474.0,725.0,270.0,412.0,175.0,,,,,0.459,122.0,0.04,925.0,0.048,147.0,183.0,176.0,0.03,59762.0,,,0.9,52.0,184.28,115.0,0.81,4817.0,0.97,0.0,18.0,0.55,56.0,21.0,3.0,3.0,3.0,3.0,4.0,3.0,3.5,3.5,507.0,2828.228824,0.038,200.0,0.135,193.0,14.1,511.0,,,11.3,192.0,6.7,192.0,0.9,1003.0
1,10005,2864.0,8199.0,79.0,,37761.0,35442.0,1.808,4.57,2.084,,1.506,11.292,0.553,0.875,0.48,,1.328,0.708,0.147,0.039,15.5,0.181,-17.4,12.9,0.155,0.18,0.167,0.086,0.124,0.02,764.0,168.0,223.0,378.0,36.0,468.0,122.0,477.0,213.0,314.0,57.0,166.0,,,,,0.415,94.0,0.149,739.0,0.013,154.0,122.0,588.0,0.02,68296.0,0.67,12.0,0.97,180.0,145.66,40.0,1.19,1915.0,0.91,0.03,105.0,0.64,145.0,30.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,568.0,1106.469176,,,,,14.8,1086.0,1.0,21.0,9.5,120.0,5.4,120.0,0.9,514.0
2,10006,6011.0,10398.0,92.0,,63658.0,62016.0,5.334,10.217,2.432,,4.672,28.079,0.562,0.392,0.411,,1.07,0.356,0.157,0.048,-8.1,0.182,0.1,39.2,0.15,0.163,0.124,0.078,0.157,0.032,1563.0,306.0,615.0,381.0,363.0,663.0,286.0,634.0,528.0,331.0,344.0,306.0,,,,,0.279,61.0,0.158,728.0,0.007,152.0,171.0,183.0,0.01,47004.0,,,0.92,89.0,193.03,87.0,1.19,2456.0,0.93,0.0,15.0,0.33,54.0,23.0,3.0,3.0,1.0,2.0,2.0,2.0,3.0,3.0,1136.0,2131.116508,0.034,127.0,0.138,120.0,13.6,1908.0,,,,,,,1.0,502.0
3,10007,,,,,,5230.0,,,,,,2.033,,,,,,0.0,0.157,,8.0,0.2,,13.9,,0.192,0.126,0.089,,,144.0,,70.0,103.0,,147.0,,152.0,67.0,90.0,,,,,,,,,0.022,90.0,,,110.0,586.0,0.02,12514.0,,,0.92,169.0,,,0.94,,,,,1.0,32.0,27.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,4.0,179.0,188.206619,,,,,18.4,184.0,,,,,,,1.0,31.0
4,10008,,,,,,,,,,,,,,,,,,,0.153,,,0.194,,2.0,,0.166,,0.104,,,38.0,,,25.0,,26.0,,31.0,,28.0,,,,,,,,,0.038,53.0,,,97.0,165.0,0.0,7485.0,,,0.42,31.0,,,0.99,189.0,0.48,,,,,,,,,,,,,,,46.037684,,,,,18.7,81.0,,,,,,,,


## HAIs

In [4]:
df = pd.read_pickle(hos_dir + 'HAI/CombinedFiles_HAI/Facility.pkl')
df = df[df['file_year'] == '2021']
df = df[df['file_month'] == '07']

measures = ['HAI_1_ELIGCASES', 'HAI_1_DOPC', 'HAI_1_SIR', 
            'HAI_2_ELIGCASES', 'HAI_2_DOPC', 'HAI_2_SIR', 
            'HAI_3_ELIGCASES', 'HAI_3_DOPC', 'HAI_3_SIR', 
            'HAI_4_ELIGCASES', 'HAI_4_DOPC', 'HAI_4_SIR', 
            'HAI_5_ELIGCASES', 'HAI_5_DOPC', 'HAI_5_SIR', 
            'HAI_6_ELIGCASES', 'HAI_6_DOPC', 'HAI_6_SIR',
            ]
df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Measure ID', 'Score'], axis=1)

hai_df = pd.DataFrame(columns=['Facility ID']) 
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    
    hai_df = hai_df.merge(tdf2, on='Facility ID', how='outer')
    
    
hai_df.rename(columns={'HAI_1_ELIGCASES': 'HAI_1_DEN_PRED',
                       'HAI_1_DOPC': 'HAI_1_DEN_VOL',
                       'HAI_1_SIR': 'HAI_1',
                       'HAI_2_ELIGCASES': 'HAI_2_DEN_PRED',
                       'HAI_2_DOPC': 'HAI_2_DEN_VOL',
                       'HAI_2_SIR': 'HAI_2',
                       'HAI_3_ELIGCASES': 'HAI_3_DEN_PRED',
                       'HAI_3_DOPC': 'HAI_3_DEN_VOL',
                       'HAI_3_SIR': 'HAI_3',
                       'HAI_4_ELIGCASES': 'HAI_4_DEN_PRED',
                       'HAI_4_DOPC': 'HAI_4_DEN_VOL',
                       'HAI_4_SIR': 'HAI_4',
                       'HAI_5_ELIGCASES': 'HAI_5_DEN_PRED',
                       'HAI_5_DOPC': 'HAI_5_DEN_VOL',
                       'HAI_5_SIR': 'HAI_5',
                       'HAI_6_ELIGCASES': 'HAI_6_DEN_PRED',
                       'HAI_6_DOPC': 'HAI_6_DEN_VOL',
                       'HAI_6_SIR': 'HAI_6',
                       'Facility ID': 'PROVIDER_ID',
                   }, inplace=True)

for col in list(hai_df):
    if col != 'PROVIDER_ID':
        hai_df[col] = pd.to_numeric(hai_df[col], errors='coerce')
        
ls = ['1', '2', '3', '4', '5', '6']
for l in ls:
    hai_df['HAI_' + l + '_DEN_PRED'] = hai_df['HAI_' + l + '_DEN_PRED'].where(hai_df['HAI_' + l + '_DEN_PRED'] >= 1, np.nan)
    hai_df['HAI_' + l + '_DEN_VOL'] = hai_df['HAI_' + l + '_DEN_VOL'].where(hai_df['HAI_' + l + '_DEN_PRED'] >= 1, np.nan)
    
for c in list(hai_df):
    try:
        remaining_ls.remove(c)
    except:
        pass
print(len(remaining_ls), 'remaining features:', remaining_ls, '\n')       

hai_df = curate(hai_df)

76 remaining features: ['READM_30_HOSP_WIDE', 'READM_30_HIP_KNEE', 'EDAC_30_HF', 'READM_30_COPD', 'EDAC_30_AMI', 'EDAC_30_PN', 'MORT_30_STK', 'MORT_30_PN', 'MORT_30_HF', 'MORT_30_COPD', 'MORT_30_AMI', 'COMP_HIP_KNEE', 'READM_30_HOSP_WIDE_DEN', 'READM_30_HIP_KNEE_DEN', 'EDAC_30_HF_DEN', 'READM_30_COPD_DEN', 'EDAC_30_AMI_DEN', 'EDAC_30_PN_DEN', 'MORT_30_STK_DEN', 'MORT_30_PN_DEN', 'MORT_30_HF_DEN', 'MORT_30_COPD_DEN', 'MORT_30_AMI_DEN', 'COMP_HIP_KNEE_DEN', 'OP_2', 'OP_2_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'PSI_4_SURG_COMP', 'PSI_4_SURG_COMP_DEN', 'PSI_90_SAFETY', 'IMM_3_DEN', 'IMM_3', 'PC_01', 'PC_01_DEN', 'SEP_1', 'SEP_1_DEN', 'H_RESP_RATE_P', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP',

## Unplanned Hospital Visits


In [5]:

df = pd.read_pickle(hos_dir + 'Unplanned_Visits/CombinedFiles_Unplanned_Visits/Facility.pkl')
df = df[df['file_year'] == '2021']
df = df[df['file_month'] == '07']

measures = ['EDAC_30_AMI', 'EDAC_30_HF', 'EDAC_30_PN', 'OP_32', 'OP_35_ADM', 
             'OP_35_ED', 'OP_36', 'READM_30_CABG', 'READM_30_COPD', 
             'READM_30_HIP_KNEE', 'READM_30_HOSP_WIDE',
             ]
df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Denominator', 'Measure ID', 'Score'], axis=1)

uhv_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    tdf2[m + '_DEN'] = tdf1['Denominator'].tolist()
    
    uhv_df = uhv_df.merge(tdf2, on='Facility ID', how='outer')


uhv_df.rename(columns={'Facility ID': 'PROVIDER_ID',
                   }, inplace=True)

for col in list(uhv_df):
    if col != 'PROVIDER_ID':
        uhv_df[col] = pd.to_numeric(uhv_df[col], errors='coerce')

for c in list(uhv_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
uhv_df = curate(uhv_df)

54 remaining features: ['COMP_HIP_KNEE', 'COMP_HIP_KNEE_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'MORT_30_AMI', 'MORT_30_AMI_DEN', 'MORT_30_CABG', 'MORT_30_CABG_DEN', 'MORT_30_COPD', 'MORT_30_COPD_DEN', 'MORT_30_HF', 'MORT_30_HF_DEN', 'MORT_30_PN', 'MORT_30_PN_DEN', 'MORT_30_STK', 'MORT_30_STK_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_33', 'OP_33_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'PSI_4_SURG_COMP', 'PSI_4_SURG_COMP_DEN', 'PSI_90_SAFETY', 'PSI_90_SAFETY_DEN', 'SEP_1', 'SEP_1_DEN'] 



## COMPLICATIONS AND DEATHS

In [6]:
df = pd.read_pickle(hos_dir + 'Complications_and_Deaths/CombinedFiles_ComplicationsAndDeaths/Facility.pkl')
df = df[df['file_year'] == '2021']
df = df[df['file_month'] == '07']

measures = ['MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF', 
            'MORT_30_PN', 'MORT_30_STK', 'PSI_04', 'COMP_HIP_KNEE',
            'PSI_90', 'PSI_4_SURG_COMP_DEN', 'PSI_90_SAFETY_DEN',
            ]

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Denominator', 'Measure ID', 'Score'], axis=1)

cad_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    tdf2[m + '_DEN'] = tdf1['Denominator'].tolist()
    
    cad_df = cad_df.merge(tdf2, on='Facility ID', how='outer')
    
cad_df.rename(columns={'Facility ID': 'PROVIDER_ID',
                       'PSI_04': 'PSI_4_SURG_COMP',
                       'PSI_90': 'PSI_90_SAFETY'
                   }, inplace=True)

for col in list(cad_df):
    if col != 'PROVIDER_ID':
        cad_df[col] = pd.to_numeric(cad_df[col], errors='coerce')

for c in list(cad_df):
    try:
        remaining_ls.remove(c)
    except:
        pass
    
print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
cad_df = curate(cad_df)

36 remaining features: ['H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_33', 'OP_33_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'SEP_1', 'SEP_1_DEN'] 



## TIMELY AND EFFECTIVE CARE

In [7]:
df = pd.read_pickle(hos_dir + 'Timely_and_Effective_Care/CombinedFiles_Timely_and_Effective_Care/Facility.pkl')
df = df[df['file_year'] == '2021']
df = df[df['file_month'] == '07']

measures = ['IMM_3', 'OP_18b', 'OP_2', 'OP_22', 'OP_23', 'OP_29', 'OP_33', 'OP_3b', 'PC_01', 'SEP_1']

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Sample', 'Measure ID', 'Score'], axis=1)

tec_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    tdf2[m + '_DEN'] = tdf1['Sample'].tolist()
    
    tec_df = tec_df.merge(tdf2, on='Facility ID', how='outer')
    
tec_df.rename(columns={'Facility ID': 'PROVIDER_ID',
                       'OP_3b': 'OP_3B',
                       'OP_3b_DEN': 'OP_3B_DEN',
                       'OP_18b': 'OP_18B',
                       'OP_18b_DEN': 'OP_18B_DEN',
                       
                   }, inplace=True)

for col in list(tec_df):
    if col != 'PROVIDER_ID':
        tec_df[col] = pd.to_numeric(tec_df[col], errors='coerce')

for c in list(tec_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
tec_df = curate(tec_df)

16 remaining features: ['H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_8', 'OP_8_DEN'] 



## HCAHPS

In [8]:
#df = pd.read_pickle(main_dir + 'HCAHPS/CombinedFiles_HCAHPS/Facility.pkl')
#df = df[df['file_year'] == '2020']
#df = df[df['file_month'] == '10']
df = pd.read_csv(hos_dir + '2021/hospitals_07_2021/HCAHPS-Hospital.csv')

measures = ['H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 
            'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_CLEAN_STAR_RATING',  'H_QUIET_STAR_RATING', 
            'H_RECMND_STAR_RATING', 'H_HSP_RATING_STAR_RATING']

df = df[df['HCAHPS Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'HCAHPS Measure ID', 'Patient Survey Star Rating', 
                        'Number of Completed Surveys', 'Survey Response Rate Percent'], axis=1)

HCAHPS_df = pd.DataFrame(columns=['Facility ID'])
for i, m in enumerate(measures):
    tdf1 = df[df['HCAHPS Measure ID'] == m]
    
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Patient Survey Star Rating'].tolist()
    if i == 0:
        tdf2['H_NUMB_COMP'] = tdf1['Number of Completed Surveys'].tolist()
        tdf2['H_RESP_RATE_P'] = tdf1['Survey Response Rate Percent'].tolist()
        
    HCAHPS_df = HCAHPS_df.merge(tdf2, on='Facility ID', how='outer')
    

HCAHPS_df.rename(columns={'Facility ID': 'PROVIDER_ID'}, inplace=True)
HCAHPS_df['H_HSP_RATING_STAR_RATING'].replace('Not Available', 0, inplace=True)
HCAHPS_df['H_HSP_RATING_STAR_RATING'] = HCAHPS_df['H_HSP_RATING_STAR_RATING'].astype(int)
HCAHPS_df['H_RECMND_STAR_RATING'].replace('Not Available', 0, inplace=True)
HCAHPS_df['H_RECMND_STAR_RATING'] = HCAHPS_df['H_RECMND_STAR_RATING'].astype(int)
HCAHPS_df['H_GLOB_STAR_RATING'] = np.round((HCAHPS_df['H_HSP_RATING_STAR_RATING'] + HCAHPS_df['H_RECMND_STAR_RATING']) / 2, 1)
HCAHPS_df['H_GLOB_STAR_RATING'].replace(0, np.nan, inplace=True)

HCAHPS_df['H_CLEAN_STAR_RATING'].replace('Not Available', 0, inplace=True)
HCAHPS_df['H_CLEAN_STAR_RATING'] = HCAHPS_df['H_CLEAN_STAR_RATING'].astype(int)
HCAHPS_df['H_QUIET_STAR_RATING'].replace('Not Available', 0, inplace=True)
HCAHPS_df['H_QUIET_STAR_RATING'] = HCAHPS_df['H_QUIET_STAR_RATING'].astype(int)
HCAHPS_df['H_INDI_STAR_RATING'] = np.round((HCAHPS_df['H_CLEAN_STAR_RATING'] + HCAHPS_df['H_QUIET_STAR_RATING']) / 2, 1)
HCAHPS_df['H_INDI_STAR_RATING'].replace(0, np.nan, inplace=True)

HCAHPS_df.drop(labels = ['H_CLEAN_STAR_RATING',  'H_QUIET_STAR_RATING', 
                         'H_RECMND_STAR_RATING', 'H_HSP_RATING_STAR_RATING'], axis=1, inplace=True)

for c in list(HCAHPS_df):
    try:
        remaining_ls.remove(c)
    except:
        pass
        print(c, 'not in remaining')
        
print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
HCAHPS_df = curate(HCAHPS_df)

PROVIDER_ID not in remaining
6 remaining features: ['OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_8', 'OP_8_DEN'] 



## Outpatient Imaging Efficiency

In [9]:
df = pd.read_pickle(hos_dir + 'Outpatient_Imaging_Efficiency/CombinedFiles_Outpatient_Imaging_Efficiency/Facility.pkl')
df = df[df['file_year'] == '2021']
df = df[df['file_month'] == '07']

measures = ['OP-8', 'OP-10', 'OP-13']
df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Measure ID', 'Score'], axis=1)

oie_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    tdf2[m + '_DEN'] = [np.nan]*tdf2.shape[0]
    
    oie_df = oie_df.merge(tdf2, on='Facility ID', how='outer')
    
    
oie_df.rename(columns={'Facility ID': 'PROVIDER_ID',
                       'OP-8': 'OP_8',
                       'OP-10': 'OP_10',
                       'OP-13': 'OP_13',
                       'OP-8_DEN': 'OP_8_DEN',
                       'OP-10_DEN': 'OP_10_DEN',
                       'OP-13_DEN': 'OP_13_DEN',
                   }, inplace=True)

for col in list(oie_df):
    if col != 'PROVIDER_ID':
        oie_df[col] = pd.to_numeric(oie_df[col], errors='coerce')

for c in list(oie_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
oie_df = curate(oie_df)

0 remaining features: [] 



## MERGE DATAFRAME AND COMPARE TO SAS FILE

In [10]:
main_df = tec_df.merge(cad_df, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(HCAHPS_df, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(uhv_df, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(hai_df, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(oie_df, on='PROVIDER_ID', how='outer')

In [11]:
prvdrs_sas = sorted(sas_input_df['PROVIDER_ID'].tolist())
prvdrs_main1 = sorted(main_df['PROVIDER_ID'].tolist())

ps = []
for p in prvdrs_sas:
    if p not in prvdrs_main1:
        ps.append(p)

print('Hospitals in the SAS file but not in file created here:')
print(ps)

main_df = main_df[main_df['PROVIDER_ID'].isin(prvdrs_sas)]
prvdrs_main1 = sorted(main_df['PROVIDER_ID'].tolist())

# Remove columns in main_df that are not in the original sas output data
main_df = main_df.filter(sas_cols)
for col in list(main_df):
    if col != 'PROVIDER_ID':
        main_df[col] = pd.to_numeric(main_df[col], errors='coerce')

# Remove hospitals 
ls = list(main_df)
ls.remove('PROVIDER_ID')
main_df.dropna(how='all', axis=0, subset=ls, inplace=True)
prvdrs_main2 = sorted(main_df['PROVIDER_ID'].tolist())

ls = np.setdiff1d(prvdrs_main1, prvdrs_main2)

print('Hospitals in the file created here that have zero data for the features included in the SAS file:')
print(ls, '\n')

ls = ['READM_30_HIP_KNEE', 'READM_30_COPD', 'MORT_30_STK', 'MORT_30_PN',
      'MORT_30_HF', 'MORT_30_COPD', 'MORT_30_AMI', 'COMP_HIP_KNEE', 'OP_22',
      'OP_23', 'OP_29', 'IMM_3', 'PC_01', 'SEP_1', 'MORT_30_CABG',
      'READM_30_CABG', 'OP_33', 'READM_30_HOSP_WIDE', 'OP_2', 'OP_8',
      'OP_10', 'OP_13']

for l in ls:
    main_df[l] = main_df[l] * 0.01

print('main_df.shape:', main_df.shape)
print('sas_input_df.shape:', sas_input_df.shape)


Hospitals in the SAS file but not in file created here:
['670265']
Hospitals in the file created here that have zero data for the features included in the SAS file:
['191311' '370214' '670109'] 

main_df.shape: (4485, 95)
sas_input_df.shape: (4489, 95)


In [12]:
main_df.head()

Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_8_DEN,OP_10,OP_10_DEN,OP_13,OP_13_DEN,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_33,OP_33_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,8391.0,13866.0,171.0,229.0,102470.0,102470.0,8.711,19.223,4.938,2.025,7.631,78.514,0.574,0.312,0.608,0.0,0.786,0.56,0.155,0.049,16.1,0.212,6.5,-7.1,0.198,0.145,0.089,0.083,0.116,0.026,1805.0,165.0,904.0,310.0,424.0,503.0,533.0,474.0,725.0,270.0,412.0,175.0,,,,,0.459,,0.011,,0.048,,183.0,176.0,0.03,59762.0,,,0.9,52.0,184.28,,0.81,4817.0,0.97,0.0,18.0,0.55,56.0,21.0,3.0,3.0,3.0,3.0,4.0,3.0,3.5,3.5,507.0,,0.038,200.0,0.135,193.0,14.1,511.0,,,11.3,192.0,6.7,192.0,0.9,1003.0
1,10005,2864.0,8199.0,79.0,,37761.0,35442.0,1.808,4.57,2.084,,1.506,11.292,0.553,0.875,0.48,,1.328,0.708,0.147,0.039,15.5,0.181,-17.4,12.9,0.155,0.18,0.167,0.086,0.124,0.02,764.0,168.0,223.0,378.0,36.0,468.0,122.0,477.0,213.0,314.0,57.0,166.0,,,,,0.415,,0.05,,0.013,,122.0,588.0,0.02,68296.0,0.67,12.0,0.97,180.0,145.66,,1.19,1915.0,0.91,0.03,105.0,0.64,145.0,30.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,568.0,,,,,,14.8,1086.0,1.0,21.0,9.5,120.0,5.4,120.0,0.9,514.0
2,10006,6011.0,10398.0,92.0,,63658.0,62016.0,5.334,10.217,2.432,,4.672,28.079,0.562,0.392,0.411,,1.07,0.356,0.157,0.048,-8.1,0.182,0.1,39.2,0.15,0.163,0.124,0.078,0.157,0.032,1563.0,306.0,615.0,381.0,363.0,663.0,286.0,634.0,528.0,331.0,344.0,306.0,,,,,0.279,,0.044,,0.007,,171.0,183.0,0.01,47004.0,,,0.92,89.0,193.03,,1.19,2456.0,0.93,0.0,15.0,0.33,54.0,23.0,3.0,3.0,1.0,2.0,2.0,2.0,3.0,3.0,1136.0,,0.034,127.0,0.138,120.0,13.6,1908.0,,,,,,,1.0,502.0
3,10007,,,,,,5230.0,,,,,,2.033,,,,,,0.0,0.157,,8.0,0.2,,13.9,,0.192,0.126,0.089,,,144.0,,70.0,103.0,,147.0,,152.0,67.0,90.0,,,,,,,,,0.011,,,,110.0,586.0,0.02,12514.0,,,0.92,169.0,,,0.94,,,,,1.0,32.0,27.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,4.0,179.0,,,,,,18.4,184.0,,,,,,,1.0,31.0
4,10008,,,,,,,,,,,,,,,,,,,0.153,,,0.194,,2.0,,0.166,,0.104,,,38.0,,,25.0,,26.0,,31.0,,28.0,,,,,,,,,0.019,,,,97.0,165.0,0.0,7485.0,,,0.42,31.0,,,0.99,189.0,0.48,,,,,25.0,,,,,,,,,46.0,,,,,,18.7,81.0,,,,,,,,


In [13]:
def get_apd(obs, exp):
    if len(obs) != len(exp):
        raise ValueError("Both lists must have the same length")

    n = len(obs)
    apd_values = []

    for i in range(n):
        numerator = abs(obs[i] - exp[i])
        denominator = (obs[i] + exp[i]) / 2

        # Avoid division by zero
        if denominator == 0:
            apd = 0  # Both obs[i] and exp[i] are zero, no error
        else:
            apd = numerator / denominator * 100

        apd_values.append(apd)

    average_apd = sum(apd_values) / n
    return average_apd


In [14]:
apd_ls = []
n_perfect = 0
n_miss = 0
n_tot = 0

tdf = main_df.copy(deep=True)

prvdrs = tdf['PROVIDER_ID'].unique()
sas_tdf = sas_input_df[sas_input_df['PROVIDER_ID'].isin(prvdrs)]

tdf.sort_values(by=['PROVIDER_ID'], inplace=True)
sas_tdf.sort_values(by=['PROVIDER_ID'], inplace=True)

tdf.fillna(0, inplace=True)
sas_tdf.fillna(0, inplace=True)

labels = ['MORT_30_AMI',
          'MORT_30_CABG',
          'MORT_30_COPD', 
          'MORT_30_HF',
          'MORT_30_PN',
          'MORT_30_STK',
          'PSI_4_SURG_COMP',
          'COMP_HIP_KNEE', 
          'HAI_1',
          'HAI_2',
          'HAI_3',
          'HAI_4',
          'HAI_5',
          'HAI_6',
          'PSI_90_SAFETY',
          'EDAC_30_AMI',
          'EDAC_30_HF',
          'EDAC_30_PN', 
          'OP_32',
          'READM_30_CABG',
          'READM_30_COPD',
          'READM_30_HIP_KNEE',
          'READM_30_HOSP_WIDE',
          'OP_35_ADM', 
          'OP_35_ED',
          'OP_36',
          'H_COMP_1_STAR_RATING',
          'H_COMP_2_STAR_RATING',
          'H_COMP_3_STAR_RATING',
          'H_COMP_5_STAR_RATING',
          'H_COMP_6_STAR_RATING',
          'H_COMP_7_STAR_RATING',
          'H_GLOB_STAR_RATING',
          'H_INDI_STAR_RATING',
          'IMM_3',
          'OP_22',
          'OP_23',
          'OP_29',
          #'OP_30',
          'OP_33',
          'PC_01',
          'SEP_1',
          'OP_3B',
          'OP_18B',
          #'ED_2B',
          'OP_8',
          'OP_10',
          'OP_13',
          'OP_2',
         ]

print(len(labels), 'individual measures used in stars SAS files')
for label in labels:
        
    obs = tdf[label].astype('float').tolist()
    exp = sas_tdf[label].astype('float').tolist()
    
    result = get_apd(obs, exp)
    apd_ls.append(result)
    
    for i, o in enumerate(obs):
        e = exp[i]
        if o == e:
            n_perfect += 1
            n_tot += 1
        else:
            n_miss += 1
            n_tot += 1

print('Results for Average Percent Difference:')
print('median:', np.nanmedian(apd_ls))
q1 = np.percentile(apd_ls, 25)
q3 = np.percentile(apd_ls, 75)
print('Q1:', q1)
print('Q3:', q3)

print('% of all measures that were perfectly reproduced:', 100*n_perfect/n_tot)

47 individual measures used in stars SAS files
Results for Average Percent Difference:
median: 0.08918617614269801
Q1: 2.6406540481416435e-16
Q3: 0.4459308807134898
% of all measures that were perfectly reproduced: 96.8628288147252


In [None]:
for col in list(main_df):
    if col != 'PROVIDER_ID':
        main_df[col] = pd.to_numeric(main_df[col], errors='coerce')

main_df.to_csv(stars_dir + "Reproduce_Stars_Input/2022/Input_File/all_data_2022jul.csv", index=False)
