In [1]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
from functools import reduce

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

stars_dir = '~/GitHub/stars-data-builder/'
hos_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

In [2]:
def curate(df):

    try:
        df = df[df['PROVIDER_ID'] != np.nan]
        df['PROVIDER_ID'] = df['PROVIDER_ID'].values.astype(str)
        
        ids = df['PROVIDER_ID'].tolist()
        ids2 = []
        for i in ids:
            if len(i) < 6:
                i = '0' + i
            ids2.append(i)
        df['PROVIDER_ID'] = ids2
        
    except:
        pass
    
    for c in list(df):    
        try:
            df[c] = df[c].str.replace("\t","")
        except:
            pass

    if 'Unnamed: 0' in list(df):
        df.drop(labels=['Unnamed: 0'], axis=1, inplace=True)
    return df

## MEASURES USED FOR JULY 2022 File: From PDFs (SAS & CMS)

### Mortality

MORT-30-AMI  
MORT-30-CABG  
MORT-30-COPD  
MORT-30-HF  
MORT-30-PN  
MORT-30-STK  
PSI-4-SURG-COMP  

### Safety of Care

HAI-1  
HAI-2  
HAI-3  
HAI-4  
HAI-5  
HAI-6  
COMP-HIP-KNEE  
PSI-90-Safety  

### Readmission

READM-30-CABG  
READM-30-COPD  
READM-30-Hip-Knee  
READM-30-HOSP-WIDE  
EDAC-30-AMI  
EDAC-30-HF  
EDAC-30-PN  
OP-32  
OP-35 ADM  
OP-35 ED  
OP-36  

### Patient Experience

H-COMP-1  
H-COMP-2  
H-COMP-3  
H-COMP-5  
H-COMP-6  
H-COMP-7  
H-CLEAN-HSP / H-QUIET-HSP  
H-HSP-RATING / H-RECMND  

### Timely and Effective Care

IMM-3  
OP-10  
OP-13  
OP-18b  
OP-2  * <= 100 hospitals reporting
OP-22  
OP-23  
OP-29
OP-3b  
OP-8  
PC-01  
SEP-1  


## Retired (2022):
ED-2b  
OP-30 

## Retired in (2023):
OP-33: Percentage of patients receiving appropriate radiation therapy for cancer that has
spread to the bone

## Measure excluded from 2023 due to no more than 100 hospitals reporting performance publicly:
OP-2: Percentage of outpatients with chest pain or possible heart attack who got drugs to
break up blood clots within 30 minutes of arrival

## New measure added for 2023:
HCP COVID-19: COVID-19 Vaccination Coverage Among Health Care Providers


In [21]:
sas_input_df = pd.read_sas(stars_dir + 'Reproduce_Stars_Input/2023/Input_File/alldata_2023jul.sas7bdat', 
                           format = 'sas7bdat', encoding = "utf8")


sas_input_df = curate(sas_input_df)
sas_cols = list(sas_input_df)
remaining_ls = list(sas_cols)
print('sas_input_df.shape:', sas_input_df.shape)

labels = list(sas_input_df)
labels.remove('PROVIDER_ID')
print(len(labels), 'individual measures in the publicly-released SAS dataframe\n')
del labels

#prvdrs = sas_input_df['PROVIDER_ID'].tolist()
#for p in prvdrs:
#    if 'F' in p:
#        print(p)

sas_input_df.head()

sas_input_df.shape: (4654, 95)
94 individual measures in the publicly-released SAS dataframe



Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_8_DEN,OP_10,OP_10_DEN,OP_13,OP_13_DEN,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,HCP_COVID_19_DEN,HCP_COVID_19,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,10024.0,17731.0,154.0,200.0,101908.0,101451.0,10.597,26.63,4.548,1.845,9.412,72.686,0.661,0.3,1.099,0.0,0.85,0.66,0.142,0.042,21.8,0.199,1.9,-1.5,0.164,0.159,0.083,0.085,0.124,0.024,3058.0,98.0,755.0,202.0,319.0,436.0,489.0,407.0,630.0,182.0,317.0,102.0,,,,,0.425,146.0,0.057,1488.0,0.067,208.0,205.0,323.0,0.03,51079.0,,,0.81,16.0,173.39,120.0,1.01,3795.0,0.97,2323.0,0.737,0.09,34.0,0.46,146.0,15.0,2.0,3.0,2.0,4.0,4.0,3.0,3.0,3.5,434.0,2046.895485,0.047,172.0,0.117,165.0,14.1,254.0,10.2,214.0,4.7,214.0,1.0,688.0
1,10005,3713.0,8670.0,88.0,,38413.0,35686.0,2.45,4.995,2.512,,1.999,10.484,3.673,1.201,1.194,,0.0,0.858,0.139,0.04,9.3,0.176,4.7,2.2,0.166,0.218,0.169,0.081,0.126,0.018,1258.0,178.0,157.0,234.0,38.0,361.0,100.0,369.0,153.0,195.0,52.0,134.0,,,,,0.545,191.0,0.138,1214.0,0.043,208.0,146.0,1003.0,0.03,54503.0,0.73,15.0,0.99,108.0,142.88,35.0,0.91,2593.0,0.9,2026.0,0.821,0.01,194.0,0.59,242.0,16.0,3.0,4.0,2.0,3.0,3.0,3.0,3.0,3.5,717.0,819.043002,,,,,14.6,850.0,11.1,108.0,5.7,108.0,0.9,362.0
2,10006,7318.0,11755.0,91.0,,62709.0,54159.0,7.924,15.296,2.523,,4.164,22.618,0.757,0.196,0.396,,1.441,0.088,0.142,0.048,-2.3,0.177,25.9,42.3,0.189,0.178,0.122,0.078,0.165,0.034,2555.0,246.0,550.0,235.0,312.0,538.0,261.0,528.0,468.0,209.0,295.0,234.0,,,,,0.412,97.0,0.11,1168.0,0.014,217.0,144.0,363.0,0.01,41137.0,0.57,14.0,0.88,75.0,157.42,84.0,1.1,2292.0,0.64,2694.0,0.651,0.0,37.0,0.58,142.0,17.0,2.0,3.0,1.0,2.0,3.0,2.0,2.0,2.5,1358.0,1487.163359,0.035,117.0,0.156,109.0,12.5,1505.0,,,,,1.1,468.0
3,10007,,,,,,5413.0,,,,,,2.148,,,,,,0.466,0.151,,36.3,0.197,,-12.6,,0.217,0.139,0.103,,,272.0,,51.0,72.0,,99.0,,106.0,45.0,63.0,,,,,,,,,0.059,169.0,,,119.0,1202.0,0.03,11120.0,,,0.63,68.0,,,0.99,318.0,0.61,277.0,0.574,,,0.93,55.0,23.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,173.0,142.073902,,,,,15.3,118.0,,,,,1.0,56.0
4,10008,,,,,,,,,,,,,,,,,,,0.145,,,,,,,0.197,,,,,93.0,,,,,,,26.0,,,,,,,,,,,0.021,97.0,,,113.0,346.0,0.0,6205.0,,,0.52,23.0,,,,125.0,0.46,169.0,0.623,,,,,,,,,,,,,,,,,,,,14.3,62.0,,,,,,


## HAIs

In [4]:
df = pd.read_pickle(hos_dir + 'HAI/CombinedFiles_HAI/Facility.pkl')
df = df[df['file_year'] == '2023']
df = df[df['file_month'] == '01']
#print(df['Measure ID'].unique())

measures = ['HAI_1_ELIGCASES', 'HAI_1_DOPC', 'HAI_1_SIR', 'HAI_2_ELIGCASES', 'HAI_2_DOPC', 'HAI_2_SIR', 
            'HAI_3_ELIGCASES', 'HAI_3_DOPC', 'HAI_3_SIR', 'HAI_4_ELIGCASES', 'HAI_4_DOPC', 'HAI_4_SIR', 
            'HAI_5_ELIGCASES', 'HAI_5_DOPC', 'HAI_5_SIR', 'HAI_6_ELIGCASES', 'HAI_6_DOPC', 'HAI_6_SIR']

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Measure ID', 'Score'], axis=1)

hai_df = pd.DataFrame(columns=['Facility ID']) 
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    hai_df = hai_df.merge(tdf2, on='Facility ID', how='outer')
    
hai_df.rename(columns={'HAI_1_ELIGCASES': 'HAI_1_DEN_PRED',
                       'HAI_1_DOPC': 'HAI_1_DEN_VOL',
                       'HAI_1_SIR': 'HAI_1',
                       'HAI_2_ELIGCASES': 'HAI_2_DEN_PRED',
                       'HAI_2_DOPC': 'HAI_2_DEN_VOL',
                       'HAI_2_SIR': 'HAI_2',
                       'HAI_3_ELIGCASES': 'HAI_3_DEN_PRED',
                       'HAI_3_DOPC': 'HAI_3_DEN_VOL',
                       'HAI_3_SIR': 'HAI_3',
                       'HAI_4_ELIGCASES': 'HAI_4_DEN_PRED',
                       'HAI_4_DOPC': 'HAI_4_DEN_VOL',
                       'HAI_4_SIR': 'HAI_4',
                       'HAI_5_ELIGCASES': 'HAI_5_DEN_PRED',
                       'HAI_5_DOPC': 'HAI_5_DEN_VOL',
                       'HAI_5_SIR': 'HAI_5',
                       'HAI_6_ELIGCASES': 'HAI_6_DEN_PRED',
                       'HAI_6_DOPC': 'HAI_6_DEN_VOL',
                       'HAI_6_SIR': 'HAI_6',
                       'Facility ID': 'PROVIDER_ID',
                   }, inplace=True)

for c in list(hai_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
hai_df = curate(hai_df)

76 remaining features: ['COMP_HIP_KNEE', 'COMP_HIP_KNEE_DEN', 'EDAC_30_AMI', 'EDAC_30_AMI_DEN', 'EDAC_30_HF', 'EDAC_30_HF_DEN', 'EDAC_30_PN', 'EDAC_30_PN_DEN', 'HCP_COVID_19', 'HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'MORT_30_AMI', 'MORT_30_AMI_DEN', 'MORT_30_CABG', 'MORT_30_CABG_DEN', 'MORT_30_COPD', 'MORT_30_COPD_DEN', 'MORT_30_HF', 'MORT_30_HF_DEN', 'MORT_30_PN', 'MORT_30_PN_DEN', 'MORT_30_STK', 'MORT_30_STK_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_32', 'OP_32_DEN', 'OP_35_ADM', 'OP_35_ADM_DEN', 'OP_35_ED', 'OP_35_ED_DEN', 'OP_36', 'OP_36_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'PSI_4_SURG_COMP', 'PSI_4_SURG_COMP_DE

## Unplanned Hospital Visits


In [5]:
df = pd.read_pickle(hos_dir + 'Unplanned_Visits/CombinedFiles_Unplanned_Visits/Facility.pkl')
df = df[df['file_year'] == '2023']
df = df[df['file_month'] == '01']

print(df['Measure ID'].unique())

measures = ['EDAC_30_AMI', 'EDAC_30_HF', 'EDAC_30_PN', 'OP_32', 'OP_35_ADM', 'OP_35_ED', 'OP_36', 
            'READM_30_CABG', 'READM_30_COPD', 'READM_30_HIP_KNEE', 'READM_30_HOSP_WIDE']

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Denominator', 'Measure ID', 'Score'], axis=1)

uhv_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    tdf2[m + '_DEN'] = tdf1['Denominator'].tolist()
    uhv_df = uhv_df.merge(tdf2, on='Facility ID', how='outer')

uhv_df.rename(columns={'Facility ID': 'PROVIDER_ID'}, inplace=True)

for c in list(uhv_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
uhv_df = curate(uhv_df)

['EDAC_30_AMI' 'EDAC_30_HF' 'EDAC_30_PN' 'OP_32' 'OP_35_ADM' 'OP_35_ED'
 'OP_36' 'READM_30_AMI' 'READM_30_CABG' 'READM_30_COPD' 'READM_30_HF'
 'READM_30_HIP_KNEE' 'READM_30_HOSP_WIDE' 'READM_30_PN']
54 remaining features: ['COMP_HIP_KNEE', 'COMP_HIP_KNEE_DEN', 'HCP_COVID_19', 'HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'MORT_30_AMI', 'MORT_30_AMI_DEN', 'MORT_30_CABG', 'MORT_30_CABG_DEN', 'MORT_30_COPD', 'MORT_30_COPD_DEN', 'MORT_30_HF', 'MORT_30_HF_DEN', 'MORT_30_PN', 'MORT_30_PN_DEN', 'MORT_30_STK', 'MORT_30_STK_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'PSI_4_SURG_COMP', 'PSI_4_SURG_COMP_DEN

## COMPLICATIONS AND DEATHS

In [6]:
df = pd.read_pickle(hos_dir + 'Complications_and_Deaths/CombinedFiles_ComplicationsAndDeaths/Facility.pkl')
df = df[df['file_year'] == '2023']
df = df[df['file_month'] == '01']
print(df['Measure ID'].unique())

measures = ['MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF', 
            'MORT_30_PN', 'MORT_30_STK', 'PSI_04', 'COMP_HIP_KNEE',
            'PSI_90', 
            ]

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Measure ID', 'Score', 'Denominator'], axis=1)

cad_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    
    tdf2[m + '_DEN'] = tdf1['Denominator'].tolist()
    cad_df = cad_df.merge(tdf2, on='Facility ID', how='outer')
    
cad_df.rename(columns={'Facility ID': 'PROVIDER_ID',
                       'PSI_04': 'PSI_4_SURG_COMP',
                       'PSI_04_DEN': 'PSI_4_SURG_COMP_DEN',
                       'PSI_90': 'PSI_90_SAFETY',
                       'PSI_90_DEN': 'PSI_90_SAFETY_DEN',
                   }, inplace=True)

for c in list(cad_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
cad_df = curate(cad_df)

['COMP_HIP_KNEE' 'MORT_30_AMI' 'MORT_30_CABG' 'MORT_30_COPD' 'MORT_30_HF'
 'MORT_30_PN' 'MORT_30_STK' 'PSI_03' 'PSI_04' 'PSI_06' 'PSI_08' 'PSI_09'
 'PSI_10' 'PSI_11' 'PSI_12' 'PSI_13' 'PSI_14' 'PSI_15' 'PSI_90']
36 remaining features: ['HCP_COVID_19', 'HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'SEP_1', 'SEP_1_DEN'] 



## TIMELY AND EFFECTIVE CARE

In [7]:
df = pd.read_pickle(hos_dir + 'Timely_and_Effective_Care/CombinedFiles_Timely_and_Effective_Care/Facility.pkl')
df = df[df['file_year'] == '2023']
df = df[df['file_month'] == '01']
print(df['Measure ID'].unique())

measures = ['IMM_3', 'OP_18b', 'OP_2', 'OP_22', 'OP_23', 'OP_29', 'OP_3b', 'SEP_1', 'HCP_COVID_19']

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Sample', 'Measure ID', 'Score'], axis=1)

tec_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    
    if m == 'HCP_COVID_19':
        pass
    else:
        tdf2[m + '_DEN'] = tdf1['Sample'].tolist()
    
    tec_df = tec_df.merge(tdf2, on='Facility ID', how='outer')
    
tec_df.rename(columns={'Facility ID': 'PROVIDER_ID', 
                       'OP_3b': 'OP_3B', 
                       'OP_3b_DEN': 'OP_3B_DEN', 
                       'OP_18b': 'OP_18B',
                       'OP_18b_DEN': 'OP_18B_DEN',
                      }, inplace=True)

for c in list(tec_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
tec_df = curate(tec_df)


['EDV' 'ED_2_Strata_1' 'ED_2_Strata_2' 'HCP_COVID_19' 'IMM_3' 'OP_18b'
 'OP_18c' 'OP_2' 'OP_22' 'OP_23' 'OP_29' 'OP_31' 'OP_3b'
 'SAFE_USE_OF_OPIOIDS' 'SEP_1' 'SEP_SH_3HR' 'SEP_SH_6HR' 'SEV_SEP_3HR'
 'SEV_SEP_6HR' 'STK_02' 'STK_03' 'STK_05' 'STK_06' 'VTE_1' 'VTE_2']
19 remaining features: ['HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN'] 



## TIMELY AND EFFECTIVE CARE

Only PC-01, which for 2023 is located in the Maternal Health files of the hospitals data archive

In [8]:
df = pd.read_csv(hos_dir + '2023/hospitals_01_2023/Maternal_Health-Hospital.csv')
print(df['Measure ID'].unique())

measures = ['PC_01']
df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Measure ID', 'Score', 'Sample'], axis=1)

tec2_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    tdf2[m + '_DEN'] = tdf1['Sample'].tolist()
    tec2_df = tec2_df.merge(tdf2, on='Facility ID', how='outer')
    
tec2_df.rename(columns={'Facility ID': 'PROVIDER_ID'}, inplace=True)

for c in list(tec2_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
tec2_df = curate(tec2_df)


['PC_01' 'PC_05' 'SM_7']
17 remaining features: ['HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_8', 'OP_8_DEN'] 



## HCAHPS

In [9]:
df = pd.read_csv(hos_dir + '2023/hospitals_01_2023/HCAHPS-Hospital.csv')
print(df['HCAHPS Measure ID'].unique())

measures = ['H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 
            'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_CLEAN_STAR_RATING',  'H_QUIET_STAR_RATING', 
            'H_RECMND_STAR_RATING', 'H_HSP_RATING_STAR_RATING']

df = df[df['HCAHPS Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'HCAHPS Measure ID', 'Patient Survey Star Rating', 
                        'Number of Completed Surveys', 'Survey Response Rate Percent'], axis=1)

HCAHPS_df = pd.DataFrame(columns=['Facility ID'])
for i, m in enumerate(measures):
    tdf1 = df[df['HCAHPS Measure ID'] == m]
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Patient Survey Star Rating'].tolist()
    if i == 0:
        tdf2['H_NUMB_COMP'] = tdf1['Number of Completed Surveys'].tolist()
        tdf2['H_RESP_RATE_P'] = tdf1['Survey Response Rate Percent'].tolist()
        
    HCAHPS_df = HCAHPS_df.merge(tdf2, on='Facility ID', how='outer')
    

HCAHPS_df.rename(columns={'Facility ID': 'PROVIDER_ID'}, inplace=True)
HCAHPS_df['H_HSP_RATING_STAR_RATING'].replace('Not Available', 0, inplace=True)
HCAHPS_df['H_HSP_RATING_STAR_RATING'] = HCAHPS_df['H_HSP_RATING_STAR_RATING'].astype(int)
HCAHPS_df['H_RECMND_STAR_RATING'].replace('Not Available', 0, inplace=True)
HCAHPS_df['H_RECMND_STAR_RATING'] = HCAHPS_df['H_RECMND_STAR_RATING'].astype(int)
HCAHPS_df['H_GLOB_STAR_RATING'] = np.round((HCAHPS_df['H_HSP_RATING_STAR_RATING'] + HCAHPS_df['H_RECMND_STAR_RATING']) / 2, 1)
HCAHPS_df['H_GLOB_STAR_RATING'].replace(0, np.nan, inplace=True)

HCAHPS_df['H_CLEAN_STAR_RATING'].replace('Not Available', 0, inplace=True)
HCAHPS_df['H_CLEAN_STAR_RATING'] = HCAHPS_df['H_CLEAN_STAR_RATING'].astype(int)
HCAHPS_df['H_QUIET_STAR_RATING'].replace('Not Available', 0, inplace=True)
HCAHPS_df['H_QUIET_STAR_RATING'] = HCAHPS_df['H_QUIET_STAR_RATING'].astype(int)
HCAHPS_df['H_INDI_STAR_RATING'] = np.round((HCAHPS_df['H_CLEAN_STAR_RATING'] + HCAHPS_df['H_QUIET_STAR_RATING']) / 2, 1)
HCAHPS_df['H_INDI_STAR_RATING'].replace(0, np.nan, inplace=True)

HCAHPS_df.drop(labels = ['H_CLEAN_STAR_RATING',  'H_QUIET_STAR_RATING', 'H_RECMND_STAR_RATING', 'H_HSP_RATING_STAR_RATING'], axis=1, inplace=True)

for c in list(HCAHPS_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
HCAHPS_df = curate(HCAHPS_df)

['H_COMP_1_A_P' 'H_COMP_1_SN_P' 'H_COMP_1_U_P' 'H_COMP_1_LINEAR_SCORE'
 'H_COMP_1_STAR_RATING' 'H_NURSE_RESPECT_A_P' 'H_NURSE_RESPECT_SN_P'
 'H_NURSE_RESPECT_U_P' 'H_NURSE_LISTEN_A_P' 'H_NURSE_LISTEN_SN_P'
 'H_NURSE_LISTEN_U_P' 'H_NURSE_EXPLAIN_A_P' 'H_NURSE_EXPLAIN_SN_P'
 'H_NURSE_EXPLAIN_U_P' 'H_COMP_2_A_P' 'H_COMP_2_SN_P' 'H_COMP_2_U_P'
 'H_COMP_2_LINEAR_SCORE' 'H_COMP_2_STAR_RATING' 'H_DOCTOR_RESPECT_A_P'
 'H_DOCTOR_RESPECT_SN_P' 'H_DOCTOR_RESPECT_U_P' 'H_DOCTOR_LISTEN_A_P'
 'H_DOCTOR_LISTEN_SN_P' 'H_DOCTOR_LISTEN_U_P' 'H_DOCTOR_EXPLAIN_A_P'
 'H_DOCTOR_EXPLAIN_SN_P' 'H_DOCTOR_EXPLAIN_U_P' 'H_COMP_3_A_P'
 'H_COMP_3_SN_P' 'H_COMP_3_U_P' 'H_COMP_3_LINEAR_SCORE'
 'H_COMP_3_STAR_RATING' 'H_CALL_BUTTON_A_P' 'H_CALL_BUTTON_SN_P'
 'H_CALL_BUTTON_U_P' 'H_BATH_HELP_A_P' 'H_BATH_HELP_SN_P'
 'H_BATH_HELP_U_P' 'H_COMP_5_A_P' 'H_COMP_5_SN_P' 'H_COMP_5_U_P'
 'H_COMP_5_LINEAR_SCORE' 'H_COMP_5_STAR_RATING' 'H_MED_FOR_A_P'
 'H_MED_FOR_SN_P' 'H_MED_FOR_U_P' 'H_SIDE_EFFECTS_A_P'
 'H_SIDE_EFFECTS_SN_P'

## Outpatient Imaging Efficiency

In [10]:
df = pd.read_pickle(hos_dir + 'Outpatient_Imaging_Efficiency/CombinedFiles_Outpatient_Imaging_Efficiency/Facility.pkl')
df = df[df['file_year'] == '2023']
df = df[df['file_month'] == '04']
print(df['Measure ID'].unique())

measures = ['OP-13']
df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Measure ID', 'Score'], axis=1)

oie_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    
    oie_df = oie_df.merge(tdf2, on='Facility ID', how='outer')
    
    
oie_df.rename(columns={'Facility ID': 'PROVIDER_ID',
                       'OP-13': 'OP_13',
                   }, inplace=True)

for c in list(oie_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
oie_df1 = curate(oie_df)


['OP-10' 'OP-13' 'OP-39' 'OP-8']
6 remaining features: ['HCP_COVID_19_DEN', 'OP_10', 'OP_10_DEN', 'OP_13_DEN', 'OP_8', 'OP_8_DEN'] 



## Part 2 of Outpatient Imaging Efficiency

In [11]:
df = pd.read_pickle(hos_dir + 'Outpatient_Imaging_Efficiency/CombinedFiles_Outpatient_Imaging_Efficiency/Facility.pkl')
df = df[df['file_year'] == '2023']
df = df[df['file_month'] == '01']
print(df['Measure ID'].unique())

measures = ['OP-8', 'OP-10']
df = df[df['Measure ID'].isin(measures)]
df = df.filter(items = ['Facility ID', 'Measure ID', 'Score'], axis=1)

oie_df = pd.DataFrame(columns=['Facility ID'])
for m in measures:
    tdf1 = df[df['Measure ID'] == m]
    
    tdf2 = pd.DataFrame(columns=['Facility ID', m]) 
    tdf2['Facility ID'] = tdf1['Facility ID'].tolist()
    tdf2[m] = tdf1['Score'].tolist()
    
    oie_df = oie_df.merge(tdf2, on='Facility ID', how='outer')
    
oie_df.rename(columns={'Facility ID': 'PROVIDER_ID',
                       'OP-8': 'OP_8',
                       'OP-10': 'OP_10',
                       'OP-13': 'OP_13',
                   }, inplace=True)

for c in list(oie_df):
    try:
        remaining_ls.remove(c)
    except:
        pass

print(len(remaining_ls), 'remaining features:', sorted(remaining_ls), '\n')
oie_df2 = curate(oie_df)


['OP-10' 'OP-13' 'OP-39' 'OP-8']
4 remaining features: ['HCP_COVID_19_DEN', 'OP_10_DEN', 'OP_13_DEN', 'OP_8_DEN'] 



## MERGE DATAFRAME AND COMPARE TO SAS FILE

In [12]:
main_df = tec_df.merge(tec2_df, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(cad_df, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(HCAHPS_df, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(uhv_df, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(hai_df, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(oie_df1, on='PROVIDER_ID', how='outer')
main_df = main_df.merge(oie_df2, on='PROVIDER_ID', how='outer')


In [13]:
main_df.head()

Unnamed: 0,PROVIDER_ID,IMM_3,IMM_3_DEN,OP_18B,OP_18B_DEN,OP_2,OP_2_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,OP_3B,OP_3B_DEN,SEP_1,SEP_1_DEN,HCP_COVID_19,PC_01,PC_01_DEN,MORT_30_AMI,MORT_30_AMI_DEN,MORT_30_CABG,MORT_30_CABG_DEN,MORT_30_COPD,MORT_30_COPD_DEN,MORT_30_HF,MORT_30_HF_DEN,MORT_30_PN,MORT_30_PN_DEN,MORT_30_STK,MORT_30_STK_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,COMP_HIP_KNEE,COMP_HIP_KNEE_DEN,PSI_90_SAFETY,PSI_90_SAFETY_DEN,H_COMP_1_STAR_RATING,H_NUMB_COMP,H_RESP_RATE_P,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,EDAC_30_AMI,EDAC_30_AMI_DEN,EDAC_30_HF,EDAC_30_HF_DEN,EDAC_30_PN,EDAC_30_PN_DEN,OP_32,OP_32_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN,READM_30_CABG,READM_30_CABG_DEN,READM_30_COPD,READM_30_COPD_DEN,READM_30_HIP_KNEE,READM_30_HIP_KNEE_DEN,READM_30_HOSP_WIDE,READM_30_HOSP_WIDE_DEN,HAI_1_DEN_PRED,HAI_1_DEN_VOL,HAI_1,HAI_2_DEN_PRED,HAI_2_DEN_VOL,HAI_2,HAI_3_DEN_PRED,HAI_3_DEN_VOL,HAI_3,HAI_4_DEN_PRED,HAI_4_DEN_VOL,HAI_4,HAI_5_DEN_PRED,HAI_5_DEN_VOL,HAI_5,HAI_6_DEN_PRED,HAI_6_DEN_VOL,HAI_6,OP_13,OP_8,OP_10
0,10001,97,3795,205,323,Not Available,Not Available,3,51079,Not Available,Not Available,81,16,Not Available,Not Available,46,146,73.7,9,34,12.4,317,4.7,172,8.5,182,8.3,630,15.9,407,16.4,489,173.39,120,2.4,102,1.01,Not Applicable,2,434,15,3,2,4,4,3,3.0,3.5,1.9,319,21.8,755,-1.5,436,14.1,254,10.2,214,4.7,214,1,688,11.7,165,19.9,202,4.2,98,14.2,3058,10.597,10024,0.661,26.63,17731,0.300,4.548,154,1.099,1.845,200,0.000,9.412,101908,0.850,72.686,101451,0.660,6.7,42.5,5.7
1,10005,90,2593,146,1003,Not Available,Not Available,3,54503,73,15,99,108,Not Available,Not Available,59,242,82.1,1,194,12.6,52,Not Available,Not Available,8.1,195,16.9,153,21.8,369,16.6,100,142.88,35,1.8,134,0.91,Not Applicable,3,717,16,4,2,3,3,3,3.0,3.5,4.7,38,9.3,157,2.2,361,14.6,850,11.1,108,5.7,108,0.9,362,Not Available,Not Available,17.6,234,4,178,13.9,1258,2.45,3713,3.673,4.995,8670,1.201,2.512,88,1.194,0.316,38,Not Available,1.999,38413,0.000,10.484,35686,0.858,4.3,54.5,13.8
2,10006,64,2292,144,363,Not Available,Not Available,1,41137,57,14,88,75,Not Available,Not Available,58,142,65.1,0,37,16.5,295,3.5,117,7.8,209,12.2,468,17.8,528,18.9,261,157.42,84,3.4,234,1.10,Not Applicable,2,1358,17,3,1,2,3,2,2.0,2.5,25.9,312,-2.3,550,42.3,538,12.5,1505,Not Available,Not Available,Not Available,Not Available,1.1,468,15.6,109,17.7,235,4.8,246,14.2,2555,7.924,7318,0.757,15.296,11755,0.196,2.523,91,0.396,0.373,35,Not Available,4.164,62709,1.441,22.618,54159,0.088,1.4,41.2,11.0
3,10007,61,318,119,1202,Not Available,Not Available,3,11120,Not Available,Not Available,63,68,Not Available,Not Available,93,55,57.4,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,10.3,63,13.9,45,21.7,106,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0.99,Not Applicable,3,173,23,5,4,4,4,4,4.0,4.0,Not Available,Not Available,36.3,51,-12.6,99,15.3,118,Not Available,Not Available,Not Available,Not Available,1,56,Not Available,Not Available,19.7,72,Not Available,Not Available,15.1,272,0.165,268,Not Available,0.774,1417,Not Available,0.152,6,Not Available,Not Available,Not Available,Not Available,0.132,5484,Not Available,2.148,5413,0.466,Not Available,Not Available,5.9
4,10008,46,125,113,346,Not Available,Not Available,0,6205,Not Available,Not Available,52,23,Not Available,Not Available,Not Available,Not Available,62.3,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,19.7,26,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Applicable,Not Available,47,22,Not Available,Not Available,Not Available,Not Available,Not Available,,,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,14.3,62,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,14.5,93,0.008,14,Not Available,0.265,488,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0.051,2171,Not Available,0.398,2171,Not Available,Not Available,Not Available,2.1


In [15]:
prvdrs_sas = sorted(sas_input_df['PROVIDER_ID'].tolist())
prvdrs_main1 = sorted(main_df['PROVIDER_ID'].tolist())

ps = []
for p in prvdrs_sas:
    if p not in prvdrs_main1:
        ps.append(p)

print(len(ps), 'hospitals in the SAS file that are not in the reconstructed file:')
print(ps, '\n')


ps = []
for p in prvdrs_main1:
    if p not in prvdrs_sas:
        ps.append(p)

print(len(ps), 'hospitals in the reconstructed file that are not in the SAS file:')
print(ps, '\n')

print('sas_input_df.shape:', sas_input_df.shape)
print('main_df.shape:', main_df.shape)

1 hospitals in the SAS file that are not in the reconstructed file:
['670265'] 

203 hospitals in the reconstructed file that are not in the SAS file:
['013300', '013301', '02013F', '02014F', '030152', '031307', '033302', '043300', '043301', '05015F', '05020F', '05022F', '05039F', '05041F', '050546', '050849', '051309', '053300', '053301', '053302', '053303', '053304', '053305', '053306', '053308', '053309', '053311', '06003F', '061323', '063301', '063303', '070038', '070040', '073300', '083300', '093300', '10013F', '10021F', '100298', '100361', '103300', '103301', '103304', '11032F', '11033F', '11035F', '113300', '113301', '12001F', '123300', '123301', '143300', '143301', '143302', '150193', '161315', '17002F', '173300', '18003F', '190300', '190302', '190315', '19050F', '191308', '191314', '191319', '193300', '193301', '21007F', '21010F', '213300', '213301', '223300', '223302', '223303', '223304', '233300', '243300', '243302', '25039F', '251307', '251314', '251339', '26002F', '261315'

In [16]:
# Remove providers from main_df that are not in sas_input_df
main_df = main_df[main_df['PROVIDER_ID'].isin(prvdrs_sas)]
prvdrs_main1 = sorted(main_df['PROVIDER_ID'].tolist())

# Remove columns in main_df that are not in the original sas output data
main_df = main_df.filter(sas_cols, axis=1)

for col in list(main_df):
    if col != 'PROVIDER_ID':
        main_df[col] = pd.to_numeric(main_df[col], errors='coerce')

ls = ['READM_30_HIP_KNEE', 'READM_30_COPD', 'MORT_30_STK', 'MORT_30_PN',
      'MORT_30_HF', 'MORT_30_COPD', 'MORT_30_AMI', 'COMP_HIP_KNEE', 'OP_22',
      'OP_23', 'OP_29', 'IMM_3', 'PC_01', 'SEP_1', 'MORT_30_CABG',
      'READM_30_CABG', 'READM_30_HOSP_WIDE', 'OP_2', 'OP_8',
      'OP_10', 'OP_13']

for l in ls:
    main_df[l] = main_df[l] * 0.01

print('main_df.shape:', main_df.shape)
print('sas_input_df.shape:', sas_input_df.shape)


main_df.shape: (4653, 91)
sas_input_df.shape: (4654, 95)


In [17]:
main_df.head()

Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_10,OP_13,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,HCP_COVID_19,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,10024.0,17731.0,154.0,200.0,101908.0,101451.0,10.597,26.63,4.548,1.845,9.412,72.686,0.661,0.3,1.099,0.0,0.85,0.66,0.142,0.042,21.8,0.199,1.9,-1.5,0.164,0.159,0.083,0.085,0.124,0.024,3058.0,98.0,755.0,202.0,319.0,436.0,489.0,407.0,630.0,182.0,317.0,102.0,,,,,0.425,0.057,0.067,205.0,323.0,0.03,51079.0,,,0.81,16.0,173.39,120.0,1.01,3795.0,0.97,73.7,0.09,34.0,0.46,146.0,15.0,2.0,3.0,2.0,4.0,4.0,3.0,3.0,3.5,434.0,,0.047,172.0,0.117,165.0,14.1,254.0,10.2,214.0,4.7,214.0,1.0,688.0
1,10005,3713.0,8670.0,88.0,38.0,38413.0,35686.0,2.45,4.995,2.512,0.316,1.999,10.484,3.673,1.201,1.194,,0.0,0.858,0.139,0.04,9.3,0.176,4.7,2.2,0.166,0.218,0.169,0.081,0.126,0.018,1258.0,178.0,157.0,234.0,38.0,361.0,100.0,369.0,153.0,195.0,52.0,134.0,,,,,0.545,0.138,0.043,146.0,1003.0,0.03,54503.0,0.73,15.0,0.99,108.0,142.88,35.0,0.91,2593.0,0.9,82.1,0.01,194.0,0.59,242.0,16.0,3.0,4.0,2.0,3.0,3.0,3.0,3.0,3.5,717.0,,,,,,14.6,850.0,11.1,108.0,5.7,108.0,0.9,362.0
2,10006,7318.0,11755.0,91.0,35.0,62709.0,54159.0,7.924,15.296,2.523,0.373,4.164,22.618,0.757,0.196,0.396,,1.441,0.088,0.142,0.048,-2.3,0.177,25.9,42.3,0.189,0.178,0.122,0.078,0.165,0.034,2555.0,246.0,550.0,235.0,312.0,538.0,261.0,528.0,468.0,209.0,295.0,234.0,,,,,0.412,0.11,0.014,144.0,363.0,0.01,41137.0,0.57,14.0,0.88,75.0,157.42,84.0,1.1,2292.0,0.64,65.1,0.0,37.0,0.58,142.0,17.0,2.0,3.0,1.0,2.0,3.0,2.0,2.0,2.5,1358.0,,0.035,117.0,0.156,109.0,12.5,1505.0,,,,,1.1,468.0
3,10007,268.0,1417.0,6.0,,5484.0,5413.0,0.165,0.774,0.152,,0.132,2.148,,,,,,0.466,0.151,,36.3,0.197,,-12.6,,0.217,0.139,0.103,,,272.0,,51.0,72.0,,99.0,,106.0,45.0,63.0,,,,,,,,0.059,,119.0,1202.0,0.03,11120.0,,,0.63,68.0,,,0.99,318.0,0.61,57.4,,,0.93,55.0,23.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,173.0,,,,,,15.3,118.0,,,,,1.0,56.0
4,10008,14.0,488.0,,,2171.0,2171.0,0.008,0.265,,,0.051,0.398,,,,,,,0.145,,,,,,,0.197,,,,,93.0,,,,,,,26.0,,,,,,,,,,0.021,,113.0,346.0,0.0,6205.0,,,0.52,23.0,,,,125.0,0.46,62.3,,,,,22.0,,,,,,,,,47.0,,,,,,14.3,62.0,,,,,,


In [18]:
def get_apd(obs, exp):
    if len(obs) != len(exp):
        raise ValueError("Both lists must have the same length")

    n = len(obs)
    apd_values = []

    for i in range(n):
        numerator = abs(obs[i] - exp[i])
        denominator = (obs[i] + exp[i]) / 2

        # Avoid division by zero
        if denominator == 0:
            apd = 0  # Both obs[i] and exp[i] are zero, no error
        else:
            apd = numerator / denominator * 100

        apd_values.append(apd)

    average_apd = sum(apd_values) / n
    return average_apd


In [19]:
apd_ls = []
n_perfect = 0
n_miss = 0
n_tot = 0

tdf = main_df.copy(deep=True)

prvdrs = tdf['PROVIDER_ID'].unique()
sas_tdf = sas_input_df[sas_input_df['PROVIDER_ID'].isin(prvdrs)]

tdf.sort_values(by=['PROVIDER_ID'], inplace=True)
sas_tdf.sort_values(by=['PROVIDER_ID'], inplace=True)

tdf.fillna(0, inplace=True)
sas_tdf.fillna(0, inplace=True)

labels = ['MORT_30_AMI',
          'MORT_30_CABG',
          'MORT_30_COPD', 
          'MORT_30_HF',
          'MORT_30_PN',
          'MORT_30_STK',
          'PSI_4_SURG_COMP',
          'COMP_HIP_KNEE', 
          'HAI_1',
          'HAI_2',
          'HAI_3',
          'HAI_4',
          'HAI_5',
          'HAI_6',
          'PSI_90_SAFETY',
          'EDAC_30_AMI',
          'EDAC_30_HF',
          'EDAC_30_PN', 
          'OP_32',
          'READM_30_CABG',
          'READM_30_COPD',
          'READM_30_HIP_KNEE',
          'READM_30_HOSP_WIDE',
          'OP_35_ADM', 
          'OP_35_ED',
          'OP_36',
          'H_COMP_1_STAR_RATING',
          'H_COMP_2_STAR_RATING',
          'H_COMP_3_STAR_RATING',
          'H_COMP_5_STAR_RATING',
          'H_COMP_6_STAR_RATING',
          'H_COMP_7_STAR_RATING',
          'H_GLOB_STAR_RATING',
          'H_INDI_STAR_RATING',
          'IMM_3',
          'OP_22',
          'OP_23',
          'OP_29',
          #'OP_30',
          #'OP_33',
          'PC_01',
          'SEP_1',
          'OP_3B',
          'OP_18B',
          #'ED_2B',
          'OP_8',
          'OP_10',
          'OP_13',
          'OP_2',
          'HCP_COVID_19',
         ]

print(len(labels), 'individual measures used in stars SAS files')
for label in labels:
        
    obs = tdf[label].astype('float').tolist()
    exp = sas_tdf[label].astype('float').tolist()
    
    result = get_apd(obs, exp)
    apd_ls.append(result)
    
    for i, o in enumerate(obs):
        e = exp[i]
        if o == e:
            n_perfect += 1
            n_tot += 1
        else:
            n_miss += 1
            n_tot += 1

print('Results for Average Percent Difference:')
print('median:', np.nanmedian(apd_ls))
q1 = np.percentile(apd_ls, 25)
q3 = np.percentile(apd_ls, 75)
print('Q1:', q1)
print('Q3:', q3)

print('% of all measures that were perfectly reproduced:', 100*n_perfect/n_tot)

47 individual measures used in stars SAS files
Results for Average Percent Difference:
median: 0.04298302170642596
Q1: 0.0
Q3: 0.17193208682570393
% of all measures that were perfectly reproduced: 96.40451596087631
