In [1]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
from functools import reduce

# Notebook-wide settings: silence every warning and lift pandas' display
# truncation so wide / long frames are rendered in full.
warnings.filterwarnings('ignore')
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Root folders used to build every input / output path in the cells below.
stars_dir = '~/GitHub/stars-data-builder/'
hos_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

# Accumulator for the reporting period of each measure; every measure cell
# appends its (Measure ID, Start Date, End Date) rows to it.
date_columns = ['Measure ID', 'Start Date', 'End Date']
dates_df = pd.DataFrame(columns=date_columns)

In [2]:
def curate(df):
    """Normalise a measure table before it is merged into the Stars input.

    Steps, in order:
      1. If a ``PROVIDER_ID`` column exists: drop rows whose ID is missing,
         cast the IDs to ``str`` and prepend a single ``'0'`` to every ID
         shorter than six characters.
      2. Remove tab characters from every column that supports the ``.str``
         accessor; other columns are left as they are.
      3. Drop a stray ``'Unnamed: 0'`` column if present.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is not modified; a cleaned copy is returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    if 'PROVIDER_ID' in df.columns:
        # NaN never compares equal to anything, so `col != np.nan` is True for
        # every row and filters nothing; `notna()` is the test that actually
        # removes missing IDs (which would otherwise end up as '0nan').
        df = df[df['PROVIDER_ID'].notna()].copy()

        ids = df['PROVIDER_ID'].values.astype(str).tolist()
        # Only one leading zero is restored, exactly as before.
        # NOTE(review): presumably IDs lose at most one leading zero when they
        # are parsed as numbers - confirm before switching to zfill(6).
        df['PROVIDER_ID'] = ['0' + i if len(i) < 6 else i for i in ids]
    else:
        df = df.copy()

    for c in list(df):
        try:
            df[c] = df[c].str.replace("\t", "")
        except (AttributeError, TypeError):
            # Column has no usable `.str` accessor (e.g. numeric dtype):
            # there is nothing to strip, so leave it untouched.
            continue

    return df.drop(columns=['Unnamed: 0'], errors='ignore')

# Use 2023 SAS input file to get the column labels needed for 2024 prognostication

In [3]:
# Load the July 2023 Stars SAS input file. Its column labels define the layout
# that the 2024 file assembled below has to reproduce.
sas_input_df_2023 = pd.read_sas(stars_dir + 'Reproduce_Stars_Input/2023/Input_File/alldata_2023jul.sas7bdat',
                                format='sas7bdat', encoding="utf8")

# Working checklist: each measure cell removes the labels it has produced, so
# the "remaining features" printouts show what is still missing.
sas_cols_2023 = list(sas_input_df_2023)

print(len(sas_cols_2023), 'columns in 2023 Stars SAS input file (not all get used by the SAS programs):')
print(sas_cols_2023, '\n')

print('Create columns for 2024 data')

# Independent copy that keeps the complete 2023 column order; it is used at
# the end to select and order the columns of the merged frame.
sas_cols = list(sas_cols_2023)

sas_input_df_2023 = curate(sas_input_df_2023)

# `.unique()` already removes duplicates, so sorting is all that is left to do.
prvdrs_2023 = sorted(sas_input_df_2023['PROVIDER_ID'].unique().tolist())

# The file read above is the Stars *input* file (the message used to say "output").
print(len(prvdrs_2023), 'hospitals in 2023 Stars input file\n')

print(len(sas_cols_2023), 'remaining features:', sorted(sas_cols_2023), '\n')

sas_input_df_2023.head()

95 columns in 2023 Stars SAS input file (not all get used by the SAS programs):
['PROVIDER_ID', 'HAI_1_DEN_VOL', 'HAI_2_DEN_VOL', 'HAI_3_DEN_VOL', 'HAI_4_DEN_VOL', 'HAI_5_DEN_VOL', 'HAI_6_DEN_VOL', 'HAI_1_DEN_PRED', 'HAI_2_DEN_PRED', 'HAI_3_DEN_PRED', 'HAI_4_DEN_PRED', 'HAI_5_DEN_PRED', 'HAI_6_DEN_PRED', 'HAI_1', 'HAI_2', 'HAI_3', 'HAI_4', 'HAI_5', 'HAI_6', 'READM_30_HOSP_WIDE', 'READM_30_HIP_KNEE', 'EDAC_30_HF', 'READM_30_COPD', 'EDAC_30_AMI', 'EDAC_30_PN', 'MORT_30_STK', 'MORT_30_PN', 'MORT_30_HF', 'MORT_30_COPD', 'MORT_30_AMI', 'COMP_HIP_KNEE', 'READM_30_HOSP_WIDE_DEN', 'READM_30_HIP_KNEE_DEN', 'EDAC_30_HF_DEN', 'READM_30_COPD_DEN', 'EDAC_30_AMI_DEN', 'EDAC_30_PN_DEN', 'MORT_30_STK_DEN', 'MORT_30_PN_DEN', 'MORT_30_HF_DEN', 'MORT_30_COPD_DEN', 'MORT_30_AMI_DEN', 'COMP_HIP_KNEE_DEN', 'OP_2', 'OP_2_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'PSI_4_S

Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_8_DEN,OP_10,OP_10_DEN,OP_13,OP_13_DEN,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,HCP_COVID_19_DEN,HCP_COVID_19,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,10024.0,17731.0,154.0,200.0,101908.0,101451.0,10.597,26.63,4.548,1.845,9.412,72.686,0.661,0.3,1.099,0.0,0.85,0.66,0.142,0.042,21.8,0.199,1.9,-1.5,0.164,0.159,0.083,0.085,0.124,0.024,3058.0,98.0,755.0,202.0,319.0,436.0,489.0,407.0,630.0,182.0,317.0,102.0,,,,,0.425,146.0,0.057,1488.0,0.067,208.0,205.0,323.0,0.03,51079.0,,,0.81,16.0,173.39,120.0,1.01,3795.0,0.97,2323.0,0.737,0.09,34.0,0.46,146.0,15.0,2.0,3.0,2.0,4.0,4.0,3.0,3.0,3.5,434.0,2046.895485,0.047,172.0,0.117,165.0,14.1,254.0,10.2,214.0,4.7,214.0,1.0,688.0
1,10005,3713.0,8670.0,88.0,,38413.0,35686.0,2.45,4.995,2.512,,1.999,10.484,3.673,1.201,1.194,,0.0,0.858,0.139,0.04,9.3,0.176,4.7,2.2,0.166,0.218,0.169,0.081,0.126,0.018,1258.0,178.0,157.0,234.0,38.0,361.0,100.0,369.0,153.0,195.0,52.0,134.0,,,,,0.545,191.0,0.138,1214.0,0.043,208.0,146.0,1003.0,0.03,54503.0,0.73,15.0,0.99,108.0,142.88,35.0,0.91,2593.0,0.9,2026.0,0.821,0.01,194.0,0.59,242.0,16.0,3.0,4.0,2.0,3.0,3.0,3.0,3.0,3.5,717.0,819.043002,,,,,14.6,850.0,11.1,108.0,5.7,108.0,0.9,362.0
2,10006,7318.0,11755.0,91.0,,62709.0,54159.0,7.924,15.296,2.523,,4.164,22.618,0.757,0.196,0.396,,1.441,0.088,0.142,0.048,-2.3,0.177,25.9,42.3,0.189,0.178,0.122,0.078,0.165,0.034,2555.0,246.0,550.0,235.0,312.0,538.0,261.0,528.0,468.0,209.0,295.0,234.0,,,,,0.412,97.0,0.11,1168.0,0.014,217.0,144.0,363.0,0.01,41137.0,0.57,14.0,0.88,75.0,157.42,84.0,1.1,2292.0,0.64,2694.0,0.651,0.0,37.0,0.58,142.0,17.0,2.0,3.0,1.0,2.0,3.0,2.0,2.0,2.5,1358.0,1487.163359,0.035,117.0,0.156,109.0,12.5,1505.0,,,,,1.1,468.0
3,10007,,,,,,5413.0,,,,,,2.148,,,,,,0.466,0.151,,36.3,0.197,,-12.6,,0.217,0.139,0.103,,,272.0,,51.0,72.0,,99.0,,106.0,45.0,63.0,,,,,,,,,0.059,169.0,,,119.0,1202.0,0.03,11120.0,,,0.63,68.0,,,0.99,318.0,0.61,277.0,0.574,,,0.93,55.0,23.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,173.0,142.073902,,,,,15.3,118.0,,,,,1.0,56.0
4,10008,,,,,,,,,,,,,,,,,,,0.145,,,,,,,0.197,,,,,93.0,,,,,,,26.0,,,,,,,,,,,0.021,97.0,,,113.0,346.0,0.0,6205.0,,,0.52,23.0,,,,125.0,0.46,169.0,0.623,,,,,,,,,,,,,,,,,,,,14.3,62.0,,,,,,


## HAIs

In [4]:
# Build the HAI columns of the Stars input from the Care Compare
# Healthcare-Associated Infections file.
df = pd.read_csv(stars_dir + 'CareCompare/hospitals_11_2023/Healthcare_Associated_Infections-Hospital.csv')
#print(df['Measure ID'].unique())

# Care Compare suffix -> suffix appended to 'HAI_<n>' in the SAS layout
# ('' means the SIR value is stored under the bare 'HAI_<n>' label).
hai_suffixes = {'ELIGCASES': '_DEN_PRED', 'DOPC': '_DEN_VOL', 'SIR': ''}
measures = [f'HAI_{n}_{suffix}' for n in range(1, 7) for suffix in hai_suffixes]

# Record the distinct reporting period(s) of these measures.
# NOTE(review): 'End Date' / 'Start Date' are column names, not measure IDs;
# presumably they never match a row - kept so the selection is unchanged.
tdf = df[df['Measure ID'].isin(measures + ['End Date', 'Start Date'])]
tdf = tdf.filter(items=['Measure ID', 'Start Date', 'End Date']).drop_duplicates()
# ignore_index=True keeps dates_df on a clean 0..n-1 index, like the later cells do.
dates_df = pd.concat([dates_df, tdf], axis=0, ignore_index=True)

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items=['Facility ID', 'Measure ID', 'Score'], axis=1)

# Long -> wide: one outer merge per measure, so a facility that lacks a
# measure still gets a row (with NaN in that column).
hai_df = pd.DataFrame(columns=['Facility ID'])
for measure in measures:
    rows = df[df['Measure ID'] == measure]
    wide = pd.DataFrame({'Facility ID': rows['Facility ID'].tolist(),
                         measure: rows['Score'].tolist()})
    hai_df = hai_df.merge(wide, on='Facility ID', how='outer')

# Translate Care Compare labels into the SAS input labels.
hai_renames = {f'HAI_{n}_{suffix}': f'HAI_{n}{sas_suffix}'
               for n in range(1, 7)
               for suffix, sas_suffix in hai_suffixes.items()}
hai_renames['Facility ID'] = 'PROVIDER_ID'
hai_df = hai_df.rename(columns=hai_renames)

# Tick the produced columns off the checklist; labels that are not on it
# (or were already removed) are simply skipped.
for c in list(hai_df):
    if c in sas_cols_2023:
        sas_cols_2023.remove(c)

print(len(sas_cols_2023), 'remaining features:', sorted(sas_cols_2023), '\n')
hai_df = curate(hai_df)

76 remaining features: ['COMP_HIP_KNEE', 'COMP_HIP_KNEE_DEN', 'EDAC_30_AMI', 'EDAC_30_AMI_DEN', 'EDAC_30_HF', 'EDAC_30_HF_DEN', 'EDAC_30_PN', 'EDAC_30_PN_DEN', 'HCP_COVID_19', 'HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'MORT_30_AMI', 'MORT_30_AMI_DEN', 'MORT_30_CABG', 'MORT_30_CABG_DEN', 'MORT_30_COPD', 'MORT_30_COPD_DEN', 'MORT_30_HF', 'MORT_30_HF_DEN', 'MORT_30_PN', 'MORT_30_PN_DEN', 'MORT_30_STK', 'MORT_30_STK_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_32', 'OP_32_DEN', 'OP_35_ADM', 'OP_35_ADM_DEN', 'OP_35_ED', 'OP_35_ED_DEN', 'OP_36', 'OP_36_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'PSI_4_SURG_COMP', 'PSI_4_SURG_COMP_DE

## Unplanned Hospital Visits


In [5]:
# Build the Unplanned Hospital Visits columns (score plus '<measure>_DEN'
# denominator) from the Care Compare file.
df = pd.read_csv(stars_dir + 'CareCompare/hospitals_11_2023/Unplanned_Hospital_Visits-Hospital.csv')
measures = ['EDAC_30_AMI', 'EDAC_30_HF', 'EDAC_30_PN', 'OP_32', 'OP_35_ADM', 'OP_35_ED', 'OP_36',
            'READM_30_CABG', 'READM_30_COPD', 'READM_30_HIP_KNEE', 'READM_30_HOSP_WIDE']

# Record the distinct reporting period(s) of these measures.
# NOTE(review): 'End Date' / 'Start Date' are column names, not measure IDs;
# presumably they never match a row - kept so the selection is unchanged.
tdf = df[df['Measure ID'].isin(measures + ['End Date', 'Start Date'])]
tdf = tdf.filter(items=['Measure ID', 'Start Date', 'End Date']).drop_duplicates()
# ignore_index=True keeps dates_df on a clean 0..n-1 index, like the later cells do.
dates_df = pd.concat([dates_df, tdf], axis=0, ignore_index=True)

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items=['Facility ID', 'Denominator', 'Measure ID', 'Score'], axis=1)

# Long -> wide: one outer merge per measure, so a facility that lacks a
# measure still gets a row (with NaN in those columns).
uhv_df = pd.DataFrame(columns=['Facility ID'])
for measure in measures:
    rows = df[df['Measure ID'] == measure]
    wide = pd.DataFrame({'Facility ID': rows['Facility ID'].tolist(),
                         measure: rows['Score'].tolist(),
                         measure + '_DEN': rows['Denominator'].tolist()})
    uhv_df = uhv_df.merge(wide, on='Facility ID', how='outer')

uhv_df = uhv_df.rename(columns={'Facility ID': 'PROVIDER_ID'})

# Tick the produced columns off the checklist; labels that are not on it
# (or were already removed) are simply skipped.
for c in list(uhv_df):
    if c in sas_cols_2023:
        sas_cols_2023.remove(c)

print(len(sas_cols_2023), 'remaining features:', sorted(sas_cols_2023), '\n')
uhv_df = curate(uhv_df)

54 remaining features: ['COMP_HIP_KNEE', 'COMP_HIP_KNEE_DEN', 'HCP_COVID_19', 'HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'MORT_30_AMI', 'MORT_30_AMI_DEN', 'MORT_30_CABG', 'MORT_30_CABG_DEN', 'MORT_30_COPD', 'MORT_30_COPD_DEN', 'MORT_30_HF', 'MORT_30_HF_DEN', 'MORT_30_PN', 'MORT_30_PN_DEN', 'MORT_30_STK', 'MORT_30_STK_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'PSI_4_SURG_COMP', 'PSI_4_SURG_COMP_DEN', 'PSI_90_SAFETY', 'PSI_90_SAFETY_DEN', 'SEP_1', 'SEP_1_DEN'] 



## COMPLICATIONS AND DEATHS

In [6]:
# Build the Complications and Deaths columns (score plus '<measure>_DEN'
# denominator) from the Care Compare file.
df = pd.read_csv(stars_dir + 'CareCompare/hospitals_11_2023/Complications_and_Deaths-Hospital.csv')

measures = ['MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF',
            'MORT_30_PN', 'MORT_30_STK', 'PSI_04', 'COMP_HIP_KNEE',
            'PSI_90']

# Record the distinct reporting period(s) of these measures.
period_mask = df['Measure ID'].isin(measures + ['End Date', 'Start Date'])
tdf = df[period_mask].filter(items=['Measure ID', 'Start Date', 'End Date']).drop_duplicates()
dates_df = pd.concat([dates_df, tdf], axis=0, ignore_index=True)

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items=['Facility ID', 'Measure ID', 'Score', 'Denominator'], axis=1)

# Long -> wide: one outer merge per measure keeps facilities that lack a measure.
cad_df = pd.DataFrame(columns=['Facility ID'])
for measure in measures:
    rows = df[df['Measure ID'] == measure]
    wide = pd.DataFrame({'Facility ID': rows['Facility ID'].tolist(),
                         measure: rows['Score'].tolist(),
                         measure + '_DEN': rows['Denominator'].tolist()})
    cad_df = cad_df.merge(wide, on='Facility ID', how='outer')

# Care Compare labels -> SAS input labels.
cad_renames = {'Facility ID': 'PROVIDER_ID',
               'PSI_04': 'PSI_4_SURG_COMP',
               'PSI_04_DEN': 'PSI_4_SURG_COMP_DEN',
               'PSI_90': 'PSI_90_SAFETY',
               'PSI_90_DEN': 'PSI_90_SAFETY_DEN'}
cad_df = cad_df.rename(columns=cad_renames)

# Tick the produced columns off the checklist of still-missing SAS columns.
for c in list(cad_df):
    if c in sas_cols_2023:
        sas_cols_2023.remove(c)

print(len(sas_cols_2023), 'remaining features:', sorted(sas_cols_2023), '\n')
cad_df = curate(cad_df)

36 remaining features: ['HCP_COVID_19', 'HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'IMM_3', 'IMM_3_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_2', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'OP_2_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN', 'SEP_1', 'SEP_1_DEN'] 



## TIMELY AND EFFECTIVE CARE

Everything except PC-01, which for 2023 is located in the Maternal Health files of the hospitals data archive

In [7]:
# Build the Timely and Effective Care columns. 'Sample' supplies the
# '<measure>_DEN' column for every measure except HCP_COVID_19.
df = pd.read_csv(stars_dir + 'CareCompare/hospitals_11_2023/Timely_and_Effective_Care-Hospital.csv')

measures = ['IMM_3', 'OP_18b', 'OP_2', 'OP_22', 'OP_23', 'OP_29', 'OP_3b', 'SEP_1', 'HCP_COVID_19']

# Record the distinct reporting period(s) of these measures.
period_mask = df['Measure ID'].isin(measures + ['End Date', 'Start Date'])
tdf = df[period_mask].filter(items=['Measure ID', 'Start Date', 'End Date']).drop_duplicates()
dates_df = pd.concat([dates_df, tdf], axis=0, ignore_index=True)

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items=['Facility ID', 'Sample', 'Measure ID', 'Score'], axis=1)

# Long -> wide: one outer merge per measure keeps facilities that lack a measure.
tec_df = pd.DataFrame(columns=['Facility ID'])
for measure in measures:
    rows = df[df['Measure ID'] == measure]
    wide_columns = {'Facility ID': rows['Facility ID'].tolist(),
                    measure: rows['Score'].tolist()}
    # No denominator column is taken for HCP_COVID_19 here; HCP_COVID_19_DEN
    # is added as an all-NaN column in the merge cell.
    if measure != 'HCP_COVID_19':
        wide_columns[measure + '_DEN'] = rows['Sample'].tolist()

    tec_df = tec_df.merge(pd.DataFrame(wide_columns), on='Facility ID', how='outer')

# Care Compare labels -> SAS input labels (upper-case 'B' suffix).
tec_renames = {'Facility ID': 'PROVIDER_ID',
               'OP_18b': 'OP_18B',
               'OP_18b_DEN': 'OP_18B_DEN',
               'OP_3b': 'OP_3B',
               'OP_3b_DEN': 'OP_3B_DEN'}
tec_df = tec_df.rename(columns=tec_renames)

# Tick the produced columns off the checklist of still-missing SAS columns.
for c in list(tec_df):
    if c in sas_cols_2023:
        sas_cols_2023.remove(c)

print(len(sas_cols_2023), 'remaining features:', sorted(sas_cols_2023), '\n')
tec_df = curate(tec_df)


19 remaining features: ['HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_8', 'OP_8_DEN', 'PC_01', 'PC_01_DEN'] 



## TIMELY AND EFFECTIVE CARE (PC-01)

Only PC-01, which for 2023 is located in the Maternal Health files of the hospitals data archive

In [8]:
# PC_01 is read from the Maternal Health file; 'Sample' supplies PC_01_DEN.
df = pd.read_csv(stars_dir + 'CareCompare/hospitals_11_2023/Maternal_Health-Hospital.csv')
measures = ['PC_01']

# Record the distinct reporting period(s) of the measure.
period_mask = df['Measure ID'].isin(measures + ['End Date', 'Start Date'])
tdf = df[period_mask].filter(items=['Measure ID', 'Start Date', 'End Date']).drop_duplicates()
dates_df = pd.concat([dates_df, tdf], axis=0, ignore_index=True)

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items=['Facility ID', 'Measure ID', 'Score', 'Sample'], axis=1)

# Same long -> wide pattern as the other measure cells (a single measure here).
tec2_df = pd.DataFrame(columns=['Facility ID'])
for measure in measures:
    rows = df[df['Measure ID'] == measure]
    wide = pd.DataFrame({'Facility ID': rows['Facility ID'].tolist(),
                         measure: rows['Score'].tolist(),
                         measure + '_DEN': rows['Sample'].tolist()})
    tec2_df = tec2_df.merge(wide, on='Facility ID', how='outer')

tec2_df = tec2_df.rename(columns={'Facility ID': 'PROVIDER_ID'})

# Tick the produced columns off the checklist of still-missing SAS columns.
for c in list(tec2_df):
    if c in sas_cols_2023:
        sas_cols_2023.remove(c)

print(len(sas_cols_2023), 'remaining features:', sorted(sas_cols_2023), '\n')

tec2_df = curate(tec2_df)


17 remaining features: ['HCP_COVID_19_DEN', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'H_NUMB_COMP', 'H_RESP_RATE_P', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_8', 'OP_8_DEN'] 



## HCAHPS

In [9]:
# Build the HCAHPS columns: six composite star ratings taken as-is, plus two
# derived ratings (H_GLOB_STAR_RATING, H_INDI_STAR_RATING) that are each the
# mean of a pair of published star ratings.
df = pd.read_csv(stars_dir + 'CareCompare/hospitals_11_2023/HCAHPS-Hospital.csv')

measures = ['H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING',
            'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 'H_CLEAN_STAR_RATING',  'H_QUIET_STAR_RATING',
            'H_RECMND_STAR_RATING', 'H_HSP_RATING_STAR_RATING']

# Record the distinct reporting period(s). This file names its ID column
# 'HCAHPS Measure ID', so it is renamed to match dates_df.
# NOTE(review): 'End Date' / 'Start Date' are column names, not measure IDs;
# presumably they never match a row - kept so the selection is unchanged.
tdf = df[df['HCAHPS Measure ID'].isin(measures + ['End Date', 'Start Date'])]
tdf = tdf.rename(columns={'HCAHPS Measure ID': 'Measure ID'})
tdf = tdf.filter(items=['Measure ID', 'Start Date', 'End Date']).drop_duplicates()
dates_df = pd.concat([dates_df, tdf], axis=0, ignore_index=True)

df = df[df['HCAHPS Measure ID'].isin(measures)]
df = df.filter(items=['Facility ID', 'HCAHPS Measure ID', 'Patient Survey Star Rating',
                      'Number of Completed Surveys', 'Survey Response Rate Percent'], axis=1)

# Long -> wide: one outer merge per measure keeps facilities that lack a measure.
HCAHPS_df = pd.DataFrame(columns=['Facility ID'])
for i, measure in enumerate(measures):
    rows = df[df['HCAHPS Measure ID'] == measure]
    wide_columns = {'Facility ID': rows['Facility ID'].tolist(),
                    measure: rows['Patient Survey Star Rating'].tolist()}
    if i == 0:
        # Survey volume and response rate are taken once, from the rows of
        # the first measure only.
        wide_columns['H_NUMB_COMP'] = rows['Number of Completed Surveys'].tolist()
        wide_columns['H_RESP_RATE_P'] = rows['Survey Response Rate Percent'].tolist()

    HCAHPS_df = HCAHPS_df.merge(pd.DataFrame(wide_columns), on='Facility ID', how='outer')

HCAHPS_df = HCAHPS_df.rename(columns={'Facility ID': 'PROVIDER_ID'})

# Derived rating -> the pair of published ratings it averages.
derived_ratings = {
    'H_GLOB_STAR_RATING': ['H_HSP_RATING_STAR_RATING', 'H_RECMND_STAR_RATING'],
    'H_INDI_STAR_RATING': ['H_CLEAN_STAR_RATING', 'H_QUIET_STAR_RATING'],
}
for derived, parts in derived_ratings.items():
    # 'Not Available' (and NaN left by the outer merge) becomes NaN instead of
    # 0. Assigning a new column avoids the chained `col.replace(inplace=True)`
    # pattern, which does not update the frame under pandas Copy-on-Write.
    stars = HCAHPS_df[parts].apply(pd.to_numeric, errors='coerce')
    # skipna=False: if either rating is missing the mean is missing, rather
    # than half of the rating that happens to be present.
    mean_stars = np.round(stars.sum(axis=1, skipna=False) / 2, 1)
    # Kept from the original: a mean of exactly 0 is treated as missing.
    HCAHPS_df[derived] = mean_stars.replace(0, np.nan)

# The four source ratings are not SAS input columns themselves.
HCAHPS_df = HCAHPS_df.drop(columns=['H_CLEAN_STAR_RATING', 'H_QUIET_STAR_RATING',
                                    'H_RECMND_STAR_RATING', 'H_HSP_RATING_STAR_RATING'])

# Tick the produced columns off the checklist of still-missing SAS columns.
for c in list(HCAHPS_df):
    if c in sas_cols_2023:
        sas_cols_2023.remove(c)

print(len(sas_cols_2023), 'remaining features:', sorted(sas_cols_2023), '\n')
HCAHPS_df = curate(HCAHPS_df)

7 remaining features: ['HCP_COVID_19_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_8', 'OP_8_DEN'] 



## Outpatient Imaging Efficiency

The July 2023 Overall Star Ratings were calculated using the measure data from the January 2023 Care Compare update, with the re-released OP-13 measure data publicly reported in April 2023 on Care Compare.

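For the 2023 ratings this meant two dataframes: one for OP-13 (April 2023) and one for the other measures (January 2023). The cell below instead builds a single dataframe, reading OP-8, OP-10 and OP-13 from the `hospitals_11_2023` release.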

In [10]:
# Build the Outpatient Imaging Efficiency columns. Only scores are taken
# here; the matching *_DEN columns are added as all-NaN in the merge cell.
df = pd.read_csv(stars_dir + 'CareCompare/hospitals_11_2023/Outpatient_Imaging_Efficiency-Hospital.csv')
measures = ['OP-13', 'OP-8', 'OP-10']

# Record the distinct reporting period(s) of these measures.
period_mask = df['Measure ID'].isin(measures + ['End Date', 'Start Date'])
tdf = df[period_mask].filter(items=['Measure ID', 'Start Date', 'End Date']).drop_duplicates()
dates_df = pd.concat([dates_df, tdf], axis=0, ignore_index=True)

df = df[df['Measure ID'].isin(measures)]
df = df.filter(items=['Facility ID', 'Measure ID', 'Score'], axis=1)

# Long -> wide: one outer merge per measure keeps facilities that lack a measure.
oie_df = pd.DataFrame(columns=['Facility ID'])
for measure in measures:
    rows = df[df['Measure ID'] == measure]
    wide = pd.DataFrame({'Facility ID': rows['Facility ID'].tolist(),
                         measure: rows['Score'].tolist()})
    oie_df = oie_df.merge(wide, on='Facility ID', how='outer')

# This file spells the IDs with hyphens; the SAS layout uses underscores.
oie_renames = {measure: measure.replace('-', '_') for measure in measures}
oie_renames['Facility ID'] = 'PROVIDER_ID'
oie_df = oie_df.rename(columns=oie_renames)

# Tick the produced columns off the checklist of still-missing SAS columns.
for c in list(oie_df):
    if c in sas_cols_2023:
        sas_cols_2023.remove(c)

print(len(sas_cols_2023), 'remaining features:', sorted(sas_cols_2023), '\n')
# The curated frame is stored under its own name; the merge cell uses oie_df1.
oie_df1 = curate(oie_df)


4 remaining features: ['HCP_COVID_19_DEN', 'OP_10_DEN', 'OP_13_DEN', 'OP_8_DEN'] 



## MERGE DATAFRAME AND COMPARE TO SAS FILE

In [11]:
# Persist the reporting periods collected by the measure cells above.
measure_dates_path = stars_dir + "2024/measure_dates.csv"
print(dates_df.shape)
dates_df.to_csv(measure_dates_path, index=False)

(61, 3)


In [12]:
# Combine the per-file frames into one row per provider. The outer joins run
# left to right in this order, so a provider present in any file is kept.
frames_to_merge = [tec_df, tec2_df, cad_df, HCAHPS_df, uhv_df, hai_df, oie_df1]
main_df = reduce(
    lambda merged, nxt: merged.merge(nxt, on='PROVIDER_ID', how='outer'),
    frames_to_merge,
)

# SAS layout columns that none of the cells above produced: add them empty.
for empty_col in ('OP_10_DEN', 'OP_13_DEN', 'OP_8_DEN', 'HCP_COVID_19_DEN'):
    main_df[empty_col] = np.nan

# Final pass over the checklist; it should be empty afterwards.
for c in list(main_df):
    if c in sas_cols_2023:
        sas_cols_2023.remove(c)

print(len(sas_cols_2023), 'remaining features:', sorted(sas_cols_2023), '\n')
print(main_df.shape)
main_df.head()

0 remaining features: [] 

(4829, 95)


Unnamed: 0,PROVIDER_ID,IMM_3,IMM_3_DEN,OP_18B,OP_18B_DEN,OP_2,OP_2_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,OP_3B,OP_3B_DEN,SEP_1,SEP_1_DEN,HCP_COVID_19,PC_01,PC_01_DEN,MORT_30_AMI,MORT_30_AMI_DEN,MORT_30_CABG,MORT_30_CABG_DEN,MORT_30_COPD,MORT_30_COPD_DEN,MORT_30_HF,MORT_30_HF_DEN,MORT_30_PN,MORT_30_PN_DEN,MORT_30_STK,MORT_30_STK_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,COMP_HIP_KNEE,COMP_HIP_KNEE_DEN,PSI_90_SAFETY,PSI_90_SAFETY_DEN,H_COMP_1_STAR_RATING,H_NUMB_COMP,H_RESP_RATE_P,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,EDAC_30_AMI,EDAC_30_AMI_DEN,EDAC_30_HF,EDAC_30_HF_DEN,EDAC_30_PN,EDAC_30_PN_DEN,OP_32,OP_32_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN,READM_30_CABG,READM_30_CABG_DEN,READM_30_COPD,READM_30_COPD_DEN,READM_30_HIP_KNEE,READM_30_HIP_KNEE_DEN,READM_30_HOSP_WIDE,READM_30_HOSP_WIDE_DEN,HAI_1_DEN_PRED,HAI_1_DEN_VOL,HAI_1,HAI_2_DEN_PRED,HAI_2_DEN_VOL,HAI_2,HAI_3_DEN_PRED,HAI_3_DEN_VOL,HAI_3,HAI_4_DEN_PRED,HAI_4_DEN_VOL,HAI_4,HAI_5_DEN_PRED,HAI_5_DEN_VOL,HAI_5,HAI_6_DEN_PRED,HAI_6_DEN_VOL,HAI_6,OP_13,OP_8,OP_10,OP_10_DEN,OP_13_DEN,OP_8_DEN,HCP_COVID_19_DEN
0,10001,95,3905,200,359,Not Available,Not Available,3,51079,Not Available,Not Available,81,16,Not Available,Not Available,55,128,83.9,0,24,12,278,4.1,132,8.8,107,8.9,549,18,400,14.8,398,173.39,120,2.7,49,1.01,Not Applicable,3,536,15,3,2,4,4,4,4.0,3.0,-15.4,274,23.4,614,23.6,403,14.1,254,10.2,214,4.7,214,1,688,10.5,126,19,117,3.8,49,14.2,2912,9.686,9230,1.136,25.198,17443,0.437,5.847,206,0.684,0.920,96,Not Available,11.422,104934,0.788,66.609,104477,0.631,2.8,38,6.1,,,,
1,10005,80,2700,147,1001,Not Available,Not Available,3,54503,62,13,99,108,59,17,64,236,81.7,2,205,13.6,27,Not Available,Not Available,9.9,126,14.9,121,23.3,289,15.3,81,142.88,35,2.3,155,0.91,Not Applicable,3,806,18,4,1,3,4,3,3.0,3.0,Not Available,Not Available,22.1,129,-6.2,285,14.6,850,11.1,108,5.7,108,0.9,362,Not Available,Not Available,16.6,136,3.4,172,12.8,1052,2.129,3345,2.349,4.516,8643,0.664,2.308,85,1.733,0.320,36,Not Available,1.885,37532,0.531,10.26,35284,0.487,4.2,47.7,12.0,,,,
2,10006,67,2536,159,361,Not Available,Not Available,1,41137,71,17,88,75,Not Available,Not Available,60,129,70.1,4,28,16.5,254,3.6,95,9.9,148,12.5,388,19.5,469,17.2,227,157.42,84,4.6,145,1.10,Not Applicable,3,1495,19,3,2,2,3,2,2.0,2.5,28.1,273,-4.7,441,-0.4,472,12.5,1505,Not Available,Not Available,Not Available,Not Available,1.1,468,12.4,89,17.6,158,5.3,138,13.4,2310,6.051,5594,0.661,12.152,9427,0.329,2.954,111,0.000,0.134,15,Not Available,5.035,63230,1.589,23.034,54350,0.043,4.5,46.2,10.1,,,,
3,10007,53,350,130,1287,Not Available,Not Available,3,11120,Not Available,Not Available,63,68,Not Available,Not Available,96,49,62.7,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,13.7,34,12.5,26,28.5,88,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0.99,Not Applicable,3,194,25,4,4,3,3,3,3.0,3.0,Not Available,Not Available,-1.9,31,-9.4,72,15.3,118,Not Available,Not Available,Not Available,Not Available,1,56,Not Available,Not Available,20,34,4.2,26,15.7,258,0.192,311,Not Available,0.712,1302,Not Available,0.107,4,Not Available,Not Available,Not Available,Not Available,0.14,5601,Not Available,2.589,5601,0.386,Not Available,Not Available,3.4,,,,
4,10008,45,126,121,346,Not Available,Not Available,0,6205,Not Available,Not Available,52,23,Not Available,Not Available,Not Available,Not Available,80.6,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Applicable,Not Available,43,25,Not Available,Not Available,Not Available,Not Available,Not Available,,,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,14.3,62,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,14.8,69,0.018,31,Not Available,0.19,350,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0.052,2228,Not Available,0.409,2228,Not Available,Not Available,Not Available,0.0,,,,


In [13]:
# Columns of the merged frame that the 2023 SAS layout does not contain.
ls = np.setdiff1d(main_df.columns.tolist(), sas_cols)
print(ls)

# Keep only SAS-layout columns, arranged in the SAS-layout order.
main_df = main_df.filter(items=sas_cols, axis='columns')
print(main_df.shape)

main_df.head()

[]
(4829, 95)


Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_8_DEN,OP_10,OP_10_DEN,OP_13,OP_13_DEN,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,HCP_COVID_19_DEN,HCP_COVID_19,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,9230,17443,206,96,104934,104477,9.686,25.198,5.847,0.920,11.422,66.609,1.136,0.437,0.684,Not Available,0.788,0.631,14.2,3.8,23.4,19,-15.4,23.6,14.8,18,8.9,8.8,12,2.7,2912,49,614,117,274,403,398,400,549,107,278,49,Not Available,Not Available,Not Available,Not Available,38,,6.1,,2.8,,200,359,3,51079,Not Available,Not Available,81,16,173.39,120,1.01,3905,95,,83.9,0,24,55,128,15,3,3,2,4,4,4,4.0,3.0,536,Not Applicable,4.1,132,10.5,126,14.1,254,10.2,214,4.7,214,1,688
1,10005,3345,8643,85,36,37532,35284,2.129,4.516,2.308,0.320,1.885,10.26,2.349,0.664,1.733,Not Available,0.531,0.487,12.8,3.4,22.1,16.6,Not Available,-6.2,15.3,23.3,14.9,9.9,13.6,2.3,1052,172,129,136,Not Available,285,81,289,121,126,27,155,Not Available,Not Available,59,17,47.7,,12.0,,4.2,,147,1001,3,54503,62,13,99,108,142.88,35,0.91,2700,80,,81.7,2,205,64,236,18,3,4,1,3,4,3,3.0,3.0,806,Not Applicable,Not Available,Not Available,Not Available,Not Available,14.6,850,11.1,108,5.7,108,0.9,362
2,10006,5594,9427,111,15,63230,54350,6.051,12.152,2.954,0.134,5.035,23.034,0.661,0.329,0.000,Not Available,1.589,0.043,13.4,5.3,-4.7,17.6,28.1,-0.4,17.2,19.5,12.5,9.9,16.5,4.6,2310,138,441,158,273,472,227,469,388,148,254,145,Not Available,Not Available,Not Available,Not Available,46.2,,10.1,,4.5,,159,361,1,41137,71,17,88,75,157.42,84,1.10,2536,67,,70.1,4,28,60,129,19,3,3,2,2,3,2,2.0,2.5,1495,Not Applicable,3.6,95,12.4,89,12.5,1505,Not Available,Not Available,Not Available,Not Available,1.1,468
3,10007,311,1302,4,Not Available,5601,5601,0.192,0.712,0.107,Not Available,0.14,2.589,Not Available,Not Available,Not Available,Not Available,Not Available,0.386,15.7,4.2,-1.9,20,Not Available,-9.4,Not Available,28.5,12.5,13.7,Not Available,Not Available,258,26,31,34,Not Available,72,Not Available,88,26,34,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,,3.4,,Not Available,,130,1287,3,11120,Not Available,Not Available,63,68,Not Available,Not Available,0.99,350,53,,62.7,Not Available,Not Available,96,49,25,3,4,4,3,3,3,3.0,3.0,194,Not Applicable,Not Available,Not Available,Not Available,Not Available,15.3,118,Not Available,Not Available,Not Available,Not Available,1,56
4,10008,31,350,Not Available,Not Available,2228,2228,0.018,0.19,Not Available,Not Available,0.052,0.409,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,14.8,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,69,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,,0.0,,Not Available,,121,346,0,6205,Not Available,Not Available,52,23,Not Available,Not Available,Not Available,126,45,,80.6,Not Available,Not Available,Not Available,Not Available,25,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,,,43,Not Applicable,Not Available,Not Available,Not Available,Not Available,14.3,62,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available


In [14]:
# Sorted snapshot of the provider IDs before they are converted to numbers
# (not referenced again in the cells visible here).
prvdrs_main1 = sorted(main_df['PROVIDER_ID'].tolist())

# Measures that get multiplied by 0.01 below.
# NOTE(review): presumably these are published as percentages while the SAS
# input stores proportions (e.g. 14.2 vs 0.142 for READM_30_HOSP_WIDE in the
# frames displayed above) - confirm the list against the SAS programs.
ls = ['READM_30_HIP_KNEE', 'READM_30_COPD', 'MORT_30_STK', 'MORT_30_PN',
      'MORT_30_HF', 'MORT_30_COPD', 'MORT_30_AMI', 'COMP_HIP_KNEE', 'OP_22',
      'OP_23', 'OP_29', 'IMM_3', 'PC_01', 'SEP_1', 'MORT_30_CABG',
      'READM_30_CABG', 'READM_30_HOSP_WIDE', 'OP_2', 'OP_8',
      'OP_10', 'OP_13', 'HCP_COVID_19']

# Every column except the ID becomes numeric; text that cannot be parsed
# (such as 'Not Available') turns into NaN.
value_cols = [col for col in list(main_df) if col != 'PROVIDER_ID']
for value_col in value_cols:
    main_df[value_col] = pd.to_numeric(main_df[value_col], errors='coerce')

# Caution: this rescales main_df in place, so running the cell a second time
# without rebuilding main_df scales the values again.
for scaled_col in ls:
    main_df[scaled_col] = 0.01 * main_df[scaled_col]


In [15]:
# VHA hospitals carry a trailing 'F' in their provider ID. The ID column is made
# numeric below, so the 'F' is swapped for the digit run '666666' first; any ID
# that still is not numeric after that is coerced to NaN by pd.to_numeric.
prvdrs = []
for provider_id in main_df['PROVIDER_ID'].tolist():
    # Only a *trailing* 'F' is a suffix. Testing with `'F' in ...` would also fire
    # for an 'F' elsewhere in the ID and then strip an unrelated last character.
    # Non-string IDs are passed through untouched instead of raising.
    if isinstance(provider_id, str) and provider_id.endswith('F'):
        provider_id = provider_id[:-1] + '666666'
    prvdrs.append(provider_id)

main_df['PROVIDER_ID'] = prvdrs

# Numeric conversion drops leading zeros (e.g. '010001' becomes 10001).
main_df['PROVIDER_ID'] = pd.to_numeric(main_df['PROVIDER_ID'], errors='coerce')
main_df.sort_values(by=['PROVIDER_ID'], ascending = True, inplace = True)
main_df.to_csv(stars_dir + "Reproduce_Stars_Input/2024/Input_File/all_data_for_2024_prognostications.csv", index=False)

print(list(main_df))
main_df.head()


['PROVIDER_ID', 'HAI_1_DEN_VOL', 'HAI_2_DEN_VOL', 'HAI_3_DEN_VOL', 'HAI_4_DEN_VOL', 'HAI_5_DEN_VOL', 'HAI_6_DEN_VOL', 'HAI_1_DEN_PRED', 'HAI_2_DEN_PRED', 'HAI_3_DEN_PRED', 'HAI_4_DEN_PRED', 'HAI_5_DEN_PRED', 'HAI_6_DEN_PRED', 'HAI_1', 'HAI_2', 'HAI_3', 'HAI_4', 'HAI_5', 'HAI_6', 'READM_30_HOSP_WIDE', 'READM_30_HIP_KNEE', 'EDAC_30_HF', 'READM_30_COPD', 'EDAC_30_AMI', 'EDAC_30_PN', 'MORT_30_STK', 'MORT_30_PN', 'MORT_30_HF', 'MORT_30_COPD', 'MORT_30_AMI', 'COMP_HIP_KNEE', 'READM_30_HOSP_WIDE_DEN', 'READM_30_HIP_KNEE_DEN', 'EDAC_30_HF_DEN', 'READM_30_COPD_DEN', 'EDAC_30_AMI_DEN', 'EDAC_30_PN_DEN', 'MORT_30_STK_DEN', 'MORT_30_PN_DEN', 'MORT_30_HF_DEN', 'MORT_30_COPD_DEN', 'MORT_30_AMI_DEN', 'COMP_HIP_KNEE_DEN', 'OP_2', 'OP_2_DEN', 'OP_3B', 'OP_3B_DEN', 'OP_8', 'OP_8_DEN', 'OP_10', 'OP_10_DEN', 'OP_13', 'OP_13_DEN', 'OP_18B', 'OP_18B_DEN', 'OP_22', 'OP_22_DEN', 'OP_23', 'OP_23_DEN', 'OP_29', 'OP_29_DEN', 'PSI_4_SURG_COMP', 'PSI_4_SURG_COMP_DEN', 'PSI_90_SAFETY', 'IMM_3_DEN', 'IMM_3', 'HCP_CO

Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_8_DEN,OP_10,OP_10_DEN,OP_13,OP_13_DEN,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,HCP_COVID_19_DEN,HCP_COVID_19,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,9230.0,17443.0,206.0,96.0,104934.0,104477.0,9.686,25.198,5.847,0.92,11.422,66.609,1.136,0.437,0.684,,0.788,0.631,0.142,0.038,23.4,0.19,-15.4,23.6,0.148,0.18,0.089,0.088,0.12,0.027,2912.0,49.0,614.0,117.0,274.0,403.0,398.0,400.0,549.0,107.0,278.0,49.0,,,,,0.38,,0.061,,0.028,,200.0,359.0,0.03,51079.0,,,0.81,16.0,173.39,120.0,1.01,3905.0,0.95,,0.839,0.0,24.0,0.55,128.0,15.0,3.0,3.0,2.0,4.0,4.0,4.0,4.0,3.0,536.0,,0.041,132.0,0.105,126.0,14.1,254.0,10.2,214.0,4.7,214.0,1.0,688.0
1,10005,3345.0,8643.0,85.0,36.0,37532.0,35284.0,2.129,4.516,2.308,0.32,1.885,10.26,2.349,0.664,1.733,,0.531,0.487,0.128,0.034,22.1,0.166,,-6.2,0.153,0.233,0.149,0.099,0.136,0.023,1052.0,172.0,129.0,136.0,,285.0,81.0,289.0,121.0,126.0,27.0,155.0,,,59.0,17.0,0.477,,0.12,,0.042,,147.0,1001.0,0.03,54503.0,0.62,13.0,0.99,108.0,142.88,35.0,0.91,2700.0,0.8,,0.817,0.02,205.0,0.64,236.0,18.0,3.0,4.0,1.0,3.0,4.0,3.0,3.0,3.0,806.0,,,,,,14.6,850.0,11.1,108.0,5.7,108.0,0.9,362.0
2,10006,5594.0,9427.0,111.0,15.0,63230.0,54350.0,6.051,12.152,2.954,0.134,5.035,23.034,0.661,0.329,0.0,,1.589,0.043,0.134,0.053,-4.7,0.176,28.1,-0.4,0.172,0.195,0.125,0.099,0.165,0.046,2310.0,138.0,441.0,158.0,273.0,472.0,227.0,469.0,388.0,148.0,254.0,145.0,,,,,0.462,,0.101,,0.045,,159.0,361.0,0.01,41137.0,0.71,17.0,0.88,75.0,157.42,84.0,1.1,2536.0,0.67,,0.701,0.04,28.0,0.6,129.0,19.0,3.0,3.0,2.0,2.0,3.0,2.0,2.0,2.5,1495.0,,0.036,95.0,0.124,89.0,12.5,1505.0,,,,,1.1,468.0
3,10007,311.0,1302.0,4.0,,5601.0,5601.0,0.192,0.712,0.107,,0.14,2.589,,,,,,0.386,0.157,0.042,-1.9,0.2,,-9.4,,0.285,0.125,0.137,,,258.0,26.0,31.0,34.0,,72.0,,88.0,26.0,34.0,,,,,,,,,0.034,,,,130.0,1287.0,0.03,11120.0,,,0.63,68.0,,,0.99,350.0,0.53,,0.627,,,0.96,49.0,25.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,194.0,,,,,,15.3,118.0,,,,,1.0,56.0
4,10008,31.0,350.0,,,2228.0,2228.0,0.018,0.19,,,0.052,0.409,,,,,,,0.148,,,,,,,,,,,,69.0,,,,,,,,,,,,,,,,,,0.0,,,,121.0,346.0,0.0,6205.0,,,0.52,23.0,,,,126.0,0.45,,0.806,,,,,25.0,,,,,,,,,43.0,,,,,,14.3,62.0,,,,,,


# 
----

## Compare 2023 publicly released results to 2024 predictions

In [20]:
# Publicly released 2023 Stars results (CSV export of the SAS output).
path = stars_dir + '2023/2023-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2023.csv'
df_2023 = pd.read_csv(path)

# Keep only the hospitals that actually have a star rating (drop NaN 'star' rows).
has_star_2023 = df_2023['star'].notna()
df_2023 = df_2023.loc[has_star_2023]
print(df_2023.shape)
df_2023.head()

(3076, 27)


Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Std_Outcomes_Readmission_score,Std_Outcomes_Safety_score,Std_PatientExp_score,Std_Process_score,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process,weight_PatientExperience,weight_Outcomes_Readmission,weight_Outcomes_Mortality,weight_Outcomes_Safety,weight_Process,summary_score,Outcomes_Mortality_cnt,Outcomes_safety_cnt,Outcomes_Readmission_cnt,Patient_Experience_cnt,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,report_indicator,cnt_grp,star
0,10001,-0.647058,0.284081,0.312074,-0.127836,-1.024044,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.162208,7,8,11,8,10,5,2,1,3) # of groups=5,3.0
1,10005,-1.564103,0.560369,-0.237844,-0.166838,-0.302742,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.346181,6,7,10,8,11,5,2,1,3) # of groups=5,2.0
2,10006,-1.694318,-0.554988,-0.089526,-1.241108,-0.17935,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.809109,7,7,9,8,11,5,2,1,3) # of groups=5,1.0
3,10007,-2.40715,-0.488553,0.022657,0.993806,-0.65976,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.492604,3,2,6,8,7,4,1,1,2) # of groups=4,2.0
5,10011,-0.517349,-0.624302,0.42877,0.134223,-2.385055,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.413511,7,7,9,8,8,5,2,1,3) # of groups=5,2.0


In [22]:
# 2024 Stars predictions (CSV export of the SAS output).
path = stars_dir + '2024/SAS_CSV_output/CMS_Stars_Nov_2024.csv'
df_2024 = pd.read_csv(path)

# Keep only the hospitals that actually have a star rating (drop NaN 'star' rows).
has_star_2024 = df_2024['star'].notna()
df_2024 = df_2024.loc[has_star_2024]
print(df_2024.shape)
df_2024.head()

(2860, 27)


Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Std_Outcomes_Readmission_score,Std_Outcomes_Safety_score,Std_PatientExp_score,Std_Process_score,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process,weight_PatientExperience,weight_Outcomes_Readmission,weight_Outcomes_Mortality,weight_Outcomes_Safety,weight_Process,summary_score,Outcomes_Mortality_cnt,Outcomes_safety_cnt,Outcomes_Readmission_cnt,Patient_Experience_cnt,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,report_indicator,cnt_grp,star
0,10001,-0.193681,0.4973,0.217552,0.134474,-0.116505,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,0.130261,7,7,11,8,10,5,2,1,3) # of groups=5,4.0
1,10005,-1.260674,1.093058,-0.056298,-0.295171,-0.465657,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.170078,6,7,9,8,12,5,2,1,3) # of groups=5,3.0
2,10006,-1.570297,-0.035525,-0.171409,-1.006743,-0.874025,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.717357,7,7,9,8,11,5,2,1,3) # of groups=5,1.0
3,10007,-3.504838,-0.499953,0.164934,-0.068313,-0.755976,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.950515,3,2,7,8,7,4,1,1,2) # of groups=4,1.0
5,10011,-0.203704,-0.576404,0.205723,0.012696,-2.186195,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.385915,7,7,9,8,8,5,2,1,3) # of groups=5,2.0


In [23]:
## Replace the imputed 666666 suffixes of VHA hospitals with their original 'F' suffix
prvdrs1 = []
for provider_id in df_2024['PROVIDER_ID'].tolist():
    provider_id = str(provider_id)
    # The imputed marker is a *trailing* '666666' appended to a non-empty prefix.
    # Requiring len > 6 keeps a plain six-character ID equal to '666666' intact
    # (a bare `'666666' in ...` test would have turned it into '00000F').
    if len(provider_id) > 6 and provider_id.endswith('666666'):
        provider_id = provider_id[:-6] + 'F'
    # Restore the leading zeros lost when the IDs were stored as numbers.
    prvdrs1.append(provider_id.zfill(6))

df_2024['PROVIDER_ID'] = prvdrs1

# NOTE(review): the set comparisons below only line up if df_2023['PROVIDER_ID']
# already holds zero-padded strings in the same format; the step that puts it
# in that format is not part of this cell -- confirm it runs before this one.
prvdrs2 = df_2023['PROVIDER_ID'].unique()

# Get providers in prvdrs1 that are NOT in prvdrs2
not_in_ls1 = np.setdiff1d(prvdrs1, prvdrs2)
print(len(not_in_ls1))

# Get providers in prvdrs2 that are NOT in prvdrs1
not_in_ls2 = np.setdiff1d(prvdrs2, prvdrs1)
print(len(not_in_ls2))

print(not_in_ls1, '\n')
print(not_in_ls2)

35
251
['030138' '030146' '040161' '050113' '05022F' '100359' '100360' '110252'
 '161347' '161365' '161375' '190099' '191312' '21007F' '261303' '281336'
 '301302' '360368' '371342' '380056' '381315' '381318' '390332' '390334'
 '390335' '45068F' '45069F' '50005F' '50011F' '500154' '521351' '650001'
 '670078' '670300' '670309'] 

['010034' '010038' '010044' '010062' '010099' '010101' '010112' '010126'
 '010150' '010158' '010169' '030073' '030137' '030139' '040001' '040015'
 '040019' '040067' '040076' '041307' '041312' '050028' '050279' '050528'
 '050684' '050744' '050771' '051304' '051324' '051325' '051328' '060076'
 '06007F' '060118' '060126' '061322' '061327' '070012' '100081' '100106'
 '100130' '100134' '100175' '110050' '110092' '110115' '110121' '110132'
 '110135' '110146' '11031F' '130066' '131320' '14004F' '140137' '140177'
 '140181' '140294' '141324' '141327' '141337' '141341' '141342' '141343'
 '141347' '141348' '150045' '150091' '150102' '150183' '151307' '151311'
 '151315' '15

In [25]:

# Restrict both frames to the hospitals they have in common, so the two years
# can be compared provider by provider.
prvdrs = df_2023['PROVIDER_ID'].unique()
rated_in_2023 = df_2024['PROVIDER_ID'].isin(prvdrs)
df_2024 = df_2024.loc[rated_in_2023]

prvdrs = df_2024['PROVIDER_ID'].unique()
rated_in_2024 = df_2023['PROVIDER_ID'].isin(prvdrs)
df_2023 = df_2023.loc[rated_in_2024]

# Put both frames in the same provider order.
df_2024 = df_2024.sort_values(by=['PROVIDER_ID'])
df_2023 = df_2023.sort_values(by=['PROVIDER_ID'])

# Sanity check: after filtering and sorting, the ID sequences should match exactly.
same_provider_order = df_2024['PROVIDER_ID'].tolist() == df_2023['PROVIDER_ID'].tolist()
if not same_provider_order:
    print('The ordered list of providers in pred and actual are NOT the same')
else:
    print('The ordered list of providers in pred and actual are the same')

print('df_2023.shape:', df_2023.shape)
print('df_2024.shape:', df_2024.shape)

The ordered list of providers in pred and actual are the same
df_2023.shape: (2825, 27)
df_2024.shape: (2825, 27)


## Get the expected changes in star ratings

In [28]:
stars = list(range(1, 6))

# For each 2023 star rating, report how the same hospitals are rated in df_2024.
for star in stars:
    print('Predicted changes for hospitals that were', star, 'star 2023:')

    # Hospitals holding this rating in 2023, looked up by ID in the 2024 frame.
    tdf_2023 = df_2023.loc[df_2023['star'] == star]
    prvdrs = tdf_2023['PROVIDER_ID'].tolist()
    tdf_2024 = df_2024.loc[df_2024['PROVIDER_ID'].isin(prvdrs)]

    print(len(tdf_2024), star, 'hospitals')
    stars_ls = tdf_2024['star'].tolist()

    # One line per possible 2024 rating, with the number of hospitals landing on it.
    for rating in (1, 2, 3, 4, 5):
        print(str(rating) + ' star:', stars_ls.count(rating))
    


Predicted changes for hospitals that were 1 star 2023:
228 1 hospitals
1 star: 153
2 star: 66
3 star: 8
4 star: 1
5 star: 0
Predicted changes for hospitals that were 2 star 2023:
619 2 hospitals
1 star: 102
2 star: 349
3 star: 153
4 star: 15
5 star: 0
Predicted changes for hospitals that were 3 star 2023:
789 3 hospitals
1 star: 16
2 star: 173
3 star: 455
4 star: 140
5 star: 5
Predicted changes for hospitals that were 4 star 2023:
744 4 hospitals
1 star: 2
2 star: 32
3 star: 260
4 star: 397
5 star: 53
Predicted changes for hospitals that were 5 star 2023:
445 5 hospitals
1 star: 0
2 star: 1
3 star: 31
4 star: 189
5 star: 224
