In [4]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
from functools import reduce

# Suppress every warning category so cell output is not cluttered.
warnings.filterwarnings('ignore')

# Lift pandas' display truncation for both columns and rows.
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, None)

# Base directories that file paths are built from (string concatenation,
# so the trailing slash matters).
hos_dir = '~/Desktop/Rush/CMS_HospitalArchives/'
stars_dir = '~/GitHub/stars-data-builder/'


# 
----

# Find areas of accuracy

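The overall accuracy of estimates made in July 2023 was 71%. Perhaps the accuracy was concentrated among particular hospitals. Here we look at hospitals whose estimated 2024 star rating differs from their 2023 star rating by more than one star, and ask how often those estimates matched the actual 2024 rating.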

In [48]:
def get_results(gen_info, stars_df_2024, exp_df):
    """Print the accuracy of estimates that predicted a change of more than one star.

    Only hospitals whose ``PROVIDER_ID`` appears in all three frames are
    considered. Among those, the function counts

    * ``T1`` - hospitals whose estimated 2024 star differs from their 2023
      star by more than 1, and
    * ``T2`` - the subset of ``T1`` whose estimated 2024 star equals the
      actual 2024 star,

    and prints ``T1 T2 100*T2/T1``. When ``T1`` is 0 the percentage is
    printed as ``nan`` instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    gen_info : pandas.DataFrame
        Must contain ``'PROVIDER_ID'`` and ``'2023 overall star rating'``.
    stars_df_2024 : pandas.DataFrame
        Actual outcomes; must contain ``'PROVIDER_ID'`` and ``'star'``.
    exp_df : pandas.DataFrame
        Estimates; must contain ``'PROVIDER_ID'`` and ``'star'``.

    Returns
    -------
    None
        Results are reported via ``print`` only. The input frames are not
        modified.
    """
    hosps_2023 = gen_info['PROVIDER_ID'].unique().tolist()
    hosps_2024 = stars_df_2024['PROVIDER_ID'].unique().tolist()
    exp_hosps = exp_df['PROVIDER_ID'].unique().tolist()

    hosps = list(set(hosps_2023) & set(hosps_2024) & set(exp_hosps))

    def _restrict_and_sort(df):
        # Build a new, sorted frame rather than sorting a slice in place.
        return df[df['PROVIDER_ID'].isin(hosps)].sort_values(by='PROVIDER_ID', ascending=True)

    gen_info = _restrict_and_sort(gen_info)
    stars_df_2024 = _restrict_and_sort(stars_df_2024)
    exp_df = _restrict_and_sort(exp_df)

    # Row-wise comparison below is only valid if all three frames list the
    # same hospitals in the same order (this fails e.g. with duplicate IDs).
    ids_2023 = gen_info['PROVIDER_ID'].tolist()
    ids_2024 = stars_df_2024['PROVIDER_ID'].tolist()
    ids_exp = exp_df['PROVIDER_ID'].tolist()
    if not (ids_2023 == ids_2024 == ids_exp):
        print('PROVIDER_ID lists do not align after filtering (duplicate IDs?); no results computed')
        return

    stars_2023 = gen_info['2023 overall star rating'].tolist()
    stars_2024 = stars_df_2024['star'].tolist()
    stars_exp_2024 = exp_df['star'].tolist()

    T1 = 0
    T2 = 0

    for star_2023, star_2024, exp_star in zip(stars_2023, stars_2024, stars_exp_2024):
        # A missing (NaN) rating makes this comparison False, so such
        # hospitals are skipped.
        if np.abs(star_2023 - exp_star) > 1:
            T1 += 1
            if exp_star == star_2024:
                T2 += 1

    # Guard against an empty selection instead of dividing by zero.
    pct = 100*T2/T1 if T1 else float('nan')
    print(T1, T2, pct)

    return

#      
------
# Load and merge estimates of star ratings for 2024 with the actual outcomes

In [49]:
# Labels of the data releases from which 2024 star ratings were estimated.
dates = ['July_2023', 'Oct_2023', 'Nov_2023', 'Jan_2024']

# ---------------------------------------------------------------------------
# Inputs that do not depend on the estimate date: read and prepare them once
# instead of once per loop iteration.
# ---------------------------------------------------------------------------

# Actual 2024 star ratings that every set of estimates is compared against.
stars_df_2024 = pd.read_csv(stars_dir + '2024/2024-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2024.csv')

n_dir = stars_dir + 'CareCompare'
ls = ['PROVIDER_ID', 'Hospital Name', 'State', '2023 overall star rating', '2022 overall star rating',
      'Hospital Type', 'Hospital Ownership', 'Emergency Services',
      'month_year',
     ]

cols1 = ['Facility ID', 'Facility Name', 'Hospital overall rating']
cols2 = ['PROVIDER_ID', 'Hospital Name', '2022 overall star rating']
cols3 = ['PROVIDER_ID', 'Hospital Name', '2023 overall star rating']


def _load_general_info(csv_path, new_names):
    """Read a Hospital_General_Information file and keep the columns in `ls`.

    The facility ID is cast to str, the overall rating is coerced to numeric
    (unparseable values become NaN), and the columns named in `cols1` are
    renamed to `new_names`. Columns of `ls` absent from the file are skipped.
    """
    info = pd.read_csv(csv_path)
    info['Facility ID'] = info['Facility ID'].astype(str)
    info['Hospital overall rating'] = pd.to_numeric(info['Hospital overall rating'], errors='coerce')
    info = info.rename(columns=dict(zip(cols1, new_names)))
    return info.filter(items=ls, axis=1)


def _restore_provider_id(raw_id):
    """Turn an imputed '666666' suffix back into 'F' and left-pad with zeros to 6 characters."""
    p = str(raw_id)
    # Only a trailing '666666' is the imputed suffix; the last six characters
    # are what gets replaced, so test the end of the string rather than any
    # position inside it.
    if p.endswith('666666'):
        p = p[:-6] + 'F'
    return p.rjust(6, '0')


# hospitals in 2023 stars
info_2023 = _load_general_info(n_dir + '/hospitals_01_2024/Hospital_General_Information.csv', cols3)
# hospitals in 2022 stars
info_2022 = _load_general_info(n_dir + '/hospitals_04_2023/Hospital_General_Information.csv', cols2)

stars_2022 = info_2022['2022 overall star rating'].tolist()
hosps_2022 = info_2022['PROVIDER_ID'].tolist()
stars_2023 = info_2023['2023 overall star rating'].tolist()
hosps_2023 = info_2023['PROVIDER_ID'].tolist()

# Constant-time lookup of each hospital's 2022 star. setdefault keeps the
# first occurrence of an ID, which is the row list.index() would return.
star_2022_by_hosp = {}
for hosp, star_2022 in zip(hosps_2022, stars_2022):
    star_2022_by_hosp.setdefault(hosp, star_2022)

# 1 if the hospital kept its 2022 star in 2023, else 0 (also 0 when the
# hospital has no 2022 row or either rating is NaN, since NaN != NaN).
retained = [
    int(hosp in star_2022_by_hosp and star_2022_by_hosp[hosp] == star_2023)
    for hosp, star_2023 in zip(hosps_2023, stars_2023)
]

# ---------------------------------------------------------------------------
# Score each set of estimates against the actual 2024 outcomes.
# ---------------------------------------------------------------------------
for lab in dates:

    print(lab)
    exp_df = pd.read_csv(stars_dir + 'Reproduce_Stars_Input/2024/SAS_output/CMS_Stars_2024_predictions_from_' + lab + '_data.csv')

    ## Replace the imputed 666666 suffixes of VHA hospitals with their original 'F' suffix
    prvdrs1 = [_restore_provider_id(p) for p in exp_df['PROVIDER_ID'].tolist()]
    exp_df['PROVIDER_ID'] = prvdrs1

    print(stars_df_2024.shape)
    print(exp_df.shape)

    # Fresh per-date copies so the shared base frames are never mutated.
    tdf1 = info_2023.copy(deep=True)
    tdf1['month_year'] = lab

    tdf2 = info_2022.copy(deep=True)
    tdf2['month_year'] = lab

    print(list(tdf2))

    tdf1['Retained 2022 Star'] = retained
    gen_info = tdf1.copy(deep=True)

    print('filtering out hospitals not included in 2024 stars estimates and outcomes:')
    hosps = stars_df_2024['PROVIDER_ID'].unique().tolist()
    gen_info = gen_info[gen_info['PROVIDER_ID'].isin(hosps)]
    print(gen_info.shape)

    # get_results is defined in an earlier cell and reports via print.
    get_results(gen_info, stars_df_2024, exp_df)
    print('\n\n')


July_2023
(4626, 27)
(4677, 27)
['PROVIDER_ID', 'Hospital Name', 'State', '2022 overall star rating', 'Hospital Type', 'Hospital Ownership', 'Emergency Services', 'month_year']
filtering out hospitals not included in 2024 stars estimates and outcomes:
(4622, 9)
1174 732 62.35093696763203



Oct_2023
(4626, 27)
(4677, 27)
['PROVIDER_ID', 'Hospital Name', 'State', '2022 overall star rating', 'Hospital Type', 'Hospital Ownership', 'Emergency Services', 'month_year']
filtering out hospitals not included in 2024 stars estimates and outcomes:
(4622, 9)
1175 732 62.297872340425535



Nov_2023
(4626, 27)
(4673, 27)
['PROVIDER_ID', 'Hospital Name', 'State', '2022 overall star rating', 'Hospital Type', 'Hospital Ownership', 'Emergency Services', 'month_year']
filtering out hospitals not included in 2024 stars estimates and outcomes:
(4622, 9)
1277 851 66.64056382145654



Jan_2024
(4626, 27)
(4654, 27)
['PROVIDER_ID', 'Hospital Name', 'State', '2022 overall star rating', 'Hospital Type', 'Hospit