In [1]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
from functools import reduce

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

stars_dir = '~/GitHub/stars-data-builder/'
hos_dir = '~/Desktop/Rush/CMS_HospitalArchives/'

dates_df = pd.DataFrame(columns = ['Measure ID', 'Start Date', 'End Date'])

# 
----

# Compare 2024 publicly released results to 2025 predictions

In [2]:
def compare_2024_2025(df_2022, df_2023, df_2024, df_2025, include_2023, include_2022):
    
    df_2022 = df_2022[df_2022['star'].isin(['1','2','3','4','5'])]
    df_2023 = df_2023[df_2023['star'].isin(['1','2','3','4','5'])]
    df_2024 = df_2024[~df_2024['star'].isin([np.nan, float("NaN")])]
    df_2025 = df_2025[~df_2025['star'].isin([np.nan, float("NaN")])]
    
    ## Replace the imputed 666666 suffixes of VHA hospitals with their original 'F' suffix
    prvdrs_2025 = []
    for p in df_2025['PROVIDER_ID'].tolist():
        p = str(p)
        if '666666' in p:
            p = p[:-6]
            p = p + 'F'
        while len(p) < 6:
            p = '0' + p
            
        prvdrs_2025.append(p)
    
    df_2025['PROVIDER_ID'] = prvdrs_2025

    prvdrs_2024 = df_2024['PROVIDER_ID'].unique().tolist()
    print('Hospitals with overall star ratings in 2024:', df_2024.shape[0])

    prvdrs_2025 = df_2025['PROVIDER_ID'].unique().tolist()
    print('Hospitals with overall star ratings in the 2025 predictions:', df_2025.shape[0], '\n')

    ls = np.setdiff1d(list(prvdrs_2024), list(prvdrs_2025)).tolist()
    print(len(ls), "hospitals in 2024 that had star ratings but don't in the 2025 file")

    ls = np.setdiff1d(list(prvdrs_2025), list(prvdrs_2024)).tolist()
    print(len(ls), "hospitals in the 2025 file that have star ratings but didn't in 2024\n")

    ls = list(set(prvdrs_2024) & set(prvdrs_2025))
    
    
    if include_2023 == 1 and include_2022 == 1:
        print('Now including 2022 and 2023 stars')
        prvdrs_2023 = df_2023['PROVIDER_ID'].unique().tolist()
        prvdrs_2022 = df_2022['PROVIDER_ID'].unique().tolist()
        ls = list(set(prvdrs_2024) & set(prvdrs_2025) & set(prvdrs_2023) & set(prvdrs_2022))
        
    elif include_2023 == 1:
        print('Now including 2023 stars')
        prvdrs_2023 = df_2023['PROVIDER_ID'].unique().tolist()
        ls = list(set(prvdrs_2024) & set(prvdrs_2025) & set(prvdrs_2023))
        
    elif include_2022 == 1:
        print('Now including 2022 stars')
        prvdrs_2022 = df_2022['PROVIDER_ID'].unique().tolist()
        ls = list(set(prvdrs_2024) & set(prvdrs_2025) & set(prvdrs_2022))
    
    
    print(len(ls), 'HOSPITALS\n')
    df_2024 = df_2024[df_2024['PROVIDER_ID'].isin(ls)]
    df_2024.sort_values(by='PROVIDER_ID', ascending=False, inplace=True)
    df_2025 = df_2025[df_2025['PROVIDER_ID'].isin(ls)]
    df_2025.sort_values(by='PROVIDER_ID', ascending=False, inplace=True)

    prvdrs = df_2024['PROVIDER_ID'].unique()
    df_2025 = df_2025[df_2025['PROVIDER_ID'].isin(prvdrs)]

    prvdrs = df_2025['PROVIDER_ID'].unique()
    df_2024 = df_2024[df_2024['PROVIDER_ID'].isin(prvdrs)]

    df_2024.sort_values(by=['PROVIDER_ID'], inplace=True)
    df_2025.sort_values(by=['PROVIDER_ID'], inplace=True)

    if df_2024['PROVIDER_ID'].tolist() == df_2025['PROVIDER_ID'].tolist():
        print('After filtering, the ordered list of providers in pred and actual are the same')
    else:
        print('After filtering, the ordered list of providers in pred and actual are NOT the same')

    print('df_2025.shape:', df_2025.shape)
    print('df_2024.shape:', df_2024.shape, '\n')

    stars = [1,2,3,4,5]

    for star in stars:
        
        if include_2022 == 1 and include_2023 == 1:
            print('Predicted changes for hospitals that were', star, 'star in 2022, 2023, & 2024:')
            
            tdf_2022 = df_2022[df_2022['star'] == str(star)]
            prvdrs_2022 = tdf_2022['PROVIDER_ID'].tolist()
            
            tdf_2023 = df_2023[df_2023['star'] == str(star)]
            prvdrs_2023 = tdf_2023['PROVIDER_ID'].tolist()
            
            tdf_2024 = df_2024[df_2024['star'] == star]
            prvdrs_2024 = tdf_2024['PROVIDER_ID'].tolist()
            
            ls = list(set(prvdrs_2022) & set(prvdrs_2023) & set(prvdrs_2024))
            
            tdf_2024 = df_2024[df_2024['PROVIDER_ID'].isin(ls)]
            tdf_2025 = df_2025[df_2025['PROVIDER_ID'].isin(ls)]

            print(tdf_2025.shape[0], star, 'hospitals')
            stars_ls = tdf_2025['star'].tolist()
            
            
        elif include_2022 == 1:
            print('Predicted changes for hospitals that were', star, 'star in 2022 & 2024:')
            
            tdf_2022 = df_2022[df_2022['star'] == str(star)]
            prvdrs_2022 = tdf_2022['PROVIDER_ID'].tolist()
            
            tdf_2024 = df_2024[df_2024['star'] == star]
            prvdrs_2024 = tdf_2024['PROVIDER_ID'].tolist()
            
            ls = list(set(prvdrs_2024) & set(prvdrs_2022))
            
            tdf_2024 = df_2024[df_2024['PROVIDER_ID'].isin(ls)]
            tdf_2025 = df_2025[df_2025['PROVIDER_ID'].isin(ls)]

            print(tdf_2025.shape[0], star, 'hospitals')
            stars_ls = tdf_2025['star'].tolist()
            
        elif include_2023 == 1:
            print('Predicted changes for hospitals that were', star, 'star in 2023 & 2024:')
            
            tdf_2023 = df_2023[df_2023['star'] == str(star)]
            prvdrs_2023 = tdf_2023['PROVIDER_ID'].tolist()
            
            tdf_2024 = df_2024[df_2024['star'] == star]
            prvdrs_2024 = tdf_2024['PROVIDER_ID'].tolist()
            
            ls = list(set(prvdrs_2024) & set(prvdrs_2023))
            
            tdf_2024 = df_2024[df_2024['PROVIDER_ID'].isin(ls)]
            tdf_2025 = df_2025[df_2025['PROVIDER_ID'].isin(ls)]

            print(tdf_2025.shape[0], star, 'hospitals')
            stars_ls = tdf_2025['star'].tolist()
            
        else:
            print('Predicted changes for hospitals that were', star, 'star in 2024:')

            tdf_2024 = df_2024[df_2024['star'] == star]
            prvdrs = tdf_2024['PROVIDER_ID'].tolist()
            tdf_2025 = df_2025[df_2025['PROVIDER_ID'].isin(prvdrs)]

            print(tdf_2025.shape[0], star, 'hospitals')
            stars_ls = tdf_2025['star'].tolist()

        try:
            print('1 star:', stars_ls.count(1), stars_ls.count(1)/tdf_2025.shape[0], '%')
        except:
            pass
        try:
            print('2 star:', stars_ls.count(2), stars_ls.count(2)/tdf_2025.shape[0], '%')
        except:
            pass
        try:
            print('3 star:', stars_ls.count(3), stars_ls.count(3)/tdf_2025.shape[0], '%')
        except:
            pass
        try:
            print('4 star:', stars_ls.count(4), stars_ls.count(4)/tdf_2025.shape[0], '%')
        except:
            pass
        try:
            print('5 star:', stars_ls.count(5), stars_ls.count(5)/tdf_2025.shape[0], '%', '\n')
        except:
            pass
        
    stars_2024 = df_2024['star'].tolist()
    pred_stars = df_2025['star'].tolist()
    
    T_inc = 0
    T_dec = 0
    
    for i, star_2024 in enumerate(stars_2024):
        star_2025 = pred_stars[i]
        
        if star_2024 < star_2025:
            T_inc += 1
        elif star_2024 > star_2025:
            T_dec += 1
    
    print("Of the", len(stars_2024), "hospitals that received star ratings in 2024 and that are expected to receive star ratings in 2025:")
    p_inc = f'{round(100*T_inc/len(stars_2024), 4):.2f}%'
    print(p_inc + '(n = ', T_inc, ') are expected to improve by at least one star')
    p_dec = f'{round(100*T_dec/len(stars_2024), 4):.2f}%'
    print(p_dec + '(n = ', T_dec, ') are expected to worsen by at least one star')
     


## Predictions based on April 2024 data

In [3]:
path = stars_dir + '2024/2024-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2024.csv'
df_2024 = pd.read_csv(path)

path = stars_dir + 'Reproduce_Stars_Input/2025/SAS_output/CMS_Stars_2025_predictions_from_April_2024_data.csv'
df_2025 = pd.read_csv(path)

## Get 2023 star ratings
n_dir = stars_dir + 'CareCompare'
ls = ['PROVIDER_ID', 'star']

df_2023 = pd.read_csv(n_dir + '/hospitals_01_2024/Hospital_General_Information.csv')
df_2023['Facility ID'] = df_2023['Facility ID'].astype(str)

cols1 = ['Facility ID', 'Facility Name', 'Hospital overall rating']
cols2 = ['PROVIDER_ID', 'Hospital Name', 'star']

for i, col in enumerate(cols1):
    if col in list(df_2023):
        print(col)
        df_2023.rename(columns={col: cols2[i]}, inplace=True)

df_2023 = df_2023.filter(items=ls, axis=1)

## Get 2022 star ratings
n_dir = stars_dir + 'CareCompare'
ls = ['PROVIDER_ID', 'star']

df_2022 = pd.read_csv(n_dir + '/hospitals_04_2023/Hospital_General_Information.csv')
df_2022['Facility ID'] = df_2022['Facility ID'].astype(str)

cols1 = ['Facility ID', 'Facility Name', 'Hospital overall rating']
cols2 = ['PROVIDER_ID', 'Hospital Name', 'star']

for i, col in enumerate(cols1):
    if col in list(df_2022):
        print(col)
        df_2022.rename(columns={col: cols2[i]}, inplace=True)

df_2022 = df_2022.filter(items=ls, axis=1)


compare_2024_2025(df_2022, df_2023, df_2024, df_2025, 0, 0)

tdf = df_2025[df_2025['PROVIDER_ID'].isin([140119, 140029, 140063])]
tdf = tdf.filter(items=['summary_score', 'cnt_grp', 'star'])
tdf.head()

Facility ID
Facility Name
Hospital overall rating
Facility ID
Facility Name
Hospital overall rating
Hospitals with overall star ratings in 2024: 2847
Hospitals with overall star ratings in the 2025 predictions: 2857 

7 hospitals in 2024 that had star ratings but don't in the 2025 file
17 hospitals in the 2025 file that have star ratings but didn't in 2024

2840 HOSPITALS

After filtering, the ordered list of providers in pred and actual are the same
df_2025.shape: (2840, 27)
df_2024.shape: (2840, 27) 

Predicted changes for hospitals that were 1 star in 2024:
277 1 hospitals
1 star: 221 0.7978339350180506 %
2 star: 53 0.19133574007220217 %
3 star: 3 0.010830324909747292 %
4 star: 0 0.0 %
5 star: 0 0.0 % 

Predicted changes for hospitals that were 2 star in 2024:
594 2 hospitals
1 star: 19 0.03198653198653199 %
2 star: 487 0.8198653198653199 %
3 star: 87 0.14646464646464646 %
4 star: 1 0.0016835016835016834 %
5 star: 0 0.0 % 

Predicted changes for hospitals that were 3 star in 2024:
8

Unnamed: 0,summary_score,cnt_grp,star
1068,0.192007,3) # of groups=5,4.0
1081,0.09145,3) # of groups=5,4.0
1105,0.775275,3) # of groups=5,5.0


## Predictions based on July 2024 data

In [4]:
path = stars_dir + '2024/2024-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2024.csv'
df_2024 = pd.read_csv(path)

path = stars_dir + 'Reproduce_Stars_Input/2025/SAS_output/CMS_Stars_2025_predictions_from_July_2024_data.csv'
df_2025 = pd.read_csv(path)

## Get 2023 star ratings
n_dir = stars_dir + 'CareCompare'
ls = ['PROVIDER_ID', 'star']

df_2023 = pd.read_csv(n_dir + '/hospitals_01_2024/Hospital_General_Information.csv')
df_2023['Facility ID'] = df_2023['Facility ID'].astype(str)

cols1 = ['Facility ID', 'Facility Name', 'Hospital overall rating']
cols2 = ['PROVIDER_ID', 'Hospital Name', 'star']

for i, col in enumerate(cols1):
    if col in list(df_2023):
        print(col)
        df_2023.rename(columns={col: cols2[i]}, inplace=True)

df_2023 = df_2023.filter(items=ls, axis=1)


## Get 2022 star ratings
n_dir = stars_dir + 'CareCompare'
ls = ['PROVIDER_ID', 'star']

df_2022 = pd.read_csv(n_dir + '/hospitals_04_2023/Hospital_General_Information.csv')
df_2022['Facility ID'] = df_2022['Facility ID'].astype(str)

cols1 = ['Facility ID', 'Facility Name', 'Hospital overall rating']
cols2 = ['PROVIDER_ID', 'Hospital Name', 'star']

for i, col in enumerate(cols1):
    if col in list(df_2022):
        print(col)
        df_2022.rename(columns={col: cols2[i]}, inplace=True)

df_2022 = df_2022.filter(items=ls, axis=1)

compare_2024_2025(df_2022, df_2023, df_2024, df_2025, 0, 0)

tdf = df_2025[df_2025['PROVIDER_ID'].isin([140119, 140029, 140063])]
tdf = tdf.filter(items=['summary_score', 'cnt_grp', 'star'])
tdf.head()

Facility ID
Facility Name
Hospital overall rating
Facility ID
Facility Name
Hospital overall rating
Hospitals with overall star ratings in 2024: 2847
Hospitals with overall star ratings in the 2025 predictions: 2905 

61 hospitals in 2024 that had star ratings but don't in the 2025 file
119 hospitals in the 2025 file that have star ratings but didn't in 2024

2786 HOSPITALS

After filtering, the ordered list of providers in pred and actual are the same
df_2025.shape: (2786, 27)
df_2024.shape: (2786, 27) 

Predicted changes for hospitals that were 1 star in 2024:
276 1 hospitals
1 star: 180 0.6521739130434783 %
2 star: 85 0.3079710144927536 %
3 star: 9 0.03260869565217391 %
4 star: 1 0.0036231884057971015 %
5 star: 1 0.0036231884057971015 % 

Predicted changes for hospitals that were 2 star in 2024:
579 2 hospitals
1 star: 75 0.12953367875647667 %
2 star: 372 0.6424870466321243 %
3 star: 119 0.20552677029360966 %
4 star: 13 0.022452504317789293 %
5 star: 0 0.0 % 

Predicted changes for 

Unnamed: 0,summary_score,cnt_grp,star
1065,0.046146,3) # of groups=5,3.0
1078,0.183752,3) # of groups=5,4.0
1102,0.761263,3) # of groups=5,5.0
