In [3]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
from functools import reduce

# Notebook-wide settings: silence every warning and lift pandas' display
# truncation so that printed frames show all of their rows and columns.
warnings.filterwarnings('ignore')
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Root folders used to build input paths.
# stars_dir is the prefix for the star-rating and CareCompare CSV paths.
# NOTE(review): hos_dir is not referenced in the visible cells — confirm it is still needed.
stars_dir, hos_dir = '~/GitHub/stars-data-builder/', '~/Desktop/Rush/CMS_HospitalArchives/'


# 
----

# Find areas of accuracy

The overall accuracy of the estimates made in July 2023 was 71%. Perhaps that accuracy was concentrated among particular groups of hospitals.

In [51]:
def _confusion_rates(TP, TN, FP, FN):
    """Return (PPV, NPV, TPR, ACC) for one set of confusion-matrix counts.

    A rate whose denominator is zero is returned as NaN, so an empty group is
    reported as 'nan%' instead of raising ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        return numerator / denominator if denominator else float('nan')

    PPV = ratio(TP, TP + FP)                    # precision
    NPV = ratio(TN, TN + FN)
    TPR = ratio(TP, TP + FN)                    # sensitivity
    ACC = ratio(TP + TN, TP + TN + FP + FN)
    return PPV, NPV, TPR, ACC


def _first_star_by_provider(df):
    """Map each PROVIDER_ID in `df` to the 'star' value of its first row."""
    first_rows = df.drop_duplicates(subset='PROVIDER_ID', keep='first')
    return dict(zip(first_rows['PROVIDER_ID'].tolist(), first_rows['star'].tolist()))


def get_results(gen_info, stars_df_2024, exp_df):
    """Print how well expected 2024 star ratings matched the actual ones.

    Only hospitals with 'Retained 2022 Star' == 1 are scored. For each star
    level 1-5, the hospitals whose '2023 overall star rating' equals that level
    (as a string) are classified as follows, where "positive" means "rated at
    that same star level":

        TP: expected to keep the star and did
        TN: expected to change and did
        FP: expected to keep the star but changed
        FN: expected to change but kept the star

    PPV, NPV, TPR and accuracy are printed per star level and then for the
    counts summed over all levels. The overall NPV is computed from the summed
    counts. Rates with a zero denominator are printed as 'nan%'.

    Parameters
    ----------
    gen_info : pandas.DataFrame
        Needs columns 'PROVIDER_ID', '2023 overall star rating' (compared
        against the strings '1'..'5') and 'Retained 2022 Star'.
    stars_df_2024 : pandas.DataFrame
        Actual ratings; needs columns 'PROVIDER_ID' and 'star'.
    exp_df : pandas.DataFrame
        Expected ratings; needs columns 'PROVIDER_ID' and 'star'.

    Returns
    -------
    None
        Results are only printed.

    Raises
    ------
    IndexError
        If a scored hospital has no row in `stars_df_2024` or `exp_df`.
    """
    print('PPV = Positive Predictive Value = Precision')
    print('TPR = True Positive Rate\n')

    # Build the lookups once instead of scanning both frames for every hospital.
    # When a provider has several rows, the first one wins.
    actual_by_id = _first_star_by_provider(stars_df_2024)
    expected_by_id = _first_star_by_provider(exp_df)

    retained_info = gen_info[gen_info['Retained 2022 Star'] == 1]

    T_TP = T_TN = T_FP = T_FN = 0
    for star in [1, 2, 3, 4, 5]:

        tdf = retained_info[retained_info['2023 overall star rating'] == str(star)]

        TP = TN = FP = FN = 0
        for h_2023 in tdf['PROVIDER_ID'].unique().tolist():

            if h_2023 not in actual_by_id:
                raise IndexError('PROVIDER_ID ' + repr(h_2023) + ' has no row in stars_df_2024')
            if h_2023 not in expected_by_id:
                raise IndexError('PROVIDER_ID ' + repr(h_2023) + ' has no row in exp_df')

            expected_same = expected_by_id[h_2023] == star
            actual_same = actual_by_id[h_2023] == star

            if expected_same and actual_same:
                TP += 1
            elif not expected_same and not actual_same:
                TN += 1
            elif expected_same:
                FP += 1
            else:
                FN += 1

        T_TP += TP
        T_TN += TN
        T_FP += FP
        T_FN += FN

        PPV, NPV, TPR, ACC = _confusion_rates(TP, TN, FP, FN)

        T = TP + TN + FP + FN
        print('For', T, 'hospitals that were', star, 'star in 2023:')
        print('    ' + f'{100*PPV:.2f}% PPV  ', TP+FP, 'hospitals were expected to stay a '+str(star)+'-star.', TP, 'actually did.')
        print('    ' + f'{100*NPV:.2f}% NPV  ', TN+FN, 'hospitals were expected to change their rating.', TN, 'actually did.')
        print('    ' + f'{100*TPR:.2f}% TPR  ', TP+FN, 'hospitals stayed a '+str(star)+'-star.', TP, 'of them were expected to.')
        print('    ' + f'{100*ACC:.2f}% Accuracy,  ', TP + TN, 'of', T, 'hospitals kept or changed their star rating when expected to.')

        print('\n')

    # Overall summary: every rate, including NPV, comes from the summed counts.
    TP, TN, FP, FN = int(T_TP), int(T_TN), int(T_FP), int(T_FN)
    PPV, NPV, TPR, ACC = _confusion_rates(TP, TN, FP, FN)

    print('FOR ALL', TP + TN + FP + FN, 'HOSPITALS IN 2023 THAT WERE ALSO IN 2024:')
    print('    ' + f'{100*PPV:.2f}% PPV,  ', TP+FP, 'hospitals were expected to keep their star rating.', TP, 'actually did.')
    print('    ' + f'{100*NPV:.2f}% NPV,  ', TN+FN, 'hospitals were expected to change their rating.', TN, 'actually did.')
    print('    ' + f'{100*TPR:.2f}% TPR,  ', TP+FN, 'hospitals kept their star rating.', TP, 'of them were expected to.')
    print('    ' + f'{100*ACC:.2f}% Accuracy,  ', TP + TN, 'of', (TP + TN + FP + FN), 'hospitals kept or changed their star rating when expected to.')
    return

#      
------
# Load and merge estimates of star ratings for 2024 with the actual outcomes

In [52]:
# Labels of the data snapshots from which 2024 star predictions were produced;
# each label selects one predictions CSV below.
dates = ['July_2023', 'Oct_2023', 'Nov_2023', 'Jan_2024']

# 'all' keeps every hospital; any other value is matched against the 'cnt_grp'
# column of the actual 2024 ratings.
groups = ['all']#, '3) # of groups=5', '2) # of groups=4', '1) # of groups=3']

n_dir = stars_dir + 'CareCompare'

# Columns kept from the Hospital General Information files (after renaming).
ls = ['PROVIDER_ID', 'Hospital Name', 'State', '2023 overall star rating', '2022 overall star rating',
      'Hospital Type', 'Hospital Ownership', 'Emergency Services',
      'month_year',
     ]

# cols1 are the names found in the CSVs; cols2 / cols3 are their replacements
# for the file treated as the 2022 ratings and the 2023 ratings respectively.
cols1 = ['Facility ID', 'Facility Name', 'Hospital overall rating']
cols2 = ['PROVIDER_ID', 'Hospital Name', '2022 overall star rating']
cols3 = ['PROVIDER_ID', 'Hospital Name', '2023 overall star rating']


def _restore_provider_id(raw_id):
    """Return `raw_id` as a string, left-padded with zeros to six characters.

    A trailing '666666' (the imputed suffix of VHA hospitals) is replaced by
    the original 'F' suffix before padding. Only a true suffix is replaced, so
    an ID that merely contains '666666' elsewhere is left intact.
    """
    p = str(raw_id)
    if p.endswith('666666'):
        p = p[:-6] + 'F'
    return p.rjust(6, '0')


def _load_general_info(csv_path, new_names):
    """Read a Hospital_General_Information CSV, rename cols1 -> new_names, keep the `ls` columns."""
    info = pd.read_csv(csv_path)
    info['Facility ID'] = info['Facility ID'].astype(str)
    # rename() ignores names that are absent, like the original per-column check did.
    return info.rename(columns=dict(zip(cols1, new_names))).filter(items=ls, axis=1)


# ---- Inputs that do not depend on the prediction date: read them once ----
stars_2024_all = pd.read_csv(stars_dir + '2024/2024-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2024.csv')
info_2023 = _load_general_info(n_dir + '/hospitals_01_2024/Hospital_General_Information.csv', cols3)
info_2022 = _load_general_info(n_dir + '/hospitals_04_2023/Hospital_General_Information.csv', cols2)

stars_2022 = info_2022['2022 overall star rating'].tolist()
hosps_2022 = info_2022['PROVIDER_ID'].tolist()
stars_2023 = info_2023['2023 overall star rating'].tolist()
hosps_2023 = info_2023['PROVIDER_ID'].tolist()

# Flag hospitals whose 2023 rating equals their 2022 rating. When a provider
# appears more than once in the 2022 file, its first row is used.
first_star_2022 = {}
for hosp_2022, rating_2022 in zip(hosps_2022, stars_2022):
    first_star_2022.setdefault(hosp_2022, rating_2022)

retained = [
    1 if hosp in first_star_2022 and first_star_2022[hosp] == star_2023 else 0
    for hosp, star_2023 in zip(hosps_2023, stars_2023)
]

for lab in dates:
    print('------------------ ', lab, ' ------------------')

    for group in groups:
        print('----------- ', group, ' -----------')

        stars_df_2024 = stars_2024_all
        if group != 'all':
            stars_df_2024 = stars_df_2024[stars_df_2024['cnt_grp'] == group]

        exp_df = pd.read_csv(stars_dir + 'Reproduce_Stars_Input/2024/SAS_output/CMS_Stars_2024_predictions_from_' + lab + '_data.csv')

        ## Replace the imputed 666666 suffixes of VHA hospitals with their original 'F' suffix
        prvdrs1 = [_restore_provider_id(p) for p in exp_df['PROVIDER_ID'].tolist()]
        exp_df['PROVIDER_ID'] = prvdrs1

        print(stars_df_2024.shape)
        print(exp_df.shape)

        # Fresh per-iteration copies tagged with the current label.
        tdf1 = info_2023.assign(month_year=lab)
        tdf2 = info_2022.assign(month_year=lab)

        print(list(tdf2))

        tdf1['Retained 2022 Star'] = retained
        gen_info = tdf1.copy(deep=True)

        print('filtering out hospitals not included in 2024 stars estimates and outcomes:')

        # Keep only rows with a 1-5 rating. The general-info ratings are
        # compared as strings; the 'star' columns are compared as numbers.
        gen_info = gen_info[gen_info['2023 overall star rating'].isin(['1','2','3','4','5'])]
        exp_df = exp_df[exp_df['star'].isin([1,2,3,4,5])]
        stars_df_2024 = stars_df_2024[stars_df_2024['star'].isin([1,2,3,4,5])]

        hosps1 = stars_df_2024['PROVIDER_ID'].unique().tolist()
        hosps2 = gen_info['PROVIDER_ID'].unique().tolist()
        hosps3 = exp_df['PROVIDER_ID'].unique().tolist()

        # Restrict all three frames to the hospitals present in every one of them.
        hosps = list(set(hosps1) & set(hosps2) & set(hosps3))
        gen_info = gen_info[gen_info['PROVIDER_ID'].isin(hosps)]
        exp_df = exp_df[exp_df['PROVIDER_ID'].isin(hosps)]
        stars_df_2024 = stars_df_2024[stars_df_2024['PROVIDER_ID'].isin(hosps)]

        print(gen_info.shape)

        get_results(gen_info, stars_df_2024, exp_df)
        print('\n\n')


------------------  July_2023  ------------------
-----------  all  -----------
(4626, 27)
(4677, 27)
['PROVIDER_ID', 'Hospital Name', 'State', '2022 overall star rating', 'Hospital Type', 'Hospital Ownership', 'Emergency Services', 'month_year']
filtering out hospitals not included in 2024 stars estimates and outcomes:
(2802, 9)
PPV = Positive Predictive Value = Precision
TPR = True Positive Rate

For 110 hospitals that were 1 star in 2023:
    83.91% PPV   87 hospitals were expected to stay a 1-star. 73 actually did.
    82.61% NPV   23 hospitals were expected to change their rating. 19 actually did.
    94.81% TPR   77 hospitals stayed a 1-star. 73 of them were expected to.
    83.64% Accuracy,   92 of 110 hospitals kept or changed their star rating when expected to.


For 295 hospitals that were 2 star in 2023:
    67.89% PPV   190 hospitals were expected to stay a 2-star. 129 actually did.
    75.24% NPV   105 hospitals were expected to change their rating. 79 actually did.
    83

filtering out hospitals not included in 2024 stars estimates and outcomes:
(2806, 9)
PPV = Positive Predictive Value = Precision
TPR = True Positive Rate

For 110 hospitals that were 1 star in 2023:
    100.00% PPV   77 hospitals were expected to stay a 1-star. 77 actually did.
    100.00% NPV   33 hospitals were expected to change their rating. 33 actually did.
    100.00% TPR   77 hospitals stayed a 1-star. 77 of them were expected to.
    100.00% Accuracy,   110 of 110 hospitals kept or changed their star rating when expected to.


For 296 hospitals that were 2 star in 2023:
    100.00% PPV   154 hospitals were expected to stay a 2-star. 154 actually did.
    99.30% NPV   142 hospitals were expected to change their rating. 141 actually did.
    99.35% TPR   155 hospitals stayed a 2-star. 154 of them were expected to.
    99.66% Accuracy,   295 of 296 hospitals kept or changed their star rating when expected to.


For 333 hospitals that were 3 star in 2023:
    99.43% PPV   174 hospi