In [91]:
import pandas as pd
import warnings
from IPython.utils import io
import sys
import numpy as np
from functools import reduce
#import xlsxwriter

from scipy.spatial import distance

# Root of the stars-data-builder checkout; the CSV paths used in this notebook
# are built by appending to this prefix.
stars_dir = '~/GitHub/stars-data-builder/'

# Lift pandas' display truncation for both columns and rows.
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, None)

# Install a blanket "ignore" filter so warnings are not shown in cell output.
warnings.simplefilter('ignore')


In [121]:
def get_results(sas_df, main_df, star_year):
    """Print a report comparing reconstructed Stars results with the original SAS output.

    The report covers, in order: the shape of each file, provider IDs present
    in only one file, the number of actual 1-5 star ratings, per-column
    agreement for the providers common to both files, star-rating agreement
    per star level and overall, and the count of hospitals at each star level.

    Parameters
    ----------
    sas_df : pandas.DataFrame
        Original SAS output. Must contain 'PROVIDER_ID' and 'star' columns.
    main_df : pandas.DataFrame
        Reconstructed (derived) output. Must contain 'PROVIDER_ID' and 'star'
        columns; every other column is compared against the column of the
        same name in ``sas_df`` when one exists.
    star_year : str or int
        Label inserted into the headings of the star-rating section.

    Returns
    -------
    None
        Everything is reported via ``print``. Neither input frame is modified.
    """
    derived_ids = main_df['PROVIDER_ID'].unique()
    print('Shape of the reconstructed output file:', main_df.shape)

    actual_ids = sas_df['PROVIDER_ID'].unique()
    print('Shape of the original SAS output file:', sas_df.shape, '\n')

    only_derived = np.setdiff1d(list(derived_ids), list(actual_ids)).tolist()
    print(len(only_derived), "hospitals in the derived results file that aren't in the actual Stars results:", only_derived)

    only_actual = np.setdiff1d(list(actual_ids), list(derived_ids)).tolist()
    print(len(only_actual), "hospitals in the actual Stars results file that aren't in the derived file:", only_actual, '\n')

    rated = sas_df[sas_df['star'].isin([1, 2, 3, 4, 5])]
    print('No. of actual star ratings:', rated.shape[0], '\n')

    # Keep only providers present in both files and sort both frames the same
    # way so rows can be paired by position. sort_values() returns new frames,
    # so the filtered slices are never mutated in place.
    common = list(set(derived_ids) & set(actual_ids))
    sas_df = sas_df[sas_df['PROVIDER_ID'].isin(common)].sort_values(by='PROVIDER_ID', ascending=False)
    main_df = main_df[main_df['PROVIDER_ID'].isin(common)].sort_values(by='PROVIDER_ID', ascending=False)

    if sas_df['PROVIDER_ID'].tolist() == main_df['PROVIDER_ID'].tolist():
        print('After filtering, the ordered list of providers are the same')
    else:
        print('After filtering, the ordered list of providers are NOT the same')
        # Every comparison below pairs rows by position; with differing ID
        # lists (e.g. duplicated IDs in one file) the results would be wrong.
        print('Rows cannot be paired one-to-one, so the comparison stops here')
        return

    print('Results for', len(common), 'common providers\n')

    _print_column_agreement(sas_df, main_df)
    print('\n')
    _print_star_agreement(sas_df, main_df, star_year)
    _print_star_counts(sas_df, main_df)


def _print_column_agreement(sas_df, main_df):
    """Print, per shared column, the % of exactly equal values and the mean absolute difference."""
    print('Percent Perfect | Avg difference:')
    for col in list(main_df):
        if col in ('PROVIDER_ID', 'star'):
            continue
        if col not in sas_df.columns:
            print(col, ': not present in the original SAS output; skipped')
            continue

        # Rows are already aligned by position; drop any pair with a missing value.
        pairs = pd.DataFrame({
            'PROVIDER_ID': main_df['PROVIDER_ID'].tolist(),
            'derived_x': main_df[col].tolist(),
            'original_x': sas_df[col].tolist(),
        }).dropna(how='any', axis=0)

        if pairs.empty:
            print(col, ': no rows with a value in both files; skipped')
            continue

        try:
            abs_diff = np.abs(pairs['derived_x'] - pairs['original_x']).tolist()
            pct_exact = 100 * abs_diff.count(0) / len(abs_diff)
            avg_diff = round(np.nanmean(abs_diff), 6)
        except (TypeError, ValueError, NotImplementedError):
            # Columns whose values cannot be subtracted (e.g. text) are not comparable.
            print(col, ': values are not numeric; skipped')
            continue

        print(col, ':', round(pct_exact, 4), '|', avg_diff)


def _print_star_agreement(sas_df, main_df, star_year):
    """Print predicted-vs-actual star agreement for each star level and for all rated hospitals."""
    pred_stars = main_df['star'].tolist()
    actual_stars = sas_df['star'].tolist()

    for star in [1, 2, 3, 4, 5, 'all']:
        if star == 'all':
            print('For all', star_year, 'hospitals with star ratings')
        else:
            print('For', star_year, 'hospitals with', str(star) + '-star ratings')

        n_correct = 0
        n_incorrect = 0
        diffs = []
        for p, a in zip(pred_stars, actual_stars):
            if pd.isna(a):
                # No actual rating to score the prediction against.
                continue
            if star != 'all' and a != star:
                continue

            if p == a:
                n_correct += 1
            else:
                n_incorrect += 1
                # A missing prediction counts as incorrect but has no signed difference.
                if p > 0:
                    diffs.append(p - a)

        print('Overall star ratings:')
        print('total correct:', n_correct)
        print('total incorrect:', n_incorrect)
        n_scored = n_correct + n_incorrect
        if n_scored > 0:
            print('% correct:', np.round(100 * n_correct / n_scored, 3), '\n')
        else:
            # Avoid dividing by zero when no hospital falls in this group.
            print('% correct: n/a (no rated hospitals in this group)', '\n')

        print('Results for differences')
        # Sorted so the lines come out in a stable order from run to run.
        for delta in sorted(set(diffs)):
            if delta < 0:
                print('Estimates were ', np.abs(delta), 'stars less than the actual for', diffs.count(delta), 'hospitals ')
            elif delta > 0:
                print('Estimates were ', np.abs(delta), 'stars greater than the actual for', diffs.count(delta), 'hospitals ')

        if len(diffs) > 0:
            perc = round(100 * (diffs.count(1) + diffs.count(-1)) / len(diffs), 2)
            perc = f'{perc:.2f}%'
            print(perc + ' of abs. differences were no greater than 1')
            print('\n')


def _print_star_counts(sas_df, main_df):
    """Print how many hospitals carry each star level in the predicted and actual results."""
    n_predicted = 0
    for star in [1, 2, 3, 4, 5]:
        predicted_rows = main_df[main_df['star'] == star]
        actual_rows = sas_df[sas_df['star'] == star]

        print(star, 'star hospitals:')
        print('No. predicted:', predicted_rows.shape[0])
        n_predicted += predicted_rows.shape[0]
        print('No. actual:', actual_rows.shape[0], '\n')

    print('No. of predicted stars:', n_predicted)
    


# 2021

In [122]:
# Original SAS output from the 2021-04 release, and the reconstructed file to check against it.
sas_df = pd.read_csv(
    stars_dir + '2021/2021-04 Stars Release/SAS_CSV_output/CMS_Stars_Apr_2021.csv'
)
main_df = pd.read_csv(
    stars_dir + 'Reproduce_Stars_Input/2021/SAS_output/CMS_Stars_Apr_2021.csv'
)

get_results(sas_df=sas_df, main_df=main_df, star_year='2021')

Shape of the reconstructed output file: (4536, 27)
Shape of the original SAS output file: (4536, 27) 

2 hospitals in the derived results file that aren't in the actual Stars results: [250152, 440133]
2 hospitals in the actual Stars results file that aren't in the derived file: [360367, 451394] 

No. of actual star ratings: 3355 

After filtering, the ordered list of providers are the same
Results for 4534 common providers

Percent Perfect | Avg difference:
Std_Outcomes_Mortality_score : 0.0 | 0.000116
Std_Outcomes_Readmission_score : 0.0 | 0.000158
Std_Outcomes_Safety_score : 0.0 | 0.000159
Std_PatientExp_score : 0.0 | 0.00013
Std_Process_score : 0.0 | 0.000289
std_weight_PatientExperience : 100.0 | 0.0
std_weight_Readmission : 100.0 | 0.0
std_weight_Mortality : 100.0 | 0.0
std_weight_safety : 100.0 | 0.0
std_weight_Process : 100.0 | 0.0
weight_PatientExperience : 100.0 | 0.0
weight_Outcomes_Readmission : 99.9544 | 9.2e-05
weight_Outcomes_Mortality : 99.975 | 1.3e-05
weight_Outcomes_S

# 2022

In [123]:
# Original SAS output from the 2022-07 release, and the reconstructed file to check against it.
sas_df = pd.read_csv(
    stars_dir + '2022/2022-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2022.csv'
)
main_df = pd.read_csv(
    stars_dir + 'Reproduce_Stars_Input/2022/SAS_output/CMS_Stars_Jul_2022.csv'
)

get_results(sas_df=sas_df, main_df=main_df, star_year='2022')

Shape of the reconstructed output file: (4491, 27)
Shape of the original SAS output file: (4489, 27) 

5 hospitals in the derived results file that aren't in the actual Stars results: [250152, 390197, 420057, 450780, 451381]
3 hospitals in the actual Stars results file that aren't in the derived file: [370214, 670109, 670265] 

No. of actual star ratings: 3121 

After filtering, the ordered list of providers are the same
Results for 4486 common providers

Percent Perfect | Avg difference:
Std_Outcomes_Mortality_score : 0.0 | 0.000647
Std_Outcomes_Readmission_score : 0.0 | 0.006948
Std_Outcomes_Safety_score : 0.0 | 0.00285
Std_PatientExp_score : 0.0 | 0.001309
Std_Process_score : 0.0 | 0.002243
std_weight_PatientExperience : 100.0 | 0.0
std_weight_Readmission : 100.0 | 0.0
std_weight_Mortality : 100.0 | 0.0
std_weight_safety : 100.0 | 0.0
std_weight_Process : 100.0 | 0.0
weight_PatientExperience : 100.0 | 0.0
weight_Outcomes_Readmission : 99.6446 | 0.001136
weight_Outcomes_Mortality : 9

# 2023

In [124]:
# Reconstructed output for the July 2023 release.
main_df = pd.read_csv(
    stars_dir + 'Reproduce_Stars_Input/2023/SAS_output/CMS_Stars_Jul_2023.csv'
)

## VHA hospitals: swap the imputed 666666 marker back to the original 'F' suffix,
## then left-pad every ID with zeros to six characters.
prvdrs1 = [
    (pid[:-6] + 'F' if '666666' in pid else pid).rjust(6, '0')
    for pid in map(str, main_df['PROVIDER_ID'].tolist())
]
main_df['PROVIDER_ID'] = prvdrs1

# Original SAS output from the 2023-07 release.
sas_df = pd.read_csv(
    stars_dir + '2023/2023-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2023.csv'
)

get_results(sas_df=sas_df, main_df=main_df, star_year='2023')

Shape of the reconstructed output file: (4684, 27)
Shape of the original SAS output file: (4654, 27) 

33 hospitals in the derived results file that aren't in the actual Stars results: ['02013F', '02014F', '05015F', '05020F', '05022F', '05039F', '05041F', '06003F', '10013F', '10021F', '11032F', '11033F', '11035F', '12001F', '17002F', '18003F', '19050F', '21007F', '25039F', '26002F', '29001F', '33025F', '34011F', '34014F', '36006F', '42009F', '45068F', '45069F', '45070F', '49001F', '49005F', '49008F', '50005F']
3 hospitals in the actual Stars results file that aren't in the derived file: ['251329', '640001', '670265'] 

No. of actual star ratings: 3076 

After filtering, the ordered list of providers are the same
Results for 4651 common providers

Percent Perfect | Avg difference:
Std_Outcomes_Mortality_score : 0.0 | 0.001637
Std_Outcomes_Readmission_score : 0.0 | 0.002273
Std_Outcomes_Safety_score : 0.0 | 0.01296
Std_PatientExp_score : 0.0 | 0.009845
Std_Process_score : 0.0 | 0.005764


# 2024

## Predictions made in July 2023

In [125]:
# 2024 predictions file built from July 2023 data.
main_df = pd.read_csv(
    stars_dir + 'Reproduce_Stars_Input/2024/SAS_output/CMS_Stars_2024_predictions_from_July_2023_data.csv'
)

## VHA hospitals: swap the imputed 666666 marker back to the original 'F' suffix,
## then left-pad every ID with zeros to six characters.
prvdrs1 = [
    (pid[:-6] + 'F' if '666666' in pid else pid).rjust(6, '0')
    for pid in map(str, main_df['PROVIDER_ID'].tolist())
]
main_df['PROVIDER_ID'] = prvdrs1

# Original SAS output from the 2024-07 release.
sas_df = pd.read_csv(
    stars_dir + '2024/2024-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2024.csv'
)

get_results(sas_df=sas_df, main_df=main_df, star_year='2024')

Shape of the reconstructed output file: (4677, 27)
Shape of the original SAS output file: (4626, 27) 

72 hospitals in the derived results file that aren't in the actual Stars results: ['02013F', '02014F', '030077', '05015F', '05020F', '05022F', '05039F', '05041F', '050568', '050742', '06003F', '070039', '100070', '100081', '10013F', '10021F', '110130', '11032F', '11033F', '11035F', '12001F', '140040', '150004', '151335', '160008', '17002F', '18003F', '190146', '190307', '190314', '19050F', '21007F', '240063', '241300', '250152', '250163', '25039F', '26002F', '260064', '260209', '29001F', '290022', '330236', '33025F', '34011F', '34014F', '360037', '36006F', '370229', '390072', '420066', '420068', '42009F', '421303', '430013', '450078', '450099', '450348', '450586', '45068F', '450697', '45069F', '45070F', '450877', '451325', '451328', '49001F', '49005F', '49008F', '50005F', '501301', '670004']
21 hospitals in the actual Stars results file that aren't in the derived file: ['031320', '041

## Predictions made in Oct 2023

In [126]:
# 2024 predictions file built from Oct 2023 data.
main_df = pd.read_csv(
    stars_dir + 'Reproduce_Stars_Input/2024/SAS_output/CMS_Stars_2024_predictions_from_Oct_2023_data.csv'
)

## VHA hospitals: swap the imputed 666666 marker back to the original 'F' suffix,
## then left-pad every ID with zeros to six characters.
prvdrs1 = [
    (pid[:-6] + 'F' if '666666' in pid else pid).rjust(6, '0')
    for pid in map(str, main_df['PROVIDER_ID'].tolist())
]
main_df['PROVIDER_ID'] = prvdrs1

# Original SAS output from the 2024-07 release.
sas_df = pd.read_csv(
    stars_dir + '2024/2024-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2024.csv'
)

get_results(sas_df=sas_df, main_df=main_df, star_year='2024')

Shape of the reconstructed output file: (4677, 27)
Shape of the original SAS output file: (4626, 27) 

72 hospitals in the derived results file that aren't in the actual Stars results: ['02013F', '02014F', '030077', '05015F', '05020F', '05022F', '05039F', '05041F', '050568', '050742', '06003F', '070039', '100070', '100081', '10013F', '10021F', '110130', '11032F', '11033F', '11035F', '12001F', '140040', '150004', '151335', '160008', '17002F', '18003F', '190146', '190307', '190314', '19050F', '21007F', '240063', '241300', '250152', '250163', '25039F', '26002F', '260064', '260209', '29001F', '290022', '330236', '33025F', '34011F', '34014F', '360037', '36006F', '370229', '390072', '420066', '420068', '42009F', '421303', '430013', '450078', '450099', '450348', '450586', '45068F', '450697', '45069F', '45070F', '450877', '451325', '451328', '49001F', '49005F', '49008F', '50005F', '501301', '670004']
21 hospitals in the actual Stars results file that aren't in the derived file: ['031320', '041

## Predictions made in Nov 2023

In [127]:
# 2024 predictions file built from Nov 2023 data.
main_df = pd.read_csv(
    stars_dir + 'Reproduce_Stars_Input/2024/SAS_output/CMS_Stars_2024_predictions_from_Nov_2023_data.csv'
)

## VHA hospitals: swap the imputed 666666 marker back to the original 'F' suffix,
## then left-pad every ID with zeros to six characters.
prvdrs1 = [
    (pid[:-6] + 'F' if '666666' in pid else pid).rjust(6, '0')
    for pid in map(str, main_df['PROVIDER_ID'].tolist())
]
main_df['PROVIDER_ID'] = prvdrs1

# Original SAS output from the 2024-07 release.
sas_df = pd.read_csv(
    stars_dir + '2024/2024-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2024.csv'
)

get_results(sas_df=sas_df, main_df=main_df, star_year='2024')

Shape of the reconstructed output file: (4673, 27)
Shape of the original SAS output file: (4626, 27) 

60 hospitals in the derived results file that aren't in the actual Stars results: ['02013F', '02014F', '05015F', '05020F', '05022F', '05039F', '05041F', '050568', '050742', '06003F', '070039', '100070', '100081', '10013F', '10021F', '11032F', '11033F', '11035F', '12001F', '140040', '150004', '160008', '17002F', '18003F', '190146', '190307', '190314', '19050F', '21007F', '250152', '25039F', '26002F', '260064', '260209', '29001F', '290022', '330236', '33025F', '34011F', '34014F', '360037', '36006F', '370229', '420066', '420068', '42009F', '421303', '450078', '450099', '450586', '45068F', '45069F', '45070F', '450877', '451328', '49001F', '49005F', '49008F', '50005F', '670004']
13 hospitals in the actual Stars results file that aren't in the derived file: ['030074', '031320', '041332', '170208', '251329', '281316', '281330', '431340', '451397', '451398', '640001', '670265', '670319'] 

No

## Predictions made in January 2024

In [128]:
# 2024 predictions file built from Jan 2024 data.
main_df = pd.read_csv(
    stars_dir + 'Reproduce_Stars_Input/2024/SAS_output/CMS_Stars_2024_predictions_from_Jan_2024_data.csv'
)

## VHA hospitals: swap the imputed 666666 marker back to the original 'F' suffix,
## then left-pad every ID with zeros to six characters.
prvdrs1 = [
    (pid[:-6] + 'F' if '666666' in pid else pid).rjust(6, '0')
    for pid in map(str, main_df['PROVIDER_ID'].tolist())
]
main_df['PROVIDER_ID'] = prvdrs1

# Original SAS output from the 2024-07 release.
sas_df = pd.read_csv(
    stars_dir + '2024/2024-07 Stars Release/SAS_CSV_output/CMS_Stars_Jul_2024.csv'
)

get_results(sas_df=sas_df, main_df=main_df, star_year='2024')

Shape of the reconstructed output file: (4654, 27)
Shape of the original SAS output file: (4626, 27) 

34 hospitals in the derived results file that aren't in the actual Stars results: ['02013F', '02014F', '05015F', '05020F', '05022F', '05039F', '05041F', '06003F', '10013F', '10021F', '11032F', '11033F', '11035F', '12001F', '17002F', '18003F', '19050F', '21007F', '250152', '25039F', '26002F', '29001F', '33025F', '34011F', '34014F', '36006F', '370229', '45068F', '45069F', '45070F', '49001F', '49005F', '49008F', '50005F']
6 hospitals in the actual Stars results file that aren't in the derived file: ['170208', '251329', '451397', '451398', '640001', '670265'] 

No. of actual star ratings: 2847 

After filtering, the ordered list of providers are the same
Results for 4620 common providers

Percent Perfect | Avg difference:
Std_Outcomes_Mortality_score : 0.0 | 0.000247
Std_Outcomes_Readmission_score : 0.0 | 0.001185
Std_Outcomes_Safety_score : 0.0 | 0.003465
Std_PatientExp_score : 0.0 | 0.0