In [2]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import warnings
from scipy import stats
from scipy.stats import percentileofscore

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

pd.set_option("display.precision", 10)

## Import file for validation

This file was produced via the CMS Stars SAS code

In [3]:
stars_dir = '~/GitHub/stars-data-builder/'
path = stars_dir + 'Reproduce_Stars_Input/2024/SAS_output/CMS_Stars_2024_predictions_from_Jan_2024_data.csv'
valid_df = pd.read_csv(path)

valid_df = valid_df[~valid_df['star'].isin([np.nan, float("NaN")])]

cols = ['Std_Outcomes_Mortality_score', 'Std_Outcomes_Readmission_score', 
        'Std_Outcomes_Safety_score', 'Std_PatientExp_score', 
        'Std_Process_score', 'std_weight_PatientExperience', 
        'std_weight_Readmission', 'std_weight_Mortality', 
        'std_weight_safety', 'std_weight_Process', 
        'weight_PatientExperience', 'weight_Outcomes_Readmission', 
        'weight_Outcomes_Mortality', 'weight_Outcomes_Safety', 
        'weight_Process', 'summary_score', 
        'Outcomes_Mortality_cnt', 'Outcomes_safety_cnt', 
        'Outcomes_Readmission_cnt', 'Patient_Experience_cnt', 
        'Process_cnt', 'Total_measure_group_cnt', 
        'MortSafe_Group_cnt', 'report_indicator',
       ]

for col in cols:
    valid_df[col] = valid_df[col].astype(float)
print(valid_df.shape)
valid_df.head()

(2852, 27)


Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Std_Outcomes_Readmission_score,Std_Outcomes_Safety_score,Std_PatientExp_score,Std_Process_score,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process,weight_PatientExperience,weight_Outcomes_Readmission,weight_Outcomes_Mortality,weight_Outcomes_Safety,weight_Process,summary_score,Outcomes_Mortality_cnt,Outcomes_safety_cnt,Outcomes_Readmission_cnt,Patient_Experience_cnt,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,report_indicator,cnt_grp,star
0,10001,0.007134504,0.2312259238,-0.145520453,0.1268857714,-0.682878658,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.033605775,7.0,7.0,11.0,8.0,10.0,5.0,2.0,1.0,3) # of groups=5,3.0
1,10005,-1.441028,0.7203393325,-0.087122055,-0.264676975,-0.581469391,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.305723621,6.0,7.0,9.0,8.0,11.0,5.0,2.0,1.0,3) # of groups=5,2.0
2,10006,-1.463006419,-0.268615037,-0.172398466,-1.10406966,-0.76034002,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.75302051,7.0,7.0,9.0,8.0,11.0,5.0,2.0,1.0,3) # of groups=5,1.0
3,10007,-3.528709906,-0.470133322,0.3939170917,0.1891476221,-1.81045547,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.96872593,3.0,2.0,7.0,8.0,7.0,4.0,1.0,1.0,2) # of groups=4,1.0
5,10011,-0.476908618,-0.401501488,0.3462801505,-0.040622639,-1.055261064,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.252636898,7.0,7.0,9.0,8.0,8.0,5.0,2.0,1.0,3) # of groups=5,3.0


## Import data

In [4]:
raw_data = pd.read_csv(stars_dir + "Reproduce_Stars_Input/2024/Input_File/data_for_2024_prognostications_from_Jan2024.csv")
raw_data.dropna(how='all', axis=0, inplace=True)
for c in list(raw_data):
    if c == 'PROVIDER_ID': 
        continue
    else:
        raw_data[c] = raw_data[c].astype(float)
print(raw_data.shape)
raw_data.head()

(4654, 95)


Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_8_DEN,OP_10,OP_10_DEN,OP_13,OP_13_DEN,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,HCP_COVID_19_DEN,HCP_COVID_19,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,9149.0,17310.0,214.0,76.0,104733.0,104733.0,9.597,24.766,5.994,0.712,11.4,67.066,0.938,0.363,1.335,,0.965,0.507,0.142,0.038,23.4,0.19,-15.4,23.6,0.148,0.18,0.089,0.088,0.12,0.027,2912.0,49.0,614.0,117.0,274.0,403.0,398.0,400.0,549.0,107.0,278.0,49.0,,,,,0.38,,0.061,,0.028,,214.0,348.0,0.05,52960.0,,,0.47,17.0,184.68,134.0,1.21,3905.0,0.95,,0.836,0.0,32.0,0.65,127.0,15.0,3.0,3.0,3.0,3.0,4.0,4.0,3.5,3.5,544.0,,0.041,132.0,0.105,126.0,12.9,170.0,11.9,202.0,4.9,202.0,1.1,668.0
1,10005,3194.0,8277.0,96.0,34.0,36794.0,34887.0,1.989,4.019,2.626,0.324,1.847,10.066,2.514,0.995,0.762,,0.541,0.497,0.128,0.034,22.1,0.166,,-6.2,0.153,0.233,0.149,0.099,0.136,0.023,1052.0,172.0,129.0,136.0,,285.0,81.0,289.0,121.0,126.0,27.0,155.0,,,57.0,16.0,0.477,,0.12,,0.042,,145.0,1074.0,0.03,56820.0,0.58,12.0,0.96,180.0,183.49,43.0,0.97,2700.0,0.8,,0.807,0.02,200.0,0.69,252.0,18.0,3.0,4.0,1.0,3.0,4.0,3.0,3.0,3.0,824.0,,,,,,14.2,739.0,7.9,107.0,5.5,107.0,1.9,406.0
2,10006,5343.0,8715.0,111.0,14.0,63727.0,60304.0,5.801,11.166,2.95,0.124,5.283,27.805,0.172,0.358,0.0,,1.514,0.072,0.134,0.053,-4.7,0.176,28.1,-0.4,0.172,0.195,0.125,0.099,0.165,0.046,2310.0,138.0,441.0,158.0,273.0,472.0,227.0,469.0,388.0,148.0,254.0,145.0,,,,,0.462,,0.101,,0.045,,168.0,360.0,0.01,42286.0,0.75,16.0,0.85,82.0,173.63,96.0,1.17,2536.0,0.67,,0.796,0.04,28.0,0.57,126.0,19.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,2.5,1503.0,,0.036,95.0,0.124,89.0,12.1,1355.0,,,,,1.4,484.0
3,10007,343.0,1193.0,3.0,,5511.0,5511.0,0.21,0.652,0.089,,0.133,2.66,,,,,,0.376,0.157,0.042,-1.9,0.2,,-9.4,,0.285,0.125,0.137,,,258.0,26.0,31.0,34.0,,72.0,,88.0,26.0,34.0,,,,,,,,,0.034,,,,132.0,1275.0,0.04,11202.0,,,0.23,111.0,,,0.95,350.0,0.53,,0.601,,,0.93,43.0,24.0,3.0,5.0,4.0,3.0,3.0,3.0,3.5,3.0,189.0,,,,,,13.4,109.0,,,,,1.2,59.0
4,10008,41.0,444.0,,,2376.0,2376.0,0.023,0.242,,,0.055,0.436,,,,,,,0.148,,,,,,,,,,,,69.0,,,,,,,,,,,,,,,,,,0.0,,,,116.0,340.0,0.0,6239.0,,,0.67,24.0,,,,126.0,0.45,,0.797,,,,,26.0,,,,,,,,,46.0,,,,,,12.9,42.0,,,,,,


## Filter data

In [5]:
# Define the measures you're interested in
measures = ['MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF',
            'MORT_30_PN', 'MORT_30_STK', 'PSI_4_SURG_COMP', 'COMP_HIP_KNEE',
            'HAI_1', 'HAI_2', 'HAI_3', 'HAI_4', 'HAI_5', 'HAI_6', 
            'PSI_90_SAFETY', 'EDAC_30_AMI', 'EDAC_30_HF',
            'EDAC_30_PN', 'OP_32', 'READM_30_CABG', 'READM_30_COPD',
            'READM_30_HIP_KNEE', 'READM_30_HOSP_WIDE', 'OP_35_ADM', 
            'OP_35_ED', 'OP_36', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 
            'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 
            'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 
            'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'HCP_COVID_19', 
            'IMM_3', 'OP_10', 'OP_13', 'OP_18B', 'OP_2', 'OP_22',
            'OP_23', 'OP_29', 'OP_3B', 'OP_8', 'PC_01', 'SEP_1',
           ]

print(len(measures), 'measures')
prvdrs = raw_data['PROVIDER_ID']
raw_data = raw_data.filter(items=measures)
filtered_data = raw_data.dropna(axis=1, thresh=101)
filtered_measures = list(filtered_data)

excluded = [item for item in measures if item not in filtered_measures]
print('Excluded measure(s):', excluded)
filtered_data.dropna(how='all', subset=filtered_measures, axis=0, inplace=True)

print('Shape of filtered dataframe:', filtered_data.shape)
print('Final no. of measures:', filtered_data.shape[1])
filtered_data['PROVIDER_ID'] = prvdrs
filtered_data = filtered_data[filtered_data.columns[-1:].tolist() + filtered_data.columns[:-1].tolist()]

tdf = filtered_data.copy(deep=True)

prvdrs1 = []
for p in tdf['PROVIDER_ID'].tolist():
    p = str(p)
    if '666666' in p:
        p = p[:-6]
        p = p + 'F'
    while len(p) < 6:
        p = '0' + p
    prvdrs1.append(p)
    
tdf['PROVIDER_ID'] = prvdrs1

tdf.to_pickle(stars_dir + 'FilesForApp/filtered_raw_data.pkl')
filtered_data.head()


47 measures
Excluded measure(s): ['OP_2']
Shape of filtered dataframe: (4654, 46)
Final no. of measures: 46


Unnamed: 0,PROVIDER_ID,MORT_30_AMI,MORT_30_CABG,MORT_30_COPD,MORT_30_HF,MORT_30_PN,MORT_30_STK,PSI_4_SURG_COMP,COMP_HIP_KNEE,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,PSI_90_SAFETY,EDAC_30_AMI,EDAC_30_HF,EDAC_30_PN,OP_32,READM_30_CABG,READM_30_COPD,READM_30_HIP_KNEE,READM_30_HOSP_WIDE,OP_35_ADM,OP_35_ED,OP_36,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,HCP_COVID_19,IMM_3,OP_10,OP_13,OP_18B,OP_22,OP_23,OP_29,OP_3B,OP_8,PC_01,SEP_1
0,10001,0.12,0.041,0.088,0.089,0.18,0.148,184.68,0.027,0.938,0.363,1.335,,0.965,0.507,1.21,-15.4,23.4,23.6,12.9,0.105,0.19,0.038,0.142,11.9,4.9,1.1,3.0,3.0,3.0,3.0,4.0,4.0,3.5,3.5,0.836,0.95,0.061,0.028,214.0,0.05,,0.47,,0.38,0.0,0.65
1,10005,0.136,,0.099,0.149,0.233,0.153,183.49,0.023,2.514,0.995,0.762,,0.541,0.497,0.97,,22.1,-6.2,14.2,,0.166,0.034,0.128,7.9,5.5,1.9,3.0,4.0,1.0,3.0,4.0,3.0,3.0,3.0,0.807,0.8,0.12,0.042,145.0,0.03,0.58,0.96,57.0,0.477,0.02,0.69
2,10006,0.165,0.036,0.099,0.125,0.195,0.172,173.63,0.046,0.172,0.358,0.0,,1.514,0.072,1.17,28.1,-4.7,-0.4,12.1,0.124,0.176,0.053,0.134,,,1.4,2.0,3.0,2.0,2.0,3.0,2.0,2.0,2.5,0.796,0.67,0.101,0.045,168.0,0.01,0.75,0.85,,0.462,0.04,0.57
3,10007,,,0.137,0.125,0.285,,,,,,,,,0.376,0.95,,-1.9,-9.4,13.4,,0.2,0.042,0.157,,,1.2,3.0,5.0,4.0,3.0,3.0,3.0,3.5,3.0,0.601,0.53,0.034,,132.0,0.04,,0.23,,,,0.93
4,10008,,,,,,,,,,,,,,,,,,,12.9,,,,0.148,,,,,,,,,,,,0.797,0.45,0.0,,116.0,0.0,,0.67,,,,


## Z-score data and reverse select measures

In [6]:
ddof = 1
zscore_df = filtered_data.copy(deep=True)
for m in measures:
    if m in excluded:
        continue
        
    zscore_df[m] = stats.zscore(zscore_df[m], ddof=ddof, nan_policy='omit')

print('Shape of z-scored dataframe:', zscore_df.shape)

rev_measures = ['MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF',
                'MORT_30_PN', 'MORT_30_STK', 'PSI_4_SURG_COMP', 'COMP_HIP_KNEE', 
                'HAI_1', 'HAI_2', 'HAI_3', 'HAI_4', 'HAI_5', 'HAI_6',
                'PSI_90_SAFETY', 'EDAC_30_AMI', 'EDAC_30_HF', 'EDAC_30_PN',
                'OP_32', 'READM_30_CABG', 'READM_30_COPD', 
                'READM_30_HIP_KNEE', 'READM_30_HOSP_WIDE',
                'OP_35_ADM', 'OP_35_ED', 'OP_36', 'OP_22',
                'PC_01', 'OP_3B', 'OP_18B', 'OP_8', 
                'OP_10','OP_13',
               ]
for m in rev_measures:
    zscore_df[m] = -1*zscore_df[m]
    zscore_df[m] = zscore_df[m]

zscore_df.head()

# Examples from SAS file
# 0.4705137179
# -0.91438786 # SAS rounded 
# -3.424521969

Shape of z-scored dataframe: (4654, 47)


Unnamed: 0,PROVIDER_ID,MORT_30_AMI,MORT_30_CABG,MORT_30_COPD,MORT_30_HF,MORT_30_PN,MORT_30_STK,PSI_4_SURG_COMP,COMP_HIP_KNEE,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,PSI_90_SAFETY,EDAC_30_AMI,EDAC_30_HF,EDAC_30_PN,OP_32,READM_30_CABG,READM_30_COPD,READM_30_HIP_KNEE,READM_30_HOSP_WIDE,OP_35_ADM,OP_35_ED,OP_36,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,HCP_COVID_19,IMM_3,OP_10,OP_13,OP_18B,OP_22,OP_23,OP_29,OP_3B,OP_8,PC_01,SEP_1
0,10001,0.4705137179,-1.5350742405,0.2829957684,1.5108845871,0.1300305525,-0.5542620037,-0.7483321586,0.8793924805,-0.2331689661,0.48853745,-0.7116600065,,-0.210158867,0.0059299762,-1.1621596577,1.1232307267,-0.7917569396,-0.6346206597,0.2926120951,0.5848354227,0.2938048537,0.8863152668,0.42374607,-1.0329911759,0.6276573209,-0.2532054567,-0.380632695,-0.4078647314,-0.2523135255,0.1100686308,0.4713109565,0.8607688087,0.1296570796,0.3528201905,-0.4315950001,0.9400052796,-0.0288656185,0.4187845622,-0.9452525091,-1.1424873298,,-3.1208869753,,0.0004123468,0.5837759922,0.3696093386
1,10005,-0.9143878597,,-0.5249631546,-1.5848085649,-1.9536338559,-0.8290759217,-0.6925039174,1.6088875234,-2.4345339372,-0.5396239924,0.1145984019,,0.4216690755,0.0268715981,0.176161121,,-0.7370645765,0.4860195315,-1.0368289435,,2.3759021373,1.6416592219,2.0328416327,1.8894389833,-0.0726712385,-2.9668760824,-0.380632695,0.650115599,-1.995765203,0.1100686308,0.4713109565,-0.1351064678,-0.3950916974,-0.168489642,-0.7225439756,0.1237563383,-1.2107851979,-0.383916169,0.3087365827,-0.2748866468,-0.6220448114,0.3344519438,0.2778029214,-1.3425854224,0.0928376915,0.6040189896
2,10006,-3.424521969,-0.8319564096,-0.5249631546,-0.3465313041,-0.4596857895,-1.87336881,-0.229927062,-2.5857089733,0.8367838054,0.4966716386,1.2133923063,,-1.0282568397,0.9168905293,-0.9391061946,-1.1188725698,0.3904395247,0.2679083534,1.1107296572,-1.4624571504,1.5083616024,-1.9462245648,1.3432292487,,,-1.2708319414,-1.3703384574,-0.4078647314,-1.1240393642,-0.957203986,-0.5655095002,-1.1309817444,-1.4445892514,-0.6897994745,-0.8329039318,-0.5836594108,-0.8301670283,-0.5559234686,-0.1092597812,0.5927140362,0.2428119286,-0.4412363849,,-1.134905355,-0.3981006093,-0.0992099633
3,10007,,,-3.3160939797,-0.3465313041,-3.9979838415,,,,,,,,,0.2802652232,0.2876878525,,0.2726405887,0.6063567333,-0.2187113813,,-0.5737356812,0.1309713117,-1.3002848901,,,-0.592414285,-0.380632695,1.7080959294,0.6194123133,0.1100686308,-0.5655095002,-0.1351064678,0.1296570796,-0.168489642,-2.7892849738,-1.3454917559,0.5120128331,,0.5449953971,-0.7086869883,,-4.8132978744,,,,2.0104768953
4,10008,,,,,,,,,,,,,,,,,,,0.2926120951,,,,-0.2658663141,,,,,,,,,,,,-0.8228712085,-1.7808245246,1.1931190314,,0.8357754763,1.0265143777,,-1.7105445593,,,,


## Measure Group Scores

In [7]:

final_df = pd.DataFrame(columns=['PROVIDER_ID'])
final_df['PROVIDER_ID'] = zscore_df['PROVIDER_ID']

# 7 Mortality measures
mort_measures = ['MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF', 
                 'MORT_30_PN', 'MORT_30_STK', 'PSI_4_SURG_COMP']
final_df['Std_Outcomes_Mortality_score'] = stats.zscore(zscore_df[mort_measures].mean(axis=1), ddof=ddof, nan_policy='omit')
final_df['Outcomes_Mortality_cnt'] = zscore_df[mort_measures].apply(lambda row: row.notna().sum(), axis=1)


# 11 Readmission measures
readm_measures = ['EDAC_30_AMI', 'EDAC_30_HF', 'EDAC_30_PN', 'OP_32',
                  'READM_30_CABG', 'READM_30_COPD', 'READM_30_HIP_KNEE', 
                  'READM_30_HOSP_WIDE', 'OP_35_ADM', 'OP_35_ED', 'OP_36']
final_df['Std_Outcomes_Readmission_score'] = stats.zscore(zscore_df[readm_measures].mean(axis=1), ddof=ddof, nan_policy='omit')
final_df['Outcomes_Readmission_cnt'] = zscore_df[readm_measures].apply(lambda row: row.notna().sum(), axis=1)


# 8 SAFETY measures
safety_measures = ['COMP_HIP_KNEE',  'HAI_1', 'HAI_2', 'HAI_3', 'HAI_4', 
                   'HAI_5', 'HAI_6', 'PSI_90_SAFETY']
final_df['Std_Outcomes_Safety_score'] = stats.zscore(zscore_df[safety_measures].mean(axis=1), ddof=ddof, nan_policy='omit')
final_df['Outcomes_safety_cnt'] = zscore_df[safety_measures].apply(lambda row: row.notna().sum(), axis=1)


# 8 Patient experience measures
patexp_measures = ['H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 
                   'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 
                   'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 
                   'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING']
final_df['Std_PatientExp_score'] = stats.zscore(zscore_df[patexp_measures].mean(axis=1), ddof=ddof, nan_policy='omit')
final_df['Patient_Experience_cnt'] = zscore_df[patexp_measures].apply(lambda row: row.notna().sum(), axis=1)


# 13 Process measures
proc_measures = ['HCP_COVID_19', 'IMM_3', 'OP_10', 'OP_13', 'OP_18B', 
                 #'OP_2', 
                 'OP_22', 'OP_23', 'OP_29', 'OP_3B',  
                 'OP_8', 'PC_01', 'SEP_1']
final_df['Std_Process_score'] = stats.zscore(zscore_df[proc_measures].mean(axis=1), ddof=ddof, nan_policy='omit')
final_df['Process_cnt'] = zscore_df[proc_measures].apply(lambda row: row.notna().sum(), axis=1)


mort_cnts = final_df['Outcomes_Mortality_cnt'].tolist()
safe_cnts = final_df['Outcomes_safety_cnt'].tolist()
read_cnts = final_df['Outcomes_Readmission_cnt'].tolist()
pate_cnts = final_df['Patient_Experience_cnt'].tolist()
proc_cnts = final_df['Process_cnt'].tolist()

tot_cnts = []
msg_cnts = []
grp_cnts = []
for i, m in enumerate(mort_cnts):
    ct = 0
    ct2 = 0
    if m > 2:
        ct += 1
        ct2 +=1
    if safe_cnts[i] > 2:
        ct += 1
        ct2 += 1
    if read_cnts[i] > 2:
        ct += 1
    if pate_cnts[i] > 2:
        ct += 1
    if proc_cnts[i] > 2:
        ct += 1
        
    tot_cnts.append(ct)
    msg_cnts.append(ct2)
    if ct == 3:
        grp_cnts.append('1) # of groups=3')
    elif ct == 4:
        grp_cnts.append('2) # of groups=4')
    elif ct == 5:
        grp_cnts.append('3) # of groups=5')
    else:
        grp_cnts.append('Not grouped')

final_df['Total_measure_group_cnt'] = tot_cnts
final_df['MortSafe_Group_cnt'] = msg_cnts
final_df['cnt_grp'] = grp_cnts


# Add standard group measure weights
final_df['std_weight_PatientExperience'] = 0.22
final_df['std_weight_Readmission'] = 0.22
final_df['std_weight_Mortality'] = 0.22
final_df['std_weight_safety'] = 0.22
final_df['std_weight_Process'] = 0.12

print(final_df.shape)
final_df.head()

(4654, 19)


Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Outcomes_Mortality_cnt,Std_Outcomes_Readmission_score,Outcomes_Readmission_cnt,Std_Outcomes_Safety_score,Outcomes_safety_cnt,Std_PatientExp_score,Patient_Experience_cnt,Std_Process_score,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,cnt_grp,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process
0,10001,0.007134504,7,0.2312259238,11,-0.1455204533,7,0.1268857714,8,-0.6803237137,10,5,2,3) # of groups=5,0.22,0.22,0.22,0.22,0.12
1,10005,-1.4410280005,6,0.7203393325,9,-0.0871220551,7,-0.2646769751,8,-0.492389631,12,5,2,3) # of groups=5,0.22,0.22,0.22,0.22,0.12
2,10006,-1.4630064187,7,-0.2686150368,9,-0.1723984657,7,-1.1040696602,8,-0.7577044641,11,5,2,3) # of groups=5,0.22,0.22,0.22,0.22,0.12
3,10007,-3.5287099063,3,-0.4701333217,7,0.3939170917,2,0.1891476221,8,-1.806727088,7,4,1,2) # of groups=4,0.22,0.22,0.22,0.22,0.12
4,10008,,0,-0.0006194855,2,,0,,0,-0.4462841583,6,1,0,Not grouped,0.22,0.22,0.22,0.22,0.12


## Add redistributed measure group weights


In [8]:
import pandas as pd

# Sample DataFrame setup
# final_df = ... (your DataFrame)

# Standard weights and their corresponding score columns
weights_info = {
    'Std_PatientExp_score': ('weight_PatientExperience', 0.22),
    'Std_Outcomes_Readmission_score': ('weight_Outcomes_Readmission', 0.22),
    'Std_Outcomes_Mortality_score': ('weight_Outcomes_Mortality', 0.22),
    'Std_Outcomes_Safety_score': ('weight_Outcomes_Safety', 0.22),
    'Std_Process_score': ('weight_Process', 0.12)
}

# Function to adjust weights
def adjust_weights(row):
    # Extract scores and check for NaN
    scores = {score: row[score] for score in weights_info.keys()}
    non_missing_scores = {k: v for k, v in scores.items() if pd.notnull(v)}
    
    # Sum of weights for non-missing scores
    sum_weights = sum(weights_info[k][1] for k in non_missing_scores.keys())
    
    # Assign adjusted weights or 0 if score is missing
    for score, (new_col, weight) in weights_info.items():
        if score in non_missing_scores:
            row[new_col] = weight / sum_weights
        else:
            row[new_col] = 0  # Set weight to 0 if score is missing
    
    return row

# Apply the function to each row
final_df = final_df.apply(adjust_weights, axis=1)

final_df['PROVIDER_ID'] = final_df['PROVIDER_ID'].astype(int)
final_df.head()

Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Outcomes_Mortality_cnt,Std_Outcomes_Readmission_score,Outcomes_Readmission_cnt,Std_Outcomes_Safety_score,Outcomes_safety_cnt,Std_PatientExp_score,Patient_Experience_cnt,Std_Process_score,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,cnt_grp,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process,weight_PatientExperience,weight_Outcomes_Readmission,weight_Outcomes_Mortality,weight_Outcomes_Safety,weight_Process
0,10001,0.007134504,7,0.2312259238,11,-0.1455204533,7,0.1268857714,8,-0.6803237137,10,5,2,3) # of groups=5,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12
1,10005,-1.4410280005,6,0.7203393325,9,-0.0871220551,7,-0.2646769751,8,-0.492389631,12,5,2,3) # of groups=5,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12
2,10006,-1.4630064187,7,-0.2686150368,9,-0.1723984657,7,-1.1040696602,8,-0.7577044641,11,5,2,3) # of groups=5,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12
3,10007,-3.5287099063,3,-0.4701333217,7,0.3939170917,2,0.1891476221,8,-1.806727088,7,4,1,2) # of groups=4,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12
4,10008,,0,-0.0006194855,2,,0,,0,-0.4462841583,6,1,0,Not grouped,0.22,0.22,0.22,0.22,0.12,0.0,0.6470588235,0.0,0.0,0.3529411765


## Calculate summary scores

In [9]:
# Define score columns and their corresponding adjusted weight columns
score_columns = [
    'Std_PatientExp_score',
    'Std_Outcomes_Readmission_score',
    'Std_Outcomes_Mortality_score',
    'Std_Outcomes_Safety_score',
    'Std_Process_score'
]
weight_columns = [
    'weight_PatientExperience',
    'weight_Outcomes_Readmission',
    'weight_Outcomes_Mortality',
    'weight_Outcomes_Safety',
    'weight_Process'
]

# Calculate weighted average for each row
final_df['summary_score'] = final_df.apply(lambda row: sum(row[score] * row[weight] for score, weight in zip(score_columns, weight_columns) if pd.notnull(row[score])), axis=1)

# 'weighted_average' column now contains the weighted average of the 5 measures for each row

final_df.head()

Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Outcomes_Mortality_cnt,Std_Outcomes_Readmission_score,Outcomes_Readmission_cnt,Std_Outcomes_Safety_score,Outcomes_safety_cnt,Std_PatientExp_score,Patient_Experience_cnt,Std_Process_score,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,cnt_grp,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process,weight_PatientExperience,weight_Outcomes_Readmission,weight_Outcomes_Mortality,weight_Outcomes_Safety,weight_Process,summary_score
0,10001,0.007134504,7,0.2312259238,11,-0.1455204533,7,0.1268857714,8,-0.6803237137,10,5,2,3) # of groups=5,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.0332991815
1,10005,-1.4410280005,6,0.7203393325,9,-0.0871220551,7,-0.2646769751,8,-0.492389631,12,5,2,3) # of groups=5,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.2950340493
2,10006,-1.4630064187,7,-0.2686150368,9,-0.1723984657,7,-1.1040696602,8,-0.7577044641,11,5,2,3) # of groups=5,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.7527042436
3,10007,-3.5287099063,3,-0.4701333217,7,0.3939170917,2,0.1891476221,8,-1.806727088,7,4,1,2) # of groups=4,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.9682785237
4,10008,,0,-0.0006194855,2,,0,,0,-0.4462841583,6,1,0,Not grouped,0.22,0.22,0.22,0.22,0.12,0.0,0.6470588235,0.0,0.0,0.3529411765,-0.1579128994


## Add columns for measure counts

In [10]:
print(final_df.shape)
final_df = final_df[final_df['cnt_grp'] != 'Not grouped']
final_df = final_df[final_df['MortSafe_Group_cnt'] > 0]
final_df['report_indicator'] = 1
print(final_df.shape)


(4654, 25)
(2852, 26)


## Assign Star rating

In [11]:
def adjust_edge_cases(row, centers, col):
    """
    Adjust cluster labels for edge cases, accounting for cases where the distance to the assigned cluster is 0.0.
    
    Args:
    - row: The row of the DataFrame being processed.
    - centers: Array of cluster centers.
    - col: The column name of the measure used for clustering.
    
    Returns:
    - Adjusted cluster label.
    """
    distances = np.abs(centers - row[col])
    closest, second_closest = np.partition(distances, 1)[:2]
    # Check if the closest distance is 0.0; if so, do not adjust the label
    if closest == 0:
        return row['cluster'] + 1

    # If the closest and second closest centers are very close, consider adjusting the label
    if np.isclose(closest, second_closest, atol=0.0001):  # 'atol' might need adjustment
        return row['cluster'] + 1  # Increment the cluster label for edge cases
    else:
        return row['cluster']


def kmeans_clustering(df, n_clusters=5, col='summary_score'):
    # Step 1: Initial Data Preparation - Determine quintile medians as initial seeds
    quintiles = np.percentile(df[col].dropna(),
                              [20.0, 40.0, 60.0, 80.0],
                              method='interpolated_inverted_cdf', # this is good
                              #method='linear',
                             )
    df['grp'] = pd.cut(df[col], bins=[-np.inf] + quintiles.tolist() + [np.inf], labels=False) + 1

    # Step 2: Initial K-Means Clustering - Compute median for initial seeds
    initial_seeds = df.groupby('grp')[col].median().dropna().values.reshape(-1, 1)

    kmeans_initial = KMeans(n_clusters=len(initial_seeds), 
                            init=initial_seeds, 
                            n_init=100, 
                            max_iter=1000, 
                            random_state=0,
                            tol=0.000001,
                            #algorithm='auto',
                           )
    kmeans_initial.fit(df[[col]].dropna())

    # Use cluster centers from initial k-means as seeds for the main clustering
    main_seeds = kmeans_initial.cluster_centers_

    # Step 3: Second K-Means Clustering - Main clustering with refined seeds
    kmeans_main = KMeans(n_clusters=n_clusters, 
                         init=main_seeds, 
                         n_init=100, 
                         max_iter=1000, 
                         random_state=0,
                         tol=0.000001,
                         #algorithm='auto',
                        )
    df['cluster'] = kmeans_main.fit_predict(df[[col]].dropna())

    # Post-clustering adjustment for edge cases
    centers = kmeans_main.cluster_centers_.flatten()
    df['cluster'] = df.apply(adjust_edge_cases, centers=centers, col=col, axis=1)

    # Step 4: Cluster Ordering and Labeling - Order clusters and assign 'star' ratings
    cluster_means = df.groupby('cluster')[col].mean().sort_values().index
    cluster_mapping = {old: new for new, old in enumerate(cluster_means, 1)}

    df['star'] = df['cluster'].map(cluster_mapping)
    df.drop('cluster', axis=1, inplace=True)

    return df


# Usage
# Assuming 'final_df' is your DataFrame and it includes a column named 'summary_score'
#final_df, cluster_centers = kmeans_clustering(final_df, n_clusters=5, col='summary_score')

dfg3 = final_df[final_df['cnt_grp'] == '1) # of groups=3']
dfg3 = kmeans_clustering(dfg3)
dfg4 = final_df[final_df['cnt_grp'] == '2) # of groups=4']
dfg4 = kmeans_clustering(dfg4)
dfg5 = final_df[final_df['cnt_grp'] == '3) # of groups=5']
dfg5 = kmeans_clustering(dfg5)

complete_df = pd.concat([dfg3, dfg4, dfg5])
complete_df.sort_values(by=['PROVIDER_ID'], ascending=True, inplace=True)

In [12]:
complete_df = complete_df.filter(items=list(valid_df))
print(complete_df.shape)
complete_df.head()

(2852, 27)


Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Std_Outcomes_Readmission_score,Std_Outcomes_Safety_score,Std_PatientExp_score,Std_Process_score,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process,weight_PatientExperience,weight_Outcomes_Readmission,weight_Outcomes_Mortality,weight_Outcomes_Safety,weight_Process,summary_score,Outcomes_Mortality_cnt,Outcomes_safety_cnt,Outcomes_Readmission_cnt,Patient_Experience_cnt,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,report_indicator,cnt_grp,star
0,10001,0.007134504,0.2312259238,-0.1455204533,0.1268857714,-0.6803237137,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.0332991815,7,7,11,8,10,5,2,1,3) # of groups=5,3
1,10005,-1.4410280005,0.7203393325,-0.0871220551,-0.2646769751,-0.492389631,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.2950340493,6,7,9,8,12,5,2,1,3) # of groups=5,2
2,10006,-1.4630064187,-0.2686150368,-0.1723984657,-1.1040696602,-0.7577044641,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.7527042436,7,7,9,8,11,5,2,1,3) # of groups=5,1
3,10007,-3.5287099063,-0.4701333217,0.3939170917,0.1891476221,-1.806727088,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.9682785237,3,2,7,8,7,4,1,1,2) # of groups=4,1
5,10011,-0.4769086181,-0.4015014882,0.3462801505,-0.0406226387,-1.0523185914,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.2522838018,7,7,9,8,8,5,2,1,3) # of groups=5,3


In [13]:
valid_df.head()

Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Std_Outcomes_Readmission_score,Std_Outcomes_Safety_score,Std_PatientExp_score,Std_Process_score,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process,weight_PatientExperience,weight_Outcomes_Readmission,weight_Outcomes_Mortality,weight_Outcomes_Safety,weight_Process,summary_score,Outcomes_Mortality_cnt,Outcomes_safety_cnt,Outcomes_Readmission_cnt,Patient_Experience_cnt,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,report_indicator,cnt_grp,star
0,10001,0.007134504,0.2312259238,-0.145520453,0.1268857714,-0.682878658,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.033605775,7.0,7.0,11.0,8.0,10.0,5.0,2.0,1.0,3) # of groups=5,3.0
1,10005,-1.441028,0.7203393325,-0.087122055,-0.264676975,-0.581469391,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.305723621,6.0,7.0,9.0,8.0,11.0,5.0,2.0,1.0,3) # of groups=5,2.0
2,10006,-1.463006419,-0.268615037,-0.172398466,-1.10406966,-0.76034002,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.75302051,7.0,7.0,9.0,8.0,11.0,5.0,2.0,1.0,3) # of groups=5,1.0
3,10007,-3.528709906,-0.470133322,0.3939170917,0.1891476221,-1.81045547,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.96872593,3.0,2.0,7.0,8.0,7.0,4.0,1.0,1.0,2) # of groups=4,1.0
5,10011,-0.476908618,-0.401501488,0.3462801505,-0.040622639,-1.055261064,0.22,0.22,0.22,0.22,0.12,0.22,0.22,0.22,0.22,0.12,-0.252636898,7.0,7.0,9.0,8.0,8.0,5.0,2.0,1.0,3) # of groups=5,3.0


In [14]:
complete_df['PROVIDER_ID'] = complete_df['PROVIDER_ID'].astype(str)
valid_df['PROVIDER_ID'] = valid_df['PROVIDER_ID'].astype(str)

complete_df['star'] = complete_df['star'].astype(float)
valid_df['star'] = valid_df['star'].astype(float)

In [15]:
cols = ['PROVIDER_ID',
        'Outcomes_Mortality_cnt',
       'Outcomes_safety_cnt',
       'Outcomes_Readmission_cnt',
       'Patient_Experience_cnt',
       'Process_cnt',
       'Total_measure_group_cnt',
       'MortSafe_Group_cnt',
       'report_indicator',
       'cnt_grp',
       'star']


for col in cols:
    
    ct = 0
    ls1 = complete_df[col].tolist()
    ls2 = valid_df[col].tolist()

    for i, v1 in enumerate(ls1):
        v2 = ls2[i]
        if v1 != v2:
            ct += 1
            print(v1, v2)
            #break
            
    print(col, ct, 'incorrect')        
    

PROVIDER_ID 0 incorrect
Outcomes_Mortality_cnt 0 incorrect
Outcomes_safety_cnt 0 incorrect
Outcomes_Readmission_cnt 0 incorrect
Patient_Experience_cnt 0 incorrect
12 11.0
11 10.0
11 10.0
12 11.0
10 9.0
11 10.0
8 7.0
12 11.0
8 7.0
10 9.0
10 9.0
10 9.0
10 9.0
8 7.0
9 8.0
11 10.0
10 9.0
11 10.0
11 10.0
11 10.0
10 9.0
10 9.0
12 11.0
9 8.0
10 9.0
10 9.0
8 7.0
8 7.0
8 7.0
10 9.0
9 8.0
9 8.0
8 7.0
11 10.0
11 10.0
11 10.0
11 10.0
8 7.0
9 8.0
9 8.0
7 6.0
6 5.0
8 7.0
10 9.0
10 9.0
9 8.0
9 8.0
10 9.0
9 8.0
8 7.0
10 9.0
9 8.0
10 9.0
11 10.0
10 9.0
10 9.0
8 7.0
11 10.0
11 10.0
11 10.0
11 10.0
12 11.0
10 9.0
11 10.0
9 8.0
11 10.0
10 9.0
11 10.0
10 9.0
5 4.0
6 5.0
11 10.0
12 11.0
9 8.0
11 10.0
11 10.0
11 10.0
11 10.0
10 9.0
10 9.0
10 9.0
10 9.0
11 10.0
9 8.0
11 10.0
9 8.0
8 7.0
9 8.0
11 10.0
8 7.0
10 9.0
11 10.0
10 9.0
10 9.0
8 7.0
10 9.0
10 9.0
8 7.0
10 9.0
10 9.0
12 11.0
11 10.0
10 9.0
10 9.0
9 8.0
9 8.0
11 10.0
8 7.0
11 10.0
10 9.0
12 11.0
9 8.0
9 8.0
9 8.0
10 9.0
12 11.0
12 11.0
12 11.0
10 9.0
12

In [15]:
print(list(valid_df))

['PROVIDER_ID', 'Std_Outcomes_Mortality_score', 'Std_Outcomes_Readmission_score', 'Std_Outcomes_Safety_score', 'Std_PatientExp_score', 'Std_Process_score', 'std_weight_PatientExperience', 'std_weight_Readmission', 'std_weight_Mortality', 'std_weight_safety', 'std_weight_Process', 'weight_PatientExperience', 'weight_Outcomes_Readmission', 'weight_Outcomes_Mortality', 'weight_Outcomes_Safety', 'weight_Process', 'summary_score', 'Outcomes_Mortality_cnt', 'Outcomes_safety_cnt', 'Outcomes_Readmission_cnt', 'Patient_Experience_cnt', 'Process_cnt', 'Total_measure_group_cnt', 'MortSafe_Group_cnt', 'report_indicator', 'cnt_grp', 'star']


In [101]:
cols = ['Std_Outcomes_Mortality_score', 'Std_Outcomes_Readmission_score', 
        'Std_Outcomes_Safety_score', 'Std_PatientExp_score', 
        'Std_Process_score', 'std_weight_PatientExperience', 
        'std_weight_Readmission', 'std_weight_Mortality', 
        'std_weight_safety', 'std_weight_Process', 
        'weight_PatientExperience', 'weight_Outcomes_Readmission', 
        'weight_Outcomes_Mortality', 'weight_Outcomes_Safety', 
        'weight_Process', 'summary_score', 
        'Outcomes_Mortality_cnt', 'Outcomes_safety_cnt', 
        'Outcomes_Readmission_cnt', 'Patient_Experience_cnt', 
        'Process_cnt', 'Total_measure_group_cnt', 
        'MortSafe_Group_cnt', 'report_indicator',
       ]

for col in cols:
    diffs = []
    ls1 = complete_df[col].astype(float).tolist()
    ls2 = valid_df[col].astype(float).tolist()
    for i, v1 in enumerate(ls1):
        v2 = ls2[i]
        if np.abs(v1 - v2) > 0.1:
            print(col, v1, v2)
            break
        diffs.append(np.abs(v2-v1))
    
    print(col, "{:.20f}".format(np.nanmax(diffs)))



Std_Outcomes_Mortality_score 0.00000000049950985015
Std_Outcomes_Readmission_score 0.00000000049929749224
Std_Outcomes_Safety_score 0.00000000049980419803
Std_PatientExp_score 0.00000000049997994633
Std_Process_score 0.00000000049958837067
std_weight_PatientExperience 0.00000000000000000000
std_weight_Readmission 0.00000000000000000000
std_weight_Mortality 0.00000000000000000000
std_weight_safety 0.00000000000000000000
std_weight_Process 0.00000000000000000000
weight_PatientExperience 0.00000000004871797410
weight_Outcomes_Readmission 0.00000000004871797410
weight_Outcomes_Mortality 0.00000000004871797410
weight_Outcomes_Safety 0.00000000004871797410
weight_Process 0.00000000004615383076
summary_score 0.00000000049959825166
Outcomes_Mortality_cnt 0.00000000000000000000
Outcomes_safety_cnt 0.00000000000000000000
Outcomes_Readmission_cnt 0.00000000000000000000
Patient_Experience_cnt 0.00000000000000000000
Process_cnt 0.00000000000000000000
Total_measure_group_cnt 0.00000000000000000000
M

In [102]:
valid_df.tail()

Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Std_Outcomes_Readmission_score,Std_Outcomes_Safety_score,Std_PatientExp_score,Std_Process_score,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process,weight_PatientExperience,weight_Outcomes_Readmission,weight_Outcomes_Mortality,weight_Outcomes_Safety,weight_Process,summary_score,Outcomes_Mortality_cnt,Outcomes_safety_cnt,Outcomes_Readmission_cnt,Patient_Experience_cnt,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,report_indicator,cnt_grp,star
4639,51004666666,1.8610573243,-0.858101528,0.6444312528,1.410301145,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,,0.7644220485,4.0,3.0,5.0,8.0,0.0,4.0,2.0,1.0,2) # of groups=4,5.0
4640,51005666666,1.502997272,-0.015131233,1.3186191733,1.2596289457,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,,1.0165285395,3.0,2.0,5.0,8.0,0.0,3.0,1.0,1.0,1) # of groups=3,5.0
4642,52003666666,0.5200712264,-2.187218487,0.0436962201,1.6370413385,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,,0.0033975745,5.0,4.0,6.0,8.0,0.0,4.0,2.0,1.0,2) # of groups=4,3.0
4643,52004666666,1.3341979303,-2.41225773,0.2143934324,1.8642674758,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,,0.2501502771,5.0,4.0,6.0,8.0,0.0,4.0,2.0,1.0,2) # of groups=4,3.0
4644,53004666666,1.157521277,1.7403164942,-0.678871406,1.6400791776,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,,0.9647613856,3.0,1.0,4.0,8.0,0.0,3.0,1.0,1.0,1) # of groups=3,5.0


In [103]:
complete_df.tail()

Unnamed: 0,PROVIDER_ID,Std_Outcomes_Mortality_score,Std_Outcomes_Readmission_score,Std_Outcomes_Safety_score,Std_PatientExp_score,Std_Process_score,std_weight_PatientExperience,std_weight_Readmission,std_weight_Mortality,std_weight_safety,std_weight_Process,weight_PatientExperience,weight_Outcomes_Readmission,weight_Outcomes_Mortality,weight_Outcomes_Safety,weight_Process,summary_score,Outcomes_Mortality_cnt,Outcomes_safety_cnt,Outcomes_Readmission_cnt,Patient_Experience_cnt,Process_cnt,Total_measure_group_cnt,MortSafe_Group_cnt,report_indicator,cnt_grp,star
4805,51004666666,1.8610573243,-0.8581015282,0.6444312528,1.410301145,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,0.0,0.7644220485,4,3,5,8,0,4,2,1,2) # of groups=4,5.0
4806,51005666666,1.502997272,-0.0151312332,1.3186191733,1.2596289457,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,0.0,1.0165285395,3,2,5,8,0,3,1,1,1) # of groups=3,5.0
4808,52003666666,0.5200712264,-2.187218487,0.0436962201,1.6370413385,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,0.0,0.0033975745,5,4,6,8,0,4,2,1,2) # of groups=4,3.0
4809,52004666666,1.3341979303,-2.4122577299,0.2143934324,1.8642674758,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,0.0,0.2501502771,5,4,6,8,0,4,2,1,2) # of groups=4,3.0
4810,53004666666,1.157521277,1.7403164942,-0.6788714062,1.6400791776,,0.22,0.22,0.22,0.22,0.12,0.25,0.25,0.25,0.25,0.0,0.9647613856,3,1,4,8,0,3,1,1,1) # of groups=3,5.0
