In [5]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import warnings
from scipy import stats
from scipy.stats import percentileofscore
from IPython.utils import io

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

pd.set_option("display.precision", 10)
stars_dir = '~/GitHub/stars-data-builder/'


## Import data

In [6]:
def _decode_if_bytes(value):
    """Return `value` decoded to `str` when it is a `bytes` object, else unchanged."""
    return value.decode() if isinstance(value, bytes) else value


# Read the SAS release file; anything the reader emits is captured rather
# than shown in the cell output.
with io.capture_output() as captured:
    raw_data = pd.read_sas(stars_dir + '2024/2024-07 Stars Release/alldata_2024jul.sas7bdat')

# Character values arrive as bytes. Only object-dtype columns can hold bytes,
# so decode just those column by column instead of running a Python function
# over every cell of the frame (and instead of the deprecated
# `DataFrame.applymap`).
for col in raw_data.select_dtypes(include='object'):
    raw_data[col] = raw_data[col].map(_decode_if_bytes)

# Drop rows that carry no values at all.
raw_data = raw_data.dropna(how='all', axis=0)

# Every column except the provider identifier is cast to float in one step.
numeric_cols = [c for c in raw_data.columns if c != 'PROVIDER_ID']
raw_data = raw_data.astype(dict.fromkeys(numeric_cols, float))

print(raw_data.shape)
raw_data.head()

(4658, 95)


Unnamed: 0,PROVIDER_ID,HAI_1_DEN_VOL,HAI_2_DEN_VOL,HAI_3_DEN_VOL,HAI_4_DEN_VOL,HAI_5_DEN_VOL,HAI_6_DEN_VOL,HAI_1_DEN_PRED,HAI_2_DEN_PRED,HAI_3_DEN_PRED,HAI_4_DEN_PRED,HAI_5_DEN_PRED,HAI_6_DEN_PRED,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,READM_30_HOSP_WIDE,READM_30_HIP_KNEE,EDAC_30_HF,READM_30_COPD,EDAC_30_AMI,EDAC_30_PN,MORT_30_STK,MORT_30_PN,MORT_30_HF,MORT_30_COPD,MORT_30_AMI,COMP_HIP_KNEE,READM_30_HOSP_WIDE_DEN,READM_30_HIP_KNEE_DEN,EDAC_30_HF_DEN,READM_30_COPD_DEN,EDAC_30_AMI_DEN,EDAC_30_PN_DEN,MORT_30_STK_DEN,MORT_30_PN_DEN,MORT_30_HF_DEN,MORT_30_COPD_DEN,MORT_30_AMI_DEN,COMP_HIP_KNEE_DEN,OP_2,OP_2_DEN,OP_3B,OP_3B_DEN,OP_8,OP_8_DEN,OP_10,OP_10_DEN,OP_13,OP_13_DEN,OP_18B,OP_18B_DEN,OP_22,OP_22_DEN,OP_23,OP_23_DEN,OP_29,OP_29_DEN,PSI_4_SURG_COMP,PSI_4_SURG_COMP_DEN,PSI_90_SAFETY,IMM_3_DEN,IMM_3,HCP_COVID_19_DEN,HCP_COVID_19,PC_01,PC_01_DEN,SEP_1,SEP_1_DEN,H_RESP_RATE_P,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,H_NUMB_COMP,PSI_90_SAFETY_DEN,MORT_30_CABG,MORT_30_CABG_DEN,READM_30_CABG,READM_30_CABG_DEN,OP_32,OP_32_DEN,OP_35_ADM,OP_35_ADM_DEN,OP_35_ED,OP_35_ED_DEN,OP_36,OP_36_DEN
0,10001,9149.0,17310.0,214.0,,104733.0,104733.0,9.597,24.766,5.994,,11.4,67.066,0.938,0.363,1.335,,0.965,0.507,0.142,0.038,23.4,0.19,-15.4,23.6,0.148,0.18,0.089,0.088,0.12,0.027,2912.0,49.0,614.0,117.0,274.0,403.0,398.0,400.0,549.0,107.0,278.0,49.0,,,,,0.38,79.0,0.061,1410.0,0.028,178.0,214.0,348.0,0.05,52960.0,,,0.47,17.0,184.68,134.0,1.21,3905.0,0.95,2496.0,0.836,0.0,32.0,0.65,127.0,15.0,3.0,3.0,3.0,3.0,4.0,4.0,3.5,3.5,544.0,2542.0385003778,0.041,132.0,0.105,126.0,12.9,170.0,11.9,202.0,4.9,202.0,1.1,668.0
1,10005,3194.0,8277.0,96.0,,36794.0,34887.0,1.989,4.019,2.626,,1.847,10.066,2.514,0.995,0.762,,0.541,0.497,0.128,0.034,22.1,0.166,,-6.2,0.153,0.233,0.149,0.099,0.136,0.023,1052.0,172.0,129.0,136.0,,285.0,81.0,289.0,121.0,126.0,27.0,155.0,,,57.0,16.0,0.477,130.0,0.12,1057.0,0.042,189.0,145.0,1074.0,0.03,56820.0,0.58,12.0,0.96,180.0,183.49,43.0,0.97,2700.0,0.8,2552.0,0.807,0.02,200.0,0.69,252.0,18.0,3.0,4.0,1.0,3.0,4.0,3.0,3.0,3.0,824.0,978.028993786,,,,,14.2,739.0,7.9,107.0,5.5,107.0,1.9,406.0
2,10006,5343.0,8715.0,111.0,,63727.0,60304.0,5.801,11.166,2.95,,5.283,27.805,0.172,0.358,0.0,,1.514,0.072,0.134,0.053,-4.7,0.176,28.1,-0.4,0.172,0.195,0.125,0.099,0.165,0.046,2310.0,138.0,441.0,158.0,273.0,472.0,227.0,469.0,388.0,148.0,254.0,145.0,,,,,0.462,39.0,0.101,978.0,0.045,221.0,168.0,360.0,0.01,42286.0,0.75,16.0,0.85,82.0,173.63,96.0,1.17,2536.0,0.67,1882.0,0.796,0.04,28.0,0.57,126.0,19.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,2.5,1503.0,1753.9798989748,0.036,95.0,0.124,89.0,12.1,1355.0,,,,,1.4,484.0
3,10007,,,,,,5511.0,,,,,,2.66,,,,,,0.376,0.157,0.042,-1.9,0.2,,-9.4,,0.285,0.125,0.137,,,258.0,26.0,31.0,34.0,,72.0,,88.0,26.0,34.0,,,,,,,,,0.034,146.0,,,132.0,1275.0,0.04,11202.0,,,0.23,111.0,,,0.95,350.0,0.53,252.0,0.601,,,0.93,43.0,24.0,3.0,5.0,4.0,3.0,3.0,3.0,3.5,3.0,189.0,228.2861925107,,,,,13.4,109.0,,,,,1.2,59.0
4,10008,,,,,,,,,,,,,,,,,,,0.148,,,,,,,,,,,,69.0,,,,,,,,,,,,,,,,,,0.0,85.0,,,116.0,340.0,0.0,6239.0,,,0.67,24.0,,,,126.0,0.45,163.0,0.797,,,,,,,,,,,,,,,,,,,,12.9,42.0,,,,,,


## Filter data

In [7]:
# Define the measures you're interested in
measures = ['MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF',
            'MORT_30_PN', 'MORT_30_STK', 'PSI_4_SURG_COMP', 'COMP_HIP_KNEE',
            'HAI_1', 'HAI_2', 'HAI_3', 'HAI_4', 'HAI_5', 'HAI_6', 
            'PSI_90_SAFETY', 'EDAC_30_AMI', 'EDAC_30_HF',
            'EDAC_30_PN', 'OP_32', 'READM_30_CABG', 'READM_30_COPD',
            'READM_30_HIP_KNEE', 'READM_30_HOSP_WIDE', 'OP_35_ADM', 
            'OP_35_ED', 'OP_36', 'H_COMP_1_STAR_RATING', 'H_COMP_2_STAR_RATING', 
            'H_COMP_3_STAR_RATING', 'H_COMP_5_STAR_RATING', 
            'H_COMP_6_STAR_RATING', 'H_COMP_7_STAR_RATING', 
            'H_GLOB_STAR_RATING', 'H_INDI_STAR_RATING', 'HCP_COVID_19', 
            'IMM_3', 'OP_10', 'OP_13', 'OP_18B', 'OP_2', 'OP_22',
            'OP_23', 'OP_29', 'OP_3B', 'OP_8', 'PC_01', 'SEP_1',
           ]

# A measure column is kept only if it has at least this many non-missing values.
MIN_VALUES_PER_MEASURE = 101


def _format_provider_id(provider_id):
    """Return a provider ID as a string left-padded with zeros to 6 characters.

    A trailing '666666' (the suffix imputed for VHA hospitals) is replaced by
    the original 'F' suffix before padding.
    """
    pid = str(provider_id)
    # Only a *trailing* '666666' is the imputed suffix. Testing with `in`
    # would also match IDs that merely contain that run of digits and then
    # chop off six unrelated characters.
    if pid.endswith('666666'):
        pid = pid[:-6] + 'F'
    return pid.rjust(6, '0')


print(len(measures), 'measures')

# `raw_data` is narrowed to the measure columns just below, so when this cell
# is re-run PROVIDER_ID is no longer in it; in that case the IDs captured on
# the first run are reused instead of raising a KeyError.
if 'PROVIDER_ID' in raw_data.columns:
    prvdrs = raw_data['PROVIDER_ID']
raw_data = raw_data.filter(items=measures)

# Drop measures with too few reported values.
filtered_data = raw_data.dropna(axis=1, thresh=MIN_VALUES_PER_MEASURE)
filtered_measures = list(filtered_data)

excluded = [item for item in measures if item not in filtered_measures]
print('Excluded measure(s):', excluded)

# Drop rows with no value for any remaining measure. A new frame is built
# rather than mutating in place, which avoids chained-assignment problems on
# a frame derived from `raw_data`.
filtered_data = filtered_data.dropna(how='all', subset=filtered_measures, axis=0)

print('Shape of filtered dataframe:', filtered_data.shape)
print('Final no. of measures:', filtered_data.shape[1])

## Replace the imputed 666666 suffixes of VHA hospitals with their original 'F' suffix
# IDs are matched to the surviving rows by index label.
prvdrs1 = [_format_provider_id(p) for p in prvdrs.reindex(filtered_data.index)]

# Attach the cleaned IDs and make PROVIDER_ID the first column.
filtered_data = filtered_data.assign(PROVIDER_ID=prvdrs1)[['PROVIDER_ID'] + filtered_measures]

filtered_data.to_pickle(stars_dir + 'FilesForApp/data_for_whatifs.pkl')
filtered_data.head()

47 measures
Excluded measure(s): ['OP_2']
Shape of filtered dataframe: (4626, 46)
Final no. of measures: 46


Unnamed: 0,PROVIDER_ID,MORT_30_AMI,MORT_30_CABG,MORT_30_COPD,MORT_30_HF,MORT_30_PN,MORT_30_STK,PSI_4_SURG_COMP,COMP_HIP_KNEE,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,PSI_90_SAFETY,EDAC_30_AMI,EDAC_30_HF,EDAC_30_PN,OP_32,READM_30_CABG,READM_30_COPD,READM_30_HIP_KNEE,READM_30_HOSP_WIDE,OP_35_ADM,OP_35_ED,OP_36,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,HCP_COVID_19,IMM_3,OP_10,OP_13,OP_18B,OP_22,OP_23,OP_29,OP_3B,OP_8,PC_01,SEP_1
0,10001,0.12,0.041,0.088,0.089,0.18,0.148,184.68,0.027,0.938,0.363,1.335,,0.965,0.507,1.21,-15.4,23.4,23.6,12.9,0.105,0.19,0.038,0.142,11.9,4.9,1.1,3.0,3.0,3.0,3.0,4.0,4.0,3.5,3.5,0.836,0.95,0.061,0.028,214.0,0.05,,0.47,,0.38,0.0,0.65
1,10005,0.136,,0.099,0.149,0.233,0.153,183.49,0.023,2.514,0.995,0.762,,0.541,0.497,0.97,,22.1,-6.2,14.2,,0.166,0.034,0.128,7.9,5.5,1.9,3.0,4.0,1.0,3.0,4.0,3.0,3.0,3.0,0.807,0.8,0.12,0.042,145.0,0.03,0.58,0.96,57.0,0.477,0.02,0.69
2,10006,0.165,0.036,0.099,0.125,0.195,0.172,173.63,0.046,0.172,0.358,0.0,,1.514,0.072,1.17,28.1,-4.7,-0.4,12.1,0.124,0.176,0.053,0.134,,,1.4,2.0,3.0,2.0,2.0,3.0,2.0,2.0,2.5,0.796,0.67,0.101,0.045,168.0,0.01,0.75,0.85,,0.462,0.04,0.57
3,10007,,,0.137,0.125,0.285,,,,,,,,,0.376,0.95,,-1.9,-9.4,13.4,,0.2,0.042,0.157,,,1.2,3.0,5.0,4.0,3.0,3.0,3.0,3.5,3.0,0.601,0.53,0.034,,132.0,0.04,,0.23,,,,0.93
4,10008,,,,,,,,,,,,,,,,,,,12.9,,,,0.148,,,,,,,,,,,,0.797,0.45,0.0,,116.0,0.0,,0.67,,,,


In [8]:
# Show the last five rows as a check on the end of the filtered table.
filtered_data.tail(n=5)

Unnamed: 0,PROVIDER_ID,MORT_30_AMI,MORT_30_CABG,MORT_30_COPD,MORT_30_HF,MORT_30_PN,MORT_30_STK,PSI_4_SURG_COMP,COMP_HIP_KNEE,HAI_1,HAI_2,HAI_3,HAI_4,HAI_5,HAI_6,PSI_90_SAFETY,EDAC_30_AMI,EDAC_30_HF,EDAC_30_PN,OP_32,READM_30_CABG,READM_30_COPD,READM_30_HIP_KNEE,READM_30_HOSP_WIDE,OP_35_ADM,OP_35_ED,OP_36,H_COMP_1_STAR_RATING,H_COMP_2_STAR_RATING,H_COMP_3_STAR_RATING,H_COMP_5_STAR_RATING,H_COMP_6_STAR_RATING,H_COMP_7_STAR_RATING,H_GLOB_STAR_RATING,H_INDI_STAR_RATING,HCP_COVID_19,IMM_3,OP_10,OP_13,OP_18B,OP_22,OP_23,OP_29,OP_3B,OP_8,PC_01,SEP_1
4653,670300,,,,,,,,,,,1.552,,,0.239,0.95,,-7.4,,13.5,,,,0.142,,,,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,0.947,0.95,0.027,,144.0,0.01,,0.83,,,,0.66
4654,670309,,,,,,,,,,,0.0,,,0.344,0.98,,,,12.9,,,,0.148,,,1.3,2.0,2.0,3.0,2.0,3.0,3.0,3.5,4.0,0.814,0.79,0.016,,154.0,0.01,,0.97,,,0.04,0.9
4655,670310,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.996,0.86,,,58.0,0.03,,,,,,
4656,670314,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.93,0.94,0.021,,99.0,0.02,,,,,,
4657,670319,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.508,,,,,,,,,,,
