## Loading the data

In [1]:
import pandas as pd

RAW_PATH = "../data/raw/hvbp_tps.csv"
df = pd.read_csv(RAW_PATH)

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2489 entries, 0 to 2488
Data columns (total 17 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   Fiscal Year                                                       2489 non-null   int64  
 1   Facility ID                                                       2489 non-null   int64  
 2   Facility Name                                                     2489 non-null   object 
 3   Address                                                           2489 non-null   object 
 4   City/Town                                                         2489 non-null   object 
 5   State                                                             2489 non-null   object 
 6   ZIP Code                                                          2489 non-null   int64  
 7   County/Parish                    

## checking the Columns 

In [2]:
obj_cols = [
    "Unweighted Normalized Clinical Outcomes Domain Score",
    "Weighted Normalized Clinical Outcomes Domain Score",
    "Unweighted Person And Community Engagement Domain Score",
    "Weighted Person And Community Engagement Domain Score",
    "Unweighted Normalized Safety Domain Score",
    "Weighted Safety Domain Score",
]

for col in obj_cols:
    print(f"\n{col} – unique samples:")
    print(df[col].unique()[:15])   # first 15 distinct values 



Unweighted Normalized Clinical Outcomes Domain Score – unique samples:
['0.000000000000' '2.500000000000' '10.000000000000' '15.000000000000'
 '27.500000000000' '13.333333333333' '22.500000000000' '31.666666666667'
 '6.666666666667' '33.333333333333' '26.666666666667' '14.000000000000'
 '1.666666666667' 'Not Available' '20.000000000000']

Weighted Normalized Clinical Outcomes Domain Score – unique samples:
['0.000000000000' '0.833333333333' '2.500000000000' '3.750000000000'
 '6.875000000000' '3.333333333333' '5.625000000000' '7.916666666667'
 '0.625000000000' '1.666666666667' '8.333333333333' '6.666666666667'
 '3.500000000000' '0.416666666667' 'Not Available']

Unweighted Person And Community Engagement Domain Score – unique samples:
['38.000000000000' '53.000000000000' '12.000000000000' '35.000000000000'
 '11.000000000000' '17.000000000000' '13.000000000000' '21.000000000000'
 '15.000000000000' '5.000000000000' '25.000000000000' '8.000000000000'
 '7.000000000000' '20.000000000000' '1

## Changing the columns with Object dtype to Float by changing the "Not Available" entries to Na

In [3]:
df[obj_cols] = (
    df[obj_cols]
    .replace({"Not Available": pd.NA, "": pd.NA, " ": pd.NA})
    .apply(pd.to_numeric, errors="coerce")       
)


print(df[obj_cols].dtypes)       
print(df[obj_cols].isna().sum())  


Unweighted Normalized Clinical Outcomes Domain Score       float64
Weighted Normalized Clinical Outcomes Domain Score         float64
Unweighted Person And Community Engagement Domain Score    float64
Weighted Person And Community Engagement Domain Score      float64
Unweighted Normalized Safety Domain Score                  float64
Weighted Safety Domain Score                               float64
dtype: object
Unweighted Normalized Clinical Outcomes Domain Score        44
Weighted Normalized Clinical Outcomes Domain Score          44
Unweighted Person And Community Engagement Domain Score      4
Weighted Person And Community Engagement Domain Score        4
Unweighted Normalized Safety Domain Score                  408
Weighted Safety Domain Score                               408
dtype: int64


## Rename the columns

In [5]:
rename_map = {
    "Weighted Normalized Clinical Outcomes Domain Score":  "wt_clinical",
    "Weighted Person And Community Engagement Domain Score": "wt_pce",
    "Weighted Safety Domain Score":                         "wt_safety",
    "Weighted Efficiency And Cost Reduction Domain Score":  "wt_efficiency",
    "Total Performance Score":                              "tps",
}
df = df.rename(columns=rename_map)


In [6]:

cols_keep = ["Facility ID", "Facility Name", "State", "County/Parish"] + list(rename_map.values())
df = df[cols_keep]

In [7]:
df.head()

Unnamed: 0,Facility ID,Facility Name,State,County/Parish,wt_clinical,wt_pce,wt_safety,wt_efficiency,tps
0,190044,ACADIA GENERAL HOSPITAL,LA,ACADIA,0.0,9.5,8.125,2.5,20.125
1,490037,RIVERSIDE SHORE MEMORIAL HOSPITAL,VA,ACCOMACK,0.833333,17.666667,,16.666667,35.166667
2,130007,SAINT ALPHONSUS REGIONAL MEDICAL CENTER,ID,ADA,2.5,3.0,7.5,2.5,15.5
3,130006,ST LUKE'S REGIONAL MEDICAL CENTER,ID,ADA,0.0,8.75,4.5,7.5,20.75
4,260022,NORTHEAST REGIONAL MEDICAL CENTER,MO,ADAIR,3.75,2.75,16.25,12.5,35.25


In [8]:
out_path = "../data/processed/hvbp_2025_clean.parquet"
df.to_parquet(out_path, index=False)

In [9]:
PROC_CSV = "../data/processed/hvbp_2025_clean.csv"
df.to_csv(PROC_CSV, index=False)

## Data Quality check to know if the sum of the columns add up to the TPS column

In [10]:
WEIGHTED_COLS = [
    "wt_clinical",
    "wt_pce",
    "wt_safety",
    "wt_efficiency",
]

df["weighted_sum"] = df[WEIGHTED_COLS].sum(axis=1)


mismatch = df[~(df["weighted_sum"].sub(df["tps"]).abs() < 0.01)]

print(f"Rows failing the sum-check: {len(mismatch)}")


Rows failing the sum-check: 0


In [11]:
df.isna().sum().sort_values(ascending=False).head(10)

wt_safety        408
wt_clinical       44
wt_pce             4
Facility ID        0
Facility Name      0
County/Parish      0
State              0
wt_efficiency      0
tps                0
weighted_sum       0
dtype: int64