In [126]:
def preprocess_func(features_path='combined_features.csv',
                    clusters_path='clusters_only_using_NormCost.csv',
                    mcare_count_path='priv_mcare_f_pay_2022Oct18.csv'):
    """Load, join and lightly clean the modelling dataset.

    Steps performed:
      1. Read the combined feature table, the per-group cluster labels and
         the Medicare count table from CSV.
      2. Left-join the cluster labels onto the features by ``group``.
      3. Left-join ``mcare_count`` by ``(msa, year, site, group)``.
      4. Drop columns considered redundant with other features.
      5. Encode ``site`` as 1 for ``'Inpatient'`` and 0 for anything else.

    This function does NOT perform:
      * target encoding for ``CBSA_NAME``
      * k-NN imputation

    Parameters
    ----------
    features_path : str or path-like, default 'combined_features.csv'
        CSV holding the combined features.
    clusters_path : str or path-like, default 'clusters_only_using_NormCost.csv'
        CSV holding the cluster assignment; must contain a ``group`` column.
    mcare_count_path : str or path-like, default 'priv_mcare_f_pay_2022Oct18.csv'
        CSV that must contain ``msa``, ``year``, ``site``, ``group`` and
        ``mcare_count``; any other columns in it are ignored.

    Returns
    -------
    pandas.DataFrame
        The joined frame without the dropped columns and with ``site``
        encoded as an integer 0/1 column. ``year`` and ``group`` are kept
        because they are needed for further experimentation.

    Raises
    ------
    KeyError
        If a column that is selected, joined on or dropped is missing from
        the corresponding input file.
    """
    # Kept as a function-local import so the cell is self-contained.
    import pandas as pd

    # Keys shared by the feature table and the Medicare count table.
    mcare_keys = ['msa', 'year', 'site', 'group']

    # Columns that duplicate information carried by other features.
    redundant_cols = ['msa', 'FIPS.State.Code', 'poverty_rate', 'emp', 'ap', 'State',
                      'priv_pay_mean', 'mcare_pay_mean', 'mcare_pay_sd', 'priv_pay_iqr']

    # Importing the datasets with combined features, clusters, and mcare_count
    df_raw = pd.read_csv(features_path)
    df_clusters = pd.read_csv(clusters_path)
    df_mcare_count = pd.read_csv(mcare_count_path)[mcare_keys + ['mcare_count']]

    # Left joins keep every feature row; unmatched rows get NaN in the new columns.
    # NOTE(review): if the clusters file holds more than one row per group, or the
    # count file more than one row per key, these joins duplicate feature rows --
    # confirm the lookup tables are unique on their keys.
    df_with_clusters = (
        df_raw
        .merge(df_clusters, how='left', on='group')
        .merge(df_mcare_count, how='left', on=mcare_keys)
    )

    # NOTE: year and group are not dropped since we need them for further experimentation
    df_preprocessed = df_with_clusters.drop(columns=redundant_cols)

    # Binary encoding of site: 1 for inpatient, 0 for outpatient or ASC
    # (any value other than 'Inpatient', including missing, becomes 0).
    # The intent is for the inpatient coefficient to sit above the ASC one,
    # which addresses part of the monotonicity constraint.
    df_preprocessed['site'] = (df_preprocessed['site'] == 'Inpatient').astype('int64')

    return df_preprocessed

In [127]:
# Build the preprocessed frame, then preview its first five rows.
processed_data = preprocess_func()
processed_data.head(n=5)

Unnamed: 0,year,site,group,priv_count,priv_pay_median,mcare_los,mcare_pay_median,CBSA_NAME,lon,lat,...,annual_births,frac_veteran,frac_disability,non_citizen,employment_rate,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,cluster,mcare_count
0,2018,1,breast reconstruction,8,16147.33,2.0,8298.49,"Akron, OH",-81.519005,41.081445,...,160665.0,0.06,0.14,0.4,0.63,0.68,0.4,0.06,0,
1,2018,1,breast reconstruction,4,10420.675,2.888889,8003.4,"Albany-Schenectady-Troy, NY",-73.653621,42.763648,...,208912.0,0.06,0.13,0.38,0.63,0.74,0.38,0.03,0,
2,2018,1,breast reconstruction,1,11658.0,4.222222,8083.55,"Allentown-Bethlehem-Easton, PA-NJ",-75.504376,40.583364,...,191596.0,0.06,0.13,0.42,0.64,0.72,0.37,0.06,0,
3,2018,1,breast reconstruction,1,24543.0,,,"Altoona, PA",-78.394736,40.518681,...,,,,,,,,,0,
4,2018,1,breast reconstruction,9,27320.61,2.916667,12005.46,"Anaheim-Santa Ana-Irvine, CA",-117.888522,33.750247,...,,,,,,,,,0,12.0


In [128]:
# Persist the preprocessed frame. The row index is written as the first
# (unnamed) column, so it comes back as 'Unnamed: 0' unless the reader
# passes index_col=0.
processed_data.to_csv('processed_data.csv', index=True)