In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Silence every Python warning for the rest of the session. This also hides
# deprecation notices from pandas/numpy, so re-enable warnings when debugging.
warnings.filterwarnings("ignore")

# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# Plot styling: seaborn's "darkgrid" is applied first, then matplotlib's
# 'ggplot' style sheet is applied on top of it.
sns.set_style("darkgrid");
plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'

In [31]:
# Load combined_features.csv; the path is resolved relative to the notebook's
# working directory.
df_raw = pd.read_csv('combined_features.csv')
# Preview the first rows.
df_raw.head()

Unnamed: 0,msa,year,site,group,priv_count,priv_pay_mean,priv_pay_median,priv_pay_iqr,mcare_los,mcare_pay_mean,...,frac_veteran,frac_disability,non_citizen,employment_rate,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,poverty_rate,emp,ap
0,10420,2018,Inpatient,breast reconstruction,8,19937.08375,16147.33,5692.86,2.0,8313.8475,...,0.06,0.14,0.4,0.63,0.68,0.4,0.06,0.09,296941.0,14549253.0
1,10580,2018,Inpatient,breast reconstruction,4,14837.26,10420.675,4474.06,2.888889,9230.5,...,0.06,0.13,0.38,0.63,0.74,0.38,0.03,0.07,364490.0,19095678.0
2,10900,2018,Inpatient,breast reconstruction,1,11658.0,11658.0,0.0,4.222222,10730.78778,...,0.06,0.13,0.42,0.64,0.72,0.37,0.06,0.08,338802.0,17175532.0
3,11020,2018,Inpatient,breast reconstruction,1,24543.0,24543.0,0.0,,,...,,,,,,,,,53539.0,2215160.0
4,11244,2018,Inpatient,breast reconstruction,9,25467.99,27320.61,17249.0,2.916667,13611.27167,...,,,,,,,,,,


In [32]:
# Load the group -> cluster lookup table; the path is resolved relative to the
# notebook's working directory.
df_clusters = pd.read_csv('clusters_only_using_NormCost.csv')
# Preview the first rows.
df_clusters.head()

Unnamed: 0,group,cluster
0,ankle_fix,2
1,ant_cerv_fusion,2
2,ant_tls_fusion,1
3,bariatric,0
4,breast reconstruction,0


In [33]:
# Attach each row's cluster label by joining on the `group` key.
# how='left' keeps every row of df_raw; a group with no entry in df_clusters
# ends up with NaN in `cluster`.
# validate='many_to_one' makes pandas raise MergeError if df_clusters lists a
# group more than once, instead of silently duplicating rows of df_raw.
df_with_clusters = df_raw.merge(
    df_clusters,
    how='left',
    on='group',
    validate='many_to_one',
)
df_with_clusters.head()

Unnamed: 0,msa,year,site,group,priv_count,priv_pay_mean,priv_pay_median,priv_pay_iqr,mcare_los,mcare_pay_mean,...,frac_disability,non_citizen,employment_rate,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,poverty_rate,emp,ap,cluster
0,10420,2018,Inpatient,breast reconstruction,8,19937.08375,16147.33,5692.86,2.0,8313.8475,...,0.14,0.4,0.63,0.68,0.4,0.06,0.09,296941.0,14549253.0,0
1,10580,2018,Inpatient,breast reconstruction,4,14837.26,10420.675,4474.06,2.888889,9230.5,...,0.13,0.38,0.63,0.74,0.38,0.03,0.07,364490.0,19095678.0,0
2,10900,2018,Inpatient,breast reconstruction,1,11658.0,11658.0,0.0,4.222222,10730.78778,...,0.13,0.42,0.64,0.72,0.37,0.06,0.08,338802.0,17175532.0,0
3,11020,2018,Inpatient,breast reconstruction,1,24543.0,24543.0,0.0,,,...,,,,,,,,53539.0,2215160.0,0
4,11244,2018,Inpatient,breast reconstruction,9,25467.99,27320.61,17249.0,2.916667,13611.27167,...,,,,,,,,,,0


In [34]:
# Remove the columns that are not carried forward; `group` has already been
# joined to its cluster label in df_with_clusters.
df_preprocessed = df_with_clusters.drop(
    columns=['FIPS.State.Code', 'poverty_rate', 'emp', 'ap', 'State', 'group']
)
df_preprocessed.head()

Unnamed: 0,msa,year,site,priv_count,priv_pay_mean,priv_pay_median,priv_pay_iqr,mcare_los,mcare_pay_mean,mcare_pay_median,...,frac_educated,annual_births,frac_veteran,frac_disability,non_citizen,employment_rate,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,cluster
0,10420,2018,Inpatient,8,19937.08375,16147.33,5692.86,2.0,8313.8475,8298.49,...,0.92,160665.0,0.06,0.14,0.4,0.63,0.68,0.4,0.06,0
1,10580,2018,Inpatient,4,14837.26,10420.675,4474.06,2.888889,9230.5,8003.4,...,0.93,208912.0,0.06,0.13,0.38,0.63,0.74,0.38,0.03,0
2,10900,2018,Inpatient,1,11658.0,11658.0,0.0,4.222222,10730.78778,8083.55,...,0.91,191596.0,0.06,0.13,0.42,0.64,0.72,0.37,0.06,0
3,11020,2018,Inpatient,1,24543.0,24543.0,0.0,,,,...,,,,,,,,,,0
4,11244,2018,Inpatient,9,25467.99,27320.61,17249.0,2.916667,13611.27167,12005.46,...,,,,,,,,,,0


In [35]:
# OHE
def map_site(val):
    """Encode a site label as a binary flag.

    Returns 1 when ``val`` equals the string 'Inpatient' and 0 for any
    other value.
    """
    return 1 if val == 'Inpatient' else 0
    
# Encode from the untouched upstream frame rather than from df_preprocessed
# itself: re-running this cell then gives the same result instead of mapping
# the already-encoded 0/1 values to all zeros. df_preprocessed was produced by
# dropping columns from df_with_clusters, so the two share the same row index.
df_preprocessed['site'] = df_with_clusters['site'].map(map_site)

In [36]:
# Target Encoding
# Replace the CBSA_NAME column with the output of a target encoder fitted
# against priv_pay_median.

from category_encoders import TargetEncoder

# NOTE(review): the encoder is fitted on every row of df_preprocessed; if
# priv_pay_median is later used as a prediction target, this should be fitted
# on the training split only to avoid target leakage — confirm.
te = TargetEncoder(handle_unknown='ignore')
df_preprocessed['CBSA_NAME'] = te.fit_transform(df_preprocessed['CBSA_NAME'], df_preprocessed['priv_pay_median'])

In [40]:
# knn_func comes from the project-local KNN_function module (not shown here).
# NOTE(review): presumably it KNN-imputes missing values — the null count
# further down reports 0 for the listed columns of its output; confirm
# against the module.
from KNN_function import knn_func
df_imputed = knn_func(df_preprocessed)

In [46]:
# Inspect the column dtypes of the imputed frame.
df_imputed.dtypes

msa                               float64
year                              float64
site                              float64
priv_count                        float64
priv_pay_mean                     float64
priv_pay_median                   float64
priv_pay_iqr                      float64
mcare_los                         float64
mcare_pay_mean                    float64
mcare_pay_median                  float64
mcare_pay_sd                      float64
CBSA_NAME                         float64
lon                               float64
lat                               float64
Hospitals                         float64
PctTeaching                       float64
PctLargeHospital                  float64
PctPrivate                        float64
total_population                  float64
median_age                        float64
sex_ratio                         float64
State_Poverty_Percent_All_Ages    float64
State_Median_Household_Income     float64
income_pc                         

In [44]:
# Count the remaining missing values per column.
# DataFrame.sum() is called directly (column-wise by default) instead of
# np.sum(frame): the numpy wrapper forwards axis=None to pandas, and pandas
# has deprecated axis=None for sum in favour of reducing to a single scalar,
# a notice that the global warnings filter in this notebook would hide.
df_imputed.isna().sum()

msa                               0
year                              0
site                              0
priv_count                        0
priv_pay_mean                     0
priv_pay_median                   0
priv_pay_iqr                      0
mcare_los                         0
mcare_pay_mean                    0
mcare_pay_median                  0
mcare_pay_sd                      0
CBSA_NAME                         0
lon                               0
lat                               0
Hospitals                         0
PctTeaching                       0
PctLargeHospital                  0
PctPrivate                        0
total_population                  0
median_age                        0
sex_ratio                         0
State_Poverty_Percent_All_Ages    0
State_Median_Household_Income     0
income_pc                         0
num_races                         0
household_size                    0
frac_married                      0
frac_school                 

In [49]:
# Persist the imputed feature matrix.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; readers of this file need index_col=0 (or pass index=False here).
df_imputed.to_csv('feature_matrix_draft1.csv')