# Baseline Model - Python Version

## Library Imports

In [1]:
# Necessary code to import our helper functions
import sys
sys.path.append("../..")

In [2]:
# Library imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error
from Common_Functions import add_unique_identifier, data_cleaning, hospital_data_agg

  from pandas import MultiIndex, Int64Index


In [3]:
def data_split(data, count_col_name = 'priv_count', count_thresh = 50):
    """
    This function splits the data into model set and future set. Model set is the data used to train, evaluate
    test the model. Future data is what the model needs to predict on.
    
    Args:
    data (pandas: DataFrame) - a pandas data frame with at least 3 columns - "priv_pay_mean", "priv_pay_median" and count_col_name
    count_col_name (str) - name of the column which is thresholded to make the split
    count_thresh (int) - threshold value used to split data on count_col_name
    
    Returns:
    model_data (pandas: DataFrame) - data frame with observations that will be used to train and test model
    future_data (pandas: DataFrame) - data frame with all observations on which model will make predictions
    """
    data = data[(data['priv_pay_median'] > 0) | (data['priv_pay_median'].isnull())]
    future_data = data[(data[count_col_name] <= count_thresh) | (data[count_col_name].isnull())]
    model_data = data[data[count_col_name] > count_thresh]
    model_data = model_data[model_data.priv_pay_median.notnull()]
    return model_data, future_data

## Data Import

In [4]:
data = pd.read_csv("../../Feature Matrix/processed_data.csv")

## Model Parameters

In [5]:
COUNT_THRESH = 34
RDM_SEED = 123
TRAIN_TEST_PROPORTION = 0.8

## Data Transformation

### One-Hot Categorical Encoding and Dropping NAs

In [6]:
data.drop(columns = ['CBSA_NAME'], inplace=True)
data = data_cleaning(data)

### Data Split

In [7]:
working_set, predict_set = data_split(data, count_thresh = COUNT_THRESH)

In [8]:
model_data = working_set
predict_data = predict_set

In [9]:
display(model_data)

Unnamed: 0.1,Unnamed: 0,year,site,priv_count,priv_pay_median,mcare_los,mcare_pay_median,lon,lat,Hospitals,...,group_revision_tha,group_revision_tka,group_robotic_assisted_surgery,group_rtc_slap_bank,group_septoplasty,group_tha,group_thoracic,group_tka,group_tpa,group_tsa
40,40,2018,1,63,24289.900,2.549296,8794.190,-96.920913,32.707875,114.0,...,0,0,0,0,0,0,0,0,0,0
70,70,2018,1,51,21408.000,3.543210,10395.160,-95.622552,29.598443,181.0,...,0,0,0,0,0,0,0,0,0,0
112,112,2018,1,64,29757.100,3.918699,14174.100,-74.005954,40.712776,143.0,...,0,0,0,0,0,0,0,0,0,0
219,219,2019,1,66,25240.905,3.241935,10144.445,-96.920913,32.707875,114.0,...,0,0,0,0,0,0,0,0,0,0
275,275,2019,1,45,34963.900,3.262295,14008.190,-74.005954,40.712776,143.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41969,41969,2019,0,55,8614.250,0.000000,6291.420,-83.079090,42.810540,30.0,...,0,0,0,0,0,0,0,0,0,0
41993,41993,2020,0,62,11590.770,0.000000,6628.030,-84.294090,34.075380,80.0,...,0,0,0,0,0,0,0,0,0,0
42065,42065,2020,0,39,20492.920,0.000000,7015.710,-95.622550,29.598440,181.0,...,0,0,0,0,0,0,0,0,0,0
42110,42110,2020,0,44,13777.100,0.000000,8517.130,-74.005950,40.712780,143.0,...,0,0,0,0,0,0,0,0,0,0


### Train / Test Split

In [10]:
X_input = model_data.drop(columns=["priv_pay_median"])
y_input = model_data["priv_pay_median"]

X_train, X_test, y_train, y_test = train_test_split(X_input,
                                                    y_input,
                                                    train_size = TRAIN_TEST_PROPORTION,
                                                    random_state = RDM_SEED)


### Monotonicity Contraint - Inpatient must be most expensive, all else equal

In [11]:
mono = {'site': 1}

params = {
    'base_score':0.5,
    'booster':'gbtree',
    'colsample_bylevel':1,
    'colsample_bytree':1,
    'enable_categorical':False,
    'gamma':0,
    'gpu_id':-1,
    'interaction_constraints':'',
    'max_delta_step':0,
    'min_child_weight':1,
    'missing':None,
    'n_estimators':250,
    'n_jobs':8,
    'predictor':'auto',
    'reg_alpha':0,
    'scale_pos_weight':1,
    'tree_method':'exact',
    'validate_parameters':1,
    'monotone_constraints':mono,
    'learning_rate':1,
    'max_depth':25,
    'num_parallel_tree':250,
    'objective':'reg:squarederror',
    'subsample':0.8,
    'random_state':123,
    'reg_lambda':0
}

xgb_mono_model = xgb.XGBRFRegressor(monotone_constraints = mono, n_estimators = 250, max_depth=25)

In [12]:
xgb_mono_model.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [13]:
y_train_pred_xgb = xgb_mono_model.predict(X_train)
y_test_pred_xgb = xgb_mono_model.predict(X_test)

In [14]:
print(f"Monotonic XGBoost with Threshold >{COUNT_THRESH} claims for training set:")
print(f"Train MAPE: {mean_absolute_percentage_error(y_true=y_train, y_pred=y_train_pred_xgb)}")
print(f"Test MAPE: {mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred_xgb)}\n")

Monotonic XGBoost with Threshold >34 claims for training set:
Train MAPE: 0.04856107764307591
Test MAPE: 0.15299322425019787



In [15]:
print(X_train.columns)
xgb_mono_model.feature_importances_

Index(['Unnamed: 0', 'year', 'site', 'priv_count', 'mcare_los',
       'mcare_pay_median', 'lon', 'lat', 'Hospitals', 'PctTeaching',
       'PctLargeHospital', 'PctPrivate', 'total_population', 'median_age',
       'sex_ratio', 'State_Poverty_Percent_All_Ages',
       'State_Median_Household_Income', 'income_pc', 'num_races',
       'household_size', 'frac_married', 'frac_school', 'frac_college',
       'frac_educated', 'annual_births', 'frac_veteran', 'frac_disability',
       'non_citizen', 'employment_rate', 'frac_priv_insurance',
       'frac_mcare_insurance', 'frac_no_insurance', 'cluster', 'mcare_count',
       'group_ankle_fix', 'group_ant_cerv_fusion', 'group_ant_tls_fusion',
       'group_bariatric', 'group_breast reconstruction', 'group_bsp',
       'group_bunionectomy', 'group_cardiac ablation',
       'group_cardiac ablation_additional_discrete',
       'group_cardiac ablation_linear_focal',
       'group_cardiac_ablaton_anesthesia', 'group_cardiac_ablaton_ice',
       'gro

array([9.2638969e-05, 4.9204235e-05, 1.9999980e-03, 1.0888505e-04,
       6.0096779e-04, 1.2320700e-02, 5.9282861e-04, 1.6453635e-03,
       1.3231790e-03, 1.4695366e-03, 1.3599931e-03, 2.3074588e-03,
       5.3718998e-03, 9.1604242e-04, 2.2519466e-03, 2.8031151e-04,
       5.5688014e-04, 3.0098043e-04, 4.7166436e-03, 1.4471639e-03,
       4.1784090e-03, 1.9651582e-03, 3.0937276e-03, 1.7110382e-03,
       1.0335204e-02, 1.2497130e-03, 8.9047467e-03, 2.6516598e-03,
       2.0938204e-03, 2.2679502e-03, 2.0771823e-03, 1.0181512e-02,
       1.2958618e-02, 2.7210670e-04, 4.6603777e-04, 1.4466555e-03,
       1.5379035e-01, 5.6067255e-04, 2.2306245e-04, 1.7808010e-04,
       1.0517055e-04, 5.5704970e-04, 5.5468513e-04, 1.3936297e-03,
       7.7694893e-04, 2.0571254e-04, 1.4171355e-05, 7.8006677e-04,
       0.0000000e+00, 5.1001924e-05, 2.4062032e-03, 2.0206370e-03,
       5.8746251e-04, 2.1987881e-03, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 4.9826082e-05, 0.0000000e+00, 0.0000000e