# Baseline Model - Python Version

## Library Imports

In [1]:
# Necessary code to import our helper functions
import sys
sys.path.append("../..")

In [2]:
# Library imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error
from Common_Functions import data_split, add_unique_identifier, data_cleaning, hospital_data_agg

  from pandas import MultiIndex, Int64Index


## Data Import

In [3]:
data = pd.read_csv("../../Data_Files/JnJ_Files/priv_mcare_f_pay_2022Oct18.csv")
hospital_data = pd.read_csv("../../Data_Files/JnJ_Files/Hospital_Master_Sheet.csv")

## Model Parameters

In [4]:
COUNT_THRESH = 34
RDM_SEED = 123
TRAIN_TEST_PROPORTION = 0.8

## Data Transformation

### One-Hot Categorical Encoding and Dropping NAs

In [5]:
data.drop(columns = ['CBSA_NAME'], inplace=True)
data = data_cleaning(data)

### Hospital Data

In [6]:
hospital_msa = hospital_data_agg(hospital_data)

display(hospital_msa)

Unnamed: 0,msa,Hospitals,PctTeaching,PctLargeHospital,Urban,PctPrivate
0,1,64,0.062500,0.000000,0.0,0.484375
1,2,13,0.076923,0.000000,0.0,0.153846
2,3,18,0.111111,0.000000,0.0,0.444444
3,4,71,0.098592,0.028169,0.0,0.591549
4,5,42,0.023810,0.000000,0.0,0.214286
...,...,...,...,...,...,...
479,99944,4,0.000000,0.000000,1.0,0.000000
480,99945,1,0.000000,0.000000,1.0,0.000000
481,99949,1,0.000000,0.000000,1.0,1.000000
482,99951,2,0.500000,0.000000,1.0,0.000000


### Data Split

In [7]:
working_set, predict_set = data_split(data, count_thresh = COUNT_THRESH)

In [8]:
model_data = working_set.merge(hospital_msa,how="left", on="msa").drop(columns=["priv_pay_mean",
                                                                                "priv_pay_iqr",
                                                                                "mcare_pay_mean",
                                                                                "mcare_pay_sd",
                                                                                "Urban",
                                                                                "msa"])
predict_data = predict_set.merge(hospital_msa,how="left", on="msa").drop(columns=["priv_pay_mean",
                                                                                  "priv_pay_iqr",
                                                                                  "mcare_pay_mean",
                                                                                  "mcare_pay_sd",
                                                                                  "Urban",
                                                                                  "msa"])

In [9]:
display(model_data)

Unnamed: 0,year,priv_count,priv_pay_median,mcare_count,mcare_los,mcare_pay_median,lon,lat,site_ASC,site_Inpatient,...,State_Vermont,State_Virginia,State_Washington,State_West Virginia,State_Wisconsin,State_Wyoming,Hospitals,PctTeaching,PctLargeHospital,PctPrivate
0,2018,35.0,10083.34,88.0,0.0,4376.17,-83.743038,42.280826,0,0,...,0,0,0,0,0,0,6,0.500000,0.333333,0.500000
1,2019,35.0,9076.20,56.0,0.0,5985.04,-105.270546,40.014986,0,0,...,0,0,0,0,0,0,5,0.000000,0.000000,0.600000
2,2019,35.0,17251.33,121.0,0.0,4386.17,-111.891047,40.760779,0,0,...,0,0,0,0,0,0,18,0.388889,0.055556,0.777778
3,2020,35.0,11520.39,20.0,0.0,3896.76,-91.665623,41.977880,0,0,...,0,0,0,0,0,0,6,0.166667,0.166667,0.333333
4,2020,35.0,16731.80,97.0,0.0,3719.49,-78.928824,33.919657,0,0,...,0,0,0,0,0,0,7,0.142857,0.000000,0.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4518,2018,1674.0,8942.50,1112.0,0.0,3749.44,-95.622552,29.598443,0,0,...,0,0,0,0,0,0,181,0.088398,0.060773,0.823204
4519,2018,1843.0,14929.63,2576.0,0.0,4331.08,-96.920913,32.707875,0,0,...,0,0,0,0,0,0,114,0.105263,0.052632,0.807018
4520,2018,1900.0,8746.36,1084.0,0.0,3647.39,-84.294090,34.075376,0,0,...,0,0,0,0,0,0,80,0.162500,0.050000,0.725000
4521,2018,1919.0,10701.58,4248.0,0.0,5153.90,-74.005954,40.712776,0,0,...,0,0,0,0,0,0,143,0.552448,0.230769,0.643357


### Train / Test Split

In [10]:
X_input = model_data.drop(columns=["priv_pay_median"])
y_input = model_data["priv_pay_median"]

X_train, X_test, y_train, y_test = train_test_split(X_input,
                                                    y_input,
                                                    train_size = TRAIN_TEST_PROPORTION,
                                                    random_state = RDM_SEED)


### Monotonicity Contraint - Inpatient must be most expensive, all else equal

In [11]:
mono = {'site_Inpatient': 1}

params = {
    'base_score':0.5,
    'booster':'gbtree',
    'colsample_bylevel':1,
    'colsample_bytree':1,
    'enable_categorical':False,
    'gamma':0,
    'gpu_id':-1,
    'interaction_constraints':'',
    'max_delta_step':0,
    'min_child_weight':1,
    'missing':None,
    'n_estimators':250,
    'n_jobs':8,
    'predictor':'auto',
    'reg_alpha':0,
    'scale_pos_weight':1,
    'tree_method':'exact',
    'validate_parameters':1,
    'monotone_constraints':mono,
    'learning_rate':1,
    'max_depth':25,
    'num_parallel_tree':250,
    'objective':'reg:squarederror',
    'subsample':0.8,
    'random_state':123,
    'reg_lambda':0
}

xgb_mono_model = xgb.XGBRFRegressor(monotone_constraints = mono, n_estimators = 250, max_depth=25)

In [12]:
xgb_mono_model.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [13]:
y_train_pred_xgb = xgb_mono_model.predict(X_train)
y_test_pred_xgb = xgb_mono_model.predict(X_test)

In [14]:
print(f"Monotonic XGBoost with Threshold >{COUNT_THRESH} claims for training set:")
print(f"Train MAPE: {mean_absolute_percentage_error(y_true=y_train, y_pred=y_train_pred_xgb)}")
print(f"Test MAPE: {mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred_xgb)}\n")

Monotonic XGBoost with Threshold >34 claims for training set:
Train MAPE: 0.04438636386863898
Test MAPE: 0.16157810504441503



In [15]:
print(X_train.columns)
xgb_mono_model.feature_importances_

Index(['year', 'priv_count', 'mcare_count', 'mcare_los', 'mcare_pay_median',
       'lon', 'lat', 'site_ASC', 'site_Inpatient', 'site_Outpatient',
       ...
       'State_Vermont', 'State_Virginia', 'State_Washington',
       'State_West Virginia', 'State_Wisconsin', 'State_Wyoming', 'Hospitals',
       'PctTeaching', 'PctLargeHospital', 'PctPrivate'],
      dtype='object', length=116)


array([9.1861199e-05, 1.8381640e-04, 3.6137647e-04, 4.4745961e-03,
       2.1484580e-02, 2.1869140e-03, 2.8763202e-03, 0.0000000e+00,
       2.1541568e-02, 3.0643936e-02, 1.2601755e-04, 1.2757004e-03,
       1.1974151e-01, 2.4858364e-03, 0.0000000e+00, 5.3489732e-04,
       2.6548776e-04, 2.4502826e-04, 2.6437454e-03, 2.1836252e-03,
       2.2342799e-03, 1.1459516e-03, 1.2179418e-03, 7.3660834e-05,
       1.9004333e-03, 0.0000000e+00, 1.9800050e-04, 1.3078104e-03,
       4.3743270e-04, 1.2941813e-03, 3.1366264e-03, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 1.4329418e-04, 0.0000000e+00,
       0.0000000e+00, 1.6174743e-04, 1.1394635e-04, 6.6951255e-04,
       0.0000000e+00, 1.1025753e-03, 4.0490315e-05, 0.0000000e+00,
       1.4178280e-02, 6.3124472e-01, 5.1433011e-04, 1.9458412e-04,
       1.0122882e-03, 1.3944195e-04, 0.0000000e+00, 0.0000000e+00,
       5.4903637e-04, 3.0859961e-04, 1.6726034e-04, 0.0000000e+00,
       9.3606015e-04, 1.5145373e-03, 1.2245701e-03, 1.0974123e