# Compartmentalized Baseline Model - Python Version

## Library Imports

In [1]:
# Necessary code to import our helper functions
import sys
sys.path.append("..")

In [2]:
# Library imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from Common_Functions import data_split, add_unique_identifier

## Data Import

In [3]:
# Main modelling table; later cells read columns such as priv_pay_median and msa from it.
data = pd.read_csv(
    filepath_or_buffer="../Data_Files/JnJ_Files/priv_mcare_f_pay_2022Oct18.csv"
)

# Hospital master sheet; aggregated per MSA_CD further down to build MSA-level features.
hospital_data = pd.read_csv(
    filepath_or_buffer="../Data_Files/JnJ_Files/Hospital_Master_Sheet.csv"
)

## Model Parameters

In [4]:
# Claim-count threshold handed to data_split(count_thresh=...); the results banner
# reports it as ">COUNT_THRESH claims for training set".
COUNT_THRESH: int = 49
# Seed passed as random_state to both train_test_split and RandomForestRegressor.
RDM_SEED: int = 123
# Fraction of the working set passed to train_test_split as train_size.
TRAIN_TEST_PROPORTION: float = 0.8

## Data Transformation

### One-Hot Categorical Encoding

In [5]:
# Expand every object/categorical column into indicator columns (numeric columns pass through).
# NOTE(review): this runs before the dropna cell, so a row whose categorical value is
# missing becomes all-zero indicators and is not removed later — confirm that is intended.
data = pd.get_dummies(data=data)

### Drop NAs

In [6]:
# Remove every row that still has at least one missing value (pandas defaults, spelled out).
data = data.dropna(axis=0, how="any")

### Hospital Data

In [7]:
# Hospital-level 0/1 flags, each derived from one categorical column of the master sheet.
hospital_flags = {
    "Is_Teaching": hospital_data["teaching"] == "YES",
    "Beds_Over_500": hospital_data["beds_grp"] == "500+",
    "Is_Urban": hospital_data["urban_rural"] == "URBAN",
    # Both private ownership categories count as private.
    "Is_Private": hospital_data["ownership"].isin(
        ["PRIVATE (NOT FOR PROFIT)", "PRIVATE (FOR PROFIT)"]
    ),
}
for flag_name, flag_mask in hospital_flags.items():
    hospital_data[flag_name] = flag_mask.astype(int)

# One row per MSA: hospital count plus the share of hospitals carrying each flag
# (the mean of a 0/1 column is a proportion).
hospital_msa = (
    hospital_data
    .groupby("MSA_CD")
    .agg(
        Hospitals=("prvdr_num", "count"),
        PctTeaching=("Is_Teaching", "mean"),
        PctLargeHospital=("Beds_Over_500", "mean"),
        Urban=("Is_Urban", "mean"),
        PctPrivate=("Is_Private", "mean"),
    )
    .reset_index()
    # Align the key name with the "msa" column used for the later merge.
    .rename(columns={"MSA_CD": "msa"})
)

display(hospital_msa)

Unnamed: 0,msa,Hospitals,PctTeaching,PctLargeHospital,Urban,PctPrivate
0,1,64,0.062500,0.000000,0.0,0.484375
1,2,13,0.076923,0.000000,0.0,0.153846
2,3,18,0.111111,0.000000,0.0,0.444444
3,4,71,0.098592,0.028169,0.0,0.591549
4,5,42,0.023810,0.000000,0.0,0.214286
...,...,...,...,...,...,...
479,99944,4,0.000000,0.000000,1.0,0.000000
480,99945,1,0.000000,0.000000,1.0,0.000000
481,99949,1,0.000000,0.000000,1.0,1.000000
482,99951,2,0.500000,0.000000,1.0,0.000000


### Data Split

In [8]:
# Partition the cleaned table with the project helper.
# Presumably working_set holds the rows above the claim-count threshold (used for
# train/test) and predict_set the remainder — verify against Common_Functions.data_split.
working_set, predict_set = data_split(
    data,
    count_thresh=COUNT_THRESH,
)

In [9]:
# Columns removed after the merge so they are not part of the modelling frames.
# Kept in one place so model_data and predict_data always share the same schema,
# which baseline_model.predict requires.
_COLUMNS_DROPPED_AFTER_MERGE = [
    "priv_pay_mean",
    "priv_pay_iqr",
    "mcare_pay_mean",
    "mcare_pay_sd",
    "Urban",
    "msa",
]


def _attach_hospital_features(claims, msa_features):
    """Left-join MSA-level hospital aggregates onto ``claims`` and drop unused columns.

    Parameters
    ----------
    claims : pandas.DataFrame
        Frame containing an ``msa`` key column and the columns listed in
        ``_COLUMNS_DROPPED_AFTER_MERGE`` (except ``Urban``, which comes from
        ``msa_features``).
    msa_features : pandas.DataFrame
        One row per ``msa`` with the hospital aggregate columns.

    Returns
    -------
    pandas.DataFrame
        ``claims`` rows, in their original order, with the hospital aggregate
        columns appended and the dropped columns removed.

    Notes
    -----
    NOTE(review): because this is a left join, any ``msa`` value absent from
    ``msa_features`` yields NaN in the hospital columns — confirm none occur,
    since the earlier dropna cell runs before this merge.
    """
    merged = claims.merge(msa_features, how="left", on="msa")
    return merged.drop(columns=_COLUMNS_DROPPED_AFTER_MERGE)


model_data = _attach_hospital_features(working_set, hospital_msa)
predict_data = _attach_hospital_features(predict_set, hospital_msa)

In [10]:
# Inspect the merged modelling frame: the working-set columns plus the MSA-level hospital aggregates.
display(model_data)

Unnamed: 0,year,priv_count,priv_pay_median,mcare_count,mcare_los,mcare_pay_median,lon,lat,site_ASC,site_Inpatient,...,State_Vermont,State_Virginia,State_Washington,State_West Virginia,State_Wisconsin,State_Wyoming,Hospitals,PctTeaching,PctLargeHospital,PctPrivate
0,2018,50.0,16233.530,27.0,0.000000,3372.33,-78.928824,33.919657,0,0,...,0,0,0,0,0,0,7,0.142857,0.000000,0.571429
1,2020,50.0,18060.000,53.0,0.000000,9916.43,-122.266155,37.844527,0,0,...,0,0,0,0,0,0,42,0.238095,0.071429,0.547619
2,2019,50.0,8059.035,323.0,0.000000,4244.26,-115.146665,36.097195,0,0,...,0,0,0,0,0,0,28,0.392857,0.071429,0.928571
3,2019,50.0,8851.020,486.0,0.000000,3601.26,-97.516428,35.467560,0,0,...,0,0,0,0,0,0,45,0.244444,0.088889,0.466667
4,2019,50.0,17827.615,83.0,3.891566,19673.49,-80.133611,25.806053,0,1,...,0,0,0,0,0,0,39,0.410256,0.153846,0.871795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3305,2018,1674.0,8942.500,1112.0,0.000000,3749.44,-95.622552,29.598443,0,0,...,0,0,0,0,0,0,181,0.088398,0.060773,0.823204
3306,2018,1843.0,14929.630,2576.0,0.000000,4331.08,-96.920913,32.707875,0,0,...,0,0,0,0,0,0,114,0.105263,0.052632,0.807018
3307,2018,1900.0,8746.360,1084.0,0.000000,3647.39,-84.294090,34.075376,0,0,...,0,0,0,0,0,0,80,0.162500,0.050000,0.725000
3308,2018,1919.0,10701.580,4248.0,0.000000,5153.90,-74.005954,40.712776,0,0,...,0,0,0,0,0,0,143,0.552448,0.230769,0.643357


### Train / Test Split

In [11]:
# Target is priv_pay_median; every other remaining column is used as a feature.
y_input = model_data["priv_pay_median"]
X_input = model_data.drop(columns="priv_pay_median")

# Seeded split so the same rows land in train and test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_input,
    y_input,
    train_size=TRAIN_TEST_PROPORTION,
    random_state=RDM_SEED,
)

### Random Forest Regressor
Note: some model arguments differ from the original version of this model, because one-hot encoding the categorical features made the original settings infeasible.

In [12]:
# 500-tree random forest with a fixed seed; all other hyperparameters stay at scikit-learn defaults.
baseline_model = RandomForestRegressor(
    random_state=RDM_SEED,
    n_estimators=500,
)

In [13]:
# Fit on the training split only; the test split is held back for evaluation below.
baseline_model.fit(X=X_train, y=y_train)

In [14]:
# In-sample predictions, used for the train MAPE reported below.
y_train_pred = baseline_model.predict(X=X_train)

In [15]:
# Held-out predictions, used for the test MAPE reported below.
y_test_pred = baseline_model.predict(X=X_test)

In [16]:
# Mean absolute percentage error on each split (returned as a fraction, not a percent).
train_mape = mean_absolute_percentage_error(y_true=y_train, y_pred=y_train_pred)
test_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)

print(f"With Threshold >{COUNT_THRESH} claims for training set:\n")
print(f"Train MAPE: {train_mape}")
print(f"Test MAPE: {test_mape}")

With Threshold >49 claims for training set:

Train MAPE: 0.08033590542552521
Test MAPE: 0.2054228450025808
