# Baseline Model - Python Version

## Library Imports

In [1]:
# Necessary code to import our helper functions
import sys
sys.path.append("..")

In [2]:
# Library imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
from Common_Functions import data_split, add_unique_identifier, data_cleaning, hospital_data_agg

## Data Import

In [3]:
# Load the two raw CSV inputs; both paths are relative to the notebook's working directory.
# `data` is later passed through data_cleaning and data_split.
# `hospital_data` is later passed to hospital_data_agg and the result is joined on "msa".
data = pd.read_csv("../Data_Files/JnJ_Files/priv_mcare_f_pay_2022Oct18.csv")
hospital_data = pd.read_csv("../Data_Files/JnJ_Files/Hospital_Master_Sheet.csv")

## Model Parameters

In [4]:
# Tunable settings for the whole notebook, collected in one place.

# Threshold handed to data_split as `count_thresh`; the final report labels the
# training rows as having ">COUNT_THRESH claims".
COUNT_THRESH: int = 49

# Seed passed as `random_state` to train_test_split.
RDM_SEED: int = 123

# Fraction of the modelling rows assigned to the training partition (`train_size`).
TRAIN_TEST_PROPORTION: float = 0.8

## Data Transformation

### One-Hot Categorical Encoding and Dropping NAs

In [5]:
# Clean the raw frame with the project helper. Per this section's heading it performs
# one-hot categorical encoding and drops NAs; the helper's internals are not visible here.
# NOTE(review): this rebinds `data`, so re-running the cell feeds already-cleaned data back
# into data_cleaning. Re-run the Data Import cell first when a fresh start is needed.
data = data_cleaning(data)

### Hospital Data

In [7]:
# Summarise the hospital master sheet with the project helper. The displayed result is
# keyed by "msa" and carries Hospitals, PctTeaching, PctLargeHospital, Urban and PctPrivate.
# NOTE(review): presumably one row per msa, which the later left-merge relies on — confirm.
hospital_msa = hospital_data_agg(hospital_data)

display(hospital_msa)

Unnamed: 0,msa,Hospitals,PctTeaching,PctLargeHospital,Urban,PctPrivate
0,1,64,0.062500,0.000000,0.0,0.484375
1,2,13,0.076923,0.000000,0.0,0.153846
2,3,18,0.111111,0.000000,0.0,0.444444
3,4,71,0.098592,0.028169,0.0,0.591549
4,5,42,0.023810,0.000000,0.0,0.214286
...,...,...,...,...,...,...
479,99944,4,0.000000,0.000000,1.0,0.000000
480,99945,1,0.000000,0.000000,1.0,0.000000
481,99949,1,0.000000,0.000000,1.0,1.000000
482,99951,2,0.500000,0.000000,1.0,0.000000


### Data Split

In [None]:
# Partition the cleaned data with the project helper, using the notebook-wide threshold.
# `working_set` feeds model training/testing below; `predict_set` is kept for prediction.
# NOTE(review): presumably rows above COUNT_THRESH claims land in working_set — confirm in data_split.
working_set, predict_set = data_split(
    data,
    count_thresh=COUNT_THRESH,
)

In [None]:
# Columns removed from both frames once the hospital features are attached.
# NOTE(review): the pay columns look like alternative summaries of the target/Medicare
# payment and "msa" is only the join key — confirm the rationale for dropping "Urban".
_DROPPED_COLUMNS = [
    "priv_pay_mean",
    "priv_pay_iqr",
    "mcare_pay_mean",
    "mcare_pay_sd",
    "Urban",
    "msa",
]


def _attach_hospital_features(claims, hospital_features):
    """Left-join hospital features onto `claims` by "msa" and drop unused columns.

    Parameters
    ----------
    claims : pandas.DataFrame
        Frame containing an "msa" column plus the pay columns listed in
        `_DROPPED_COLUMNS`.
    hospital_features : pandas.DataFrame
        Frame keyed by "msa" (here: `hospital_msa`).

    Returns
    -------
    pandas.DataFrame
        A new frame; neither input is modified. Rows of `claims` with no matching
        "msa" are kept and receive NaN in the hospital columns (left join).
    """
    merged = claims.merge(hospital_features, how="left", on="msa")
    return merged.drop(columns=_DROPPED_COLUMNS)


# Apply the identical transformation to both partitions so their columns always match.
model_data = _attach_hospital_features(working_set, hospital_msa)
predict_data = _attach_hospital_features(predict_set, hospital_msa)

In [None]:
# Visual sanity check of the merged modelling frame before splitting into train/test.
display(model_data)

### Train / Test Split

In [None]:
# "priv_pay_median" is the regression target; every other remaining column is a predictor.
y_input = model_data["priv_pay_median"]
X_input = model_data.drop(columns=["priv_pay_median"])

# Seeded hold-out split: TRAIN_TEST_PROPORTION of the rows go to training, the rest to test.
X_train, X_test, y_train, y_test = train_test_split(
    X_input,
    y_input,
    train_size=TRAIN_TEST_PROPORTION,
    random_state=RDM_SEED,
)

### Random Forest Regressor
Note: some arguments are altered (One-Hot Encoding made this infeasible)

In [None]:
# Baseline: a 500-tree random forest regressor.
# random_state is pinned to the notebook-wide seed so the forest's bootstrap samples and
# feature sub-sampling — and therefore the reported MAPE — are reproducible across
# Restart & Run All. Without it every run produces a slightly different model.
baseline_model = RandomForestRegressor(n_estimators=500, random_state=RDM_SEED)

In [None]:
# Fit the forest on the training partition only; the test partition stays unseen.
baseline_model.fit(X_train, y_train)

In [None]:
# In-sample predictions, used for the "Train MAPE" figure reported at the end.
y_train_pred = baseline_model.predict(X_train)

In [None]:
# Held-out predictions, used for the "Test MAPE" figure reported at the end.
y_test_pred = baseline_model.predict(X_test)

In [None]:
# Score both partitions first, then report; a large train/test gap indicates overfitting.
# The values are printed as raw fractions exactly as returned by the metric function.
train_mape = mean_absolute_percentage_error(y_true=y_train, y_pred=y_train_pred)
test_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)

print(f"With Threshold >{COUNT_THRESH} claims for training set:\n")
print(f"Train MAPE: {train_mape}")
print(f"Test MAPE: {test_mape}")