# Tree-based Models, with Clusters

## Library Imports

In [1]:
# Necessary code to import our helper functions
import sys
sys.path.append("../..")

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Library imports
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from matplotlib import pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from Common_Functions import data_split, add_unique_identifier, data_cleaning, hospital_data_agg, optimal_k

In [3]:
# Method from Shruti's code
def standardize_data(train_data, val_data):
    train_temp = train_data.drop(columns = ['site','cluster','lat','lon'])
    val_temp = val_data.drop(columns = ['site','cluster','lat','lon'])
    
    scaler = MinMaxScaler()
    
    train_data_scaled = scaler.fit_transform(train_temp)
    train_data_scaled = pd.DataFrame(train_data_scaled, columns = train_temp.columns)
    train_data_scaled['cluster'] = train_data['cluster'].to_list()
    train_data_scaled['site'] = train_data['site'].to_list()
    train_data_scaled['lat'] = train_data['lat'].to_list()
    train_data_scaled['lon'] = train_data['lon'].to_list()
    
    val_data_scaled = scaler.transform(val_temp)
    val_data_scaled = pd.DataFrame(val_data_scaled, columns = val_temp.columns)
    val_data_scaled['cluster'] = val_data['cluster'].to_list()
    val_data_scaled['site'] = val_data['site'].to_list()
    val_data_scaled['lat'] = val_data['lat'].to_list()
    val_data_scaled['lon'] = val_data['lon'].to_list()
    
    return train_data_scaled, val_data_scaled

In [4]:
# Method slightly modified from Shruti's code
def impute_knn(train_data, val_data, optimal_k):
    train_data_scaled, val_data_scaled = standardize_data(train_data, val_data)

    knn = KNNImputer(n_neighbors = optimal_k)

    # imputing values
    train_data_scaled[list(train_data_scaled.columns)] = knn.fit_transform(train_data_scaled)
    val_data_scaled[list(val_data_scaled.columns)] = knn.transform(val_data_scaled)
    
    return train_data_scaled, val_data_scaled

## Data Import

In [5]:
data = pd.read_csv("../../Feature Matrix/processed_data.csv")
data.dropna(subset = ['mcare_count'], inplace = True)
data.drop(columns = ['year'], inplace = True)
data.replace([np.inf, -np.inf], 0, inplace=True)

## Model Parameters

In [6]:
COUNT_THRESH = 34
RDM_SEED = 42
TRAIN_TEST_PROPORTION = 0.8
MONOTONE_MODEL = True

## Data Transformation

### One-Hot Categorical Encoding and Dropping NAs

In [7]:
data = data_cleaning(data, dropna = False, one_hot = False)

### Data Split

In [8]:
working_set, predict_set = data_split(data, count_thresh = COUNT_THRESH)

In [9]:
model_data = working_set

In [10]:
display(model_data)

Unnamed: 0,site,group,priv_count,priv_pay_median,mcare_los,mcare_pay_median,CBSA_NAME,lon,lat,Hospitals,...,annual_births,frac_veteran,frac_disability,non_citizen,employment_rate,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,cluster,mcare_count
40,1,breast reconstruction,63,24289.900,2.549296,8794.190,"Dallas-Fort Worth-Arlington, TX",-96.920913,32.707875,114.0,...,1974825.0,0.06,0.10,0.59,0.69,0.66,0.25,0.17,0,71.0
70,1,breast reconstruction,51,21408.000,3.543210,10395.160,"Houston-The Woodlands-Sugar Land, TX",-95.622552,29.598443,181.0,...,1808878.0,0.05,0.10,0.57,0.66,0.60,0.28,0.19,0,81.0
112,1,breast reconstruction,64,29757.100,3.918699,14174.100,"New York-Newark-Jersey City, NY-NJ-PA",-74.005954,40.712776,143.0,...,4668590.0,0.03,0.11,0.41,0.65,0.66,0.39,0.07,0,123.0
219,1,breast reconstruction,66,25240.905,3.241935,10144.445,"Dallas-Fort Worth-Arlington, TX",-96.920913,32.707875,114.0,...,1974825.0,0.06,0.10,0.59,0.69,0.66,0.25,0.17,0,62.0
275,1,breast reconstruction,45,34963.900,3.262295,14008.190,"New York-Newark-Jersey City, NY-NJ-PA",-74.005954,40.712776,143.0,...,4668590.0,0.03,0.11,0.41,0.65,0.66,0.39,0.07,0,122.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44974,0,ant_cerv_fusion,54,17137.490,,6934.483,"Atlanta-Sandy Springs-Alpharetta, GA",-84.294090,34.075380,80.0,...,1573561.0,0.07,0.11,0.48,0.67,0.70,0.28,0.12,2,0.0
44991,0,ant_cerv_fusion,39,11807.250,,6905.252,"Charlotte-Concord-Gastonia, NC-SC",-80.721440,35.122320,26.0,...,673803.0,0.06,0.11,0.57,0.67,0.69,0.31,0.10,2,0.0
45038,0,ant_cerv_fusion,35,18421.250,,6724.500,"Memphis, TN-MS-AR",-89.850500,35.038720,28.0,...,327202.0,0.07,0.13,0.64,0.64,0.64,0.37,0.11,2,0.0
45100,0,ant_cerv_fusion,51,13926.250,,7334.787,"Atlanta-Sandy Springs-Alpharetta, GA",-84.294090,34.075380,80.0,...,1573561.0,0.07,0.11,0.48,0.67,0.70,0.28,0.12,2,0.0


## Split Model Data by Cluster

In [11]:
X_input = model_data.drop(columns=["priv_pay_median"])
y_input = model_data["priv_pay_median"]

X_list = []
y_list = []

X_dev_list = []
y_dev_list = []
X_test_list = []
y_test_list = []
optimal_k_list = []

for cluster_label in model_data["cluster"].unique():
    
    X_clu = X_input[X_input["cluster"] == cluster_label]
    y_clu = y_input[X_input["cluster"] == cluster_label]
    
    X_dev_clu, X_test_clu, y_dev_clu, y_test_clu = train_test_split(X_clu,
                                                                    y_clu,
                                                                    train_size = TRAIN_TEST_PROPORTION,
                                                                    random_state = RDM_SEED)
    
    # Target encoding group variable
    te_group = TargetEncoder(min_samples_leaf=1)
    X_dev_clu['group_encoded'] = te_group.fit_transform(X_dev_clu['group'],y_dev_clu)
    X_dev_clu.drop(columns = 'group', inplace = True)
    X_test_clu['group_encoded'] = te_group.transform(X_test_clu['group'])
    X_test_clu.drop(columns = 'group', inplace = True)

    # Target encoding CBSA Name
    te_CBSA_NAME = TargetEncoder(min_samples_leaf=1)
    X_dev_clu['CBSA_NAME_encoded'] = te_CBSA_NAME.fit_transform(X_dev_clu['CBSA_NAME'],y_dev_clu)
    X_dev_clu.drop(columns = 'CBSA_NAME', inplace = True)
    X_test_clu['CBSA_NAME_encoded'] = te_CBSA_NAME.transform(X_test_clu['CBSA_NAME'])
    X_test_clu.drop(columns = 'CBSA_NAME', inplace = True)
    
    # KNN Imputation
    knn_data = X_dev_clu.copy()
    knn_data['priv_pay_median'] = y_dev_clu

    optimal_k_list.append(optimal_k(knn_data))
    print(optimal_k_list)
    
    display(X_dev_clu)
    display(y_dev_clu)
    
    X_dev_clu, X_test_clu = impute_knn(X_dev_clu, X_test_clu, optimal_k_list[-1])
    display(X_dev_clu)
    display(y_dev_clu)
    
    X_dev_list.append(X_dev_clu)
    y_dev_list.append(y_dev_clu)
    X_test_list.append(X_test_clu)
    y_test_list.append(y_test_clu)
    
    print(X_dev_clu.shape[0] / X_test_clu.shape[0])

[10]


Unnamed: 0,site,priv_count,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,...,frac_disability,non_citizen,employment_rate,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,cluster,mcare_count,group_encoded,CBSA_NAME_encoded
21481,0,41,0.000000,4591.315,-81.339941,28.582816,23.0,0.217391,0.086957,0.826087,...,0.12,0.47,0.64,0.66,0.32,0.12,0,44.0,6831.855759,15647.640385
22929,0,218,0.000000,3748.520,-82.998794,39.961176,29.0,0.241379,0.137931,0.655172,...,0.12,0.48,0.67,0.70,0.33,0.08,0,72.0,8095.117704,16160.215484
32546,1,100,2.608696,6513.090,-82.010510,33.473500,13.0,0.461539,0.153846,0.538462,...,0.16,0.44,0.61,0.65,0.38,0.11,0,23.0,13445.381119,9390.670218
3041,0,38,0.000000,10581.030,-104.845462,39.514285,37.0,0.405405,0.135135,0.756757,...,0.10,0.52,0.72,0.71,0.30,0.08,0,251.0,30582.151227,12741.899464
12639,0,124,0.000000,4487.780,-83.079087,42.810536,30.0,0.433333,0.033333,0.733333,...,0.13,0.38,0.62,0.70,0.41,0.05,0,204.0,11049.219082,8825.555299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23362,0,37,0.000000,3428.880,-88.039891,30.695366,8.0,0.500000,0.125000,0.500000,...,,,,,,,0,17.0,8095.117704,5520.372057
20937,0,38,0.000000,4776.510,-71.412834,41.823989,21.0,0.571429,0.095238,0.809524,...,0.14,0.41,0.65,0.69,0.41,0.04,0,112.0,6831.855759,9829.512414
21191,0,95,0.000000,5370.910,-74.005954,40.712776,29.0,0.655172,0.137931,0.827586,...,0.11,0.41,0.65,0.66,0.39,0.07,0,118.0,6831.855759,13783.677232
22172,0,458,0.000000,3582.860,-82.388578,28.274347,49.0,0.285714,0.081633,0.836735,...,0.14,0.40,0.61,0.64,0.37,0.11,0,424.0,9679.484533,15784.754333


21481    15194.680
22929     9129.255
32546    12832.670
3041     30946.355
12639     7606.380
           ...    
23362     6084.840
20937     5845.890
21191     7273.610
22172    10414.140
8330     15377.745
Name: priv_pay_median, Length: 2243, dtype: float64

Unnamed: 0,priv_count,mcare_los,mcare_pay_median,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,...,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,mcare_count,group_encoded,CBSA_NAME_encoded,cluster,site,lat,lon
0,0.002239,0.000000,0.229658,0.108374,0.217391,0.260870,0.826087,0.132510,0.470149,0.390374,...,0.652174,0.454545,0.321429,0.010206,0.134959,0.638434,0.0,0.0,28.582816,-81.339941
1,0.068284,0.000000,0.187501,0.137931,0.241379,0.413793,0.655172,0.105032,0.399254,0.459893,...,0.739130,0.484848,0.178571,0.016701,0.180970,0.668710,0.0,0.0,39.961176,-82.998794
2,0.024254,0.429842,0.325785,0.059113,0.461539,0.461539,0.538462,0.027073,0.455224,0.358289,...,0.630435,0.636364,0.285714,0.005335,0.375839,0.268847,0.0,1.0,33.473500,-82.010510
3,0.001119,0.000000,0.529264,0.177340,0.405405,0.405405,0.756757,0.146767,0.425373,0.625668,...,0.760870,0.393939,0.178571,0.058223,1.000000,0.466797,0.0,0.0,39.514285,-104.845462
4,0.033209,0.000000,0.224479,0.142857,0.433333,0.100000,0.733333,0.217513,0.537313,0.347594,...,0.739130,0.727273,0.071429,0.047321,0.288565,0.235466,0.0,0.0,42.810536,-83.079087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2238,0.000746,0.000000,0.171513,0.034483,0.500000,0.375000,0.500000,0.017641,0.485075,0.171123,...,0.658696,0.678788,0.332143,0.003943,0.180970,0.040235,0.0,0.0,30.695366,-88.039891
2239,0.001119,0.000000,0.238921,0.098522,0.571429,0.285714,0.809524,0.080890,0.555970,0.315508,...,0.717391,0.727273,0.035714,0.025980,0.134959,0.294768,0.0,0.0,41.823989,-71.412834
2240,0.022388,0.000000,0.268653,0.137931,0.655172,0.413793,0.827586,1.000000,0.507463,0.288770,...,0.652174,0.666667,0.142857,0.027372,0.134959,0.528333,0.0,0.0,40.712776,-74.005954
2241,0.157836,0.000000,0.179215,0.236453,0.285714,0.244898,0.836735,0.159312,0.611940,0.331551,...,0.608696,0.606061,0.285714,0.098353,0.238676,0.646533,0.0,0.0,28.274347,-82.388578


21481    15194.680
22929     9129.255
32546    12832.670
3041     30946.355
12639     7606.380
           ...    
23362     6084.840
20937     5845.890
21191     7273.610
22172    10414.140
8330     15377.745
Name: priv_pay_median, Length: 2243, dtype: float64

3.998217468805704
[10, 16]


Unnamed: 0,site,priv_count,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,...,frac_disability,non_citizen,employment_rate,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,cluster,mcare_count,group_encoded,CBSA_NAME_encoded
16216,1,42,1.822951,12227.93,-89.588986,40.693649,7.0,0.428571,0.142857,0.714286,...,,,,,,,2,305.0,26433.691416,23637.448557
13383,1,109,2.751639,11745.97,-80.128575,26.307280,25.0,0.280000,0.160000,0.640000,...,,,,,,,2,1220.0,29217.053056,18639.546946
9477,1,64,1.755937,15780.08,-83.079087,42.810536,30.0,0.433333,0.033333,0.733333,...,0.13,0.38,0.62,0.70,0.41,0.05,2,758.0,35422.578145,20351.342295
15886,1,66,1.795580,12748.37,-89.588986,40.693649,7.0,0.428571,0.142857,0.714286,...,,,,,,,2,362.0,26433.691416,23637.448557
19947,0,98,0.000000,4299.44,-96.920913,32.707875,58.0,0.103448,0.051724,0.758621,...,0.10,0.59,0.69,0.66,0.25,0.17,2,87.0,11312.698917,32405.013443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31104,0,537,0.000000,2743.56,-74.005950,40.712780,143.0,0.552448,0.230769,0.643357,...,0.11,0.41,0.65,0.66,0.39,0.07,2,1164.0,9408.282873,34593.284842
34821,0,52,0.000000,19857.15,-117.455600,33.973340,54.0,0.259259,0.018519,0.666667,...,0.12,0.46,0.61,0.58,0.42,0.09,2,285.0,35235.176678,21848.085278
20388,0,45,0.000000,4390.45,-111.891047,40.760779,18.0,0.388889,0.055556,0.777778,...,0.10,0.56,0.71,0.77,0.21,0.10,2,76.0,11312.698917,19259.040649
36830,0,63,0.000000,16575.66,-97.678900,30.508260,50.0,0.180000,0.040000,0.880000,...,0.10,0.58,0.71,0.74,0.23,0.13,2,939.0,36130.730090,24756.871592


16216    28605.625
13383    22806.820
9477     28631.315
15886    23053.280
19947    15696.945
           ...    
31104     7346.390
34821    22569.470
20388    13193.310
36830    30735.880
31098    12452.920
Name: priv_pay_median, Length: 1330, dtype: float64

Unnamed: 0,priv_count,mcare_los,mcare_pay_median,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,...,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,mcare_count,group_encoded,CBSA_NAME_encoded,cluster,site,lat,lon
0,0.010101,0.199844,0.374187,0.029557,0.642857,0.428571,0.714286,0.012400,0.429348,0.496,...,0.670455,0.404018,0.226562,0.034170,0.541297,0.361730,2.0,1.0,40.693649,-89.588986
1,0.106782,0.301652,0.359438,0.118227,0.420000,0.480000,0.640000,0.061500,0.576427,0.426,...,0.184659,0.642857,0.578125,0.136679,0.629790,0.228669,2.0,1.0,26.307280,-80.128575
2,0.041847,0.192497,0.482886,0.142857,0.650000,0.100000,0.733333,0.214648,0.434783,0.352,...,0.545455,0.714286,0.125000,0.084920,0.827085,0.274243,2.0,1.0,42.810536,-83.079087
3,0.044733,0.196843,0.390113,0.029557,0.642857,0.428571,0.714286,0.012400,0.429348,0.496,...,0.670455,0.404018,0.226562,0.040556,0.541297,0.361730,2.0,1.0,40.693649,-89.588986
4,0.090909,0.000000,0.131567,0.280788,0.155172,0.155172,0.758621,0.387716,0.179348,0.488,...,0.363636,0.142857,0.875000,0.009747,0.060548,0.595151,2.0,0.0,32.707875,-96.920913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1325,0.724387,0.000000,0.083956,0.699507,0.828671,0.692308,0.643357,1.000000,0.391304,0.264,...,0.363636,0.642857,0.250000,0.130406,0.000000,0.653410,2.0,0.0,40.712780,-74.005950
1326,0.024531,0.000000,0.607648,0.261084,0.388889,0.055556,0.666667,0.229327,0.168478,0.696,...,0.000000,0.750000,0.375000,0.031929,0.821127,0.314091,2.0,0.0,33.973340,-117.455600
1327,0.014430,0.000000,0.134352,0.083744,0.583333,0.166667,0.777778,0.056482,0.070652,0.872,...,0.863636,0.000000,0.437500,0.008514,0.060548,0.245162,2.0,0.0,40.760779,-111.891047
1328,0.040404,0.000000,0.507231,0.241379,0.270000,0.120000,0.880000,0.112024,0.195652,0.792,...,0.727273,0.071429,0.625000,0.105198,0.849600,0.391532,2.0,0.0,30.508260,-97.678900


16216    28605.625
13383    22806.820
9477     28631.315
15886    23053.280
19947    15696.945
           ...    
31104     7346.390
34821    22569.470
20388    13193.310
36830    30735.880
31098    12452.920
Name: priv_pay_median, Length: 1330, dtype: float64

3.993993993993994
[10, 16, 8]


Unnamed: 0,site,priv_count,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,...,frac_disability,non_citizen,employment_rate,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,cluster,mcare_count,group_encoded,CBSA_NAME_encoded
7024,1,42,4.121607,27054.510,-86.580447,36.214401,38.0,0.210526,0.105263,0.710526,...,0.11,0.58,0.68,0.74,0.29,0.09,1,921.0,68703.495287,65845.784609
6628,1,46,3.667526,28876.405,-85.644749,42.869473,14.0,0.428571,0.071429,0.357143,...,0.11,0.50,0.67,0.75,0.33,0.05,1,388.0,68703.495287,63766.853504
29546,1,59,8.544554,12651.070,-83.079090,42.810540,30.0,0.433333,0.033333,0.733333,...,0.13,0.38,0.62,0.70,0.41,0.05,1,202.0,35192.470323,47460.663716
6456,1,48,2.414286,24873.490,-81.091203,32.080899,7.0,0.285714,0.142857,0.571429,...,,,,,,,1,210.0,68703.495287,64231.125179
7048,1,130,3.806612,30484.855,-111.841250,33.306160,74.0,0.229730,0.067568,0.702703,...,0.12,0.54,0.64,0.66,0.35,0.11,1,1210.0,68703.495287,62661.547520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6708,1,90,6.211849,36044.870,-74.005954,40.712776,29.0,0.655172,0.137931,0.827586,...,0.11,0.41,0.65,0.66,0.39,0.07,1,557.0,68703.495287,89047.912136
5260,1,41,3.255172,30387.710,-96.920913,32.707875,58.0,0.103448,0.051724,0.758621,...,0.10,0.59,0.69,0.66,0.25,0.17,1,145.0,89572.188078,84962.543497
6590,1,106,4.514693,34071.680,-104.845462,39.514285,37.0,0.405405,0.135135,0.756757,...,0.10,0.52,0.72,0.71,0.30,0.08,1,1123.0,68703.495287,87661.302044
29204,1,55,10.062940,18614.600,-80.133610,25.806050,39.0,0.410256,0.153846,0.871795,...,,,,,,,1,143.0,35192.470323,56260.243232


7024      69179.665
6628      64338.170
29546     34751.170
6456      50126.240
7048      88247.495
            ...    
6708     122839.720
5260      99013.590
6590      90832.130
29204     26613.980
6690      70677.605
Name: priv_pay_median, Length: 168, dtype: float64

Unnamed: 0,priv_count,mcare_los,mcare_pay_median,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,...,frac_priv_insurance,frac_mcare_insurance,frac_no_insurance,mcare_count,group_encoded,CBSA_NAME_encoded,cluster,site,lat,lon
0,0.022951,0.195611,0.414706,0.161616,0.315789,0.368421,0.633684,0.082832,0.253333,0.474747,...,0.80000,0.315789,0.375000,0.405113,0.616241,0.485346,1.0,1.0,36.214401,-86.580447
1,0.036066,0.143586,0.459483,0.040404,0.642857,0.250000,0.040000,0.035210,0.160000,0.808081,...,0.85000,0.526316,0.125000,0.143068,0.616241,0.446322,1.0,1.0,42.869473,-85.644749
2,0.078689,0.702355,0.060706,0.121212,0.650000,0.116667,0.672000,0.204314,0.666667,0.444444,...,0.60000,0.947368,0.125000,0.051622,0.000000,0.140235,1.0,1.0,42.810540,-83.079090
3,0.042623,0.000000,0.361102,0.005051,0.428571,0.500000,0.400000,0.000000,0.213333,0.393939,...,0.49375,0.618421,0.460938,0.055556,0.616241,0.455037,1.0,1.0,32.080899,-81.091203
4,0.311475,0.159521,0.499015,0.343434,0.344595,0.236486,0.620541,0.234323,0.306667,0.777778,...,0.40000,0.631579,0.500000,0.547198,0.616241,0.425574,1.0,1.0,33.306160,-111.841250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,0.180328,0.435093,0.635666,0.116162,0.982759,0.482759,0.830345,1.000000,0.560000,0.333333,...,0.40000,0.842105,0.250000,0.226155,0.616241,0.920879,1.0,1.0,40.712776,-74.005954
164,0.019672,0.096342,0.496627,0.262626,0.155172,0.181034,0.714483,0.379659,0.040000,0.616162,...,0.40000,0.105263,0.875000,0.023599,1.000000,0.844192,1.0,1.0,32.707875,-96.920913
165,0.232787,0.240647,0.587170,0.156566,0.608108,0.472973,0.711351,0.132374,0.266667,0.969697,...,0.65000,0.368421,0.312500,0.504425,0.616241,0.894851,1.0,1.0,39.514285,-104.845462
166,0.065574,0.876319,0.207274,0.166667,0.615385,0.538462,0.904615,0.115581,0.640000,0.486111,...,0.31250,0.625000,0.554688,0.022616,0.000000,0.305414,1.0,1.0,25.806050,-80.133610


7024      69179.665
6628      64338.170
29546     34751.170
6456      50126.240
7048      88247.495
            ...    
6708     122839.720
5260      99013.590
6590      90832.130
29204     26613.980
6690      70677.605
Name: priv_pay_median, Length: 168, dtype: float64

4.0


## Run XGBoost model

In [12]:
train_mapes = []
train_sizes = []
test_mapes = []
test_sizes = []

# Train test split
for idx in range(0,len(X_dev_list)):
    print(f"Index is: {idx}")
    
    # Parameterization
    mono = {'site': 1}

    param_grid = {
#         'booster':['gbtree','dart'],
        'colsample_bylevel':[1],
        'colsample_bytree':[1],
        'enable_categorical':[False],
        'gamma':[0],
        'gpu_id':[-1],
        'interaction_constraints':[''],
        'max_delta_step':[0],
        'min_child_weight':[1],
        'missing':[np.nan],
#         'n_estimators':[175,200,225,250],
        'n_estimators':[10,25,50,75,100,125],#175,225,250],
        'n_jobs':[8],
        'predictor':['auto'],
        'reg_alpha':[0],
        'scale_pos_weight':[1],
        'tree_method':['exact'],
        'validate_parameters':[1],
        'learning_rate':[1],
#         'max_depth':[10,15,20,25],
        'max_depth':[2,4,6],#,10,20,25],
        'num_parallel_tree':[250],
        'objective':['reg:squarederror'],
        'subsample':[0.8],
        'random_state':[RDM_SEED],
#         'reg_lambda':[0,0.1,0.15,0.2,0.25,0.3]
        'reg_lambda':[0,0.1,0.2],
#         'tree_method':['approx','hist']

    }
    if MONOTONE_MODEL:
        param_grid['monotone_constraints'] = [mono]

    # Create, run, and tune (if applicable) model
    xgb_param_tuning_model = xgb.XGBRFRegressor(
#                                                 n_estimators = 250,
#                                                 max_depth=25,
#                                                 reg_lambda=0,
                                                booster = 'gbtree',
        tree_method = 'hist'
                                               )
    
    xgb_mono_model = GridSearchCV(xgb_param_tuning_model, param_grid, scoring='neg_mean_absolute_percentage_error')
    xgb_mono_model.fit(X_dev_list[idx], y_dev_list[idx])
    
    # Output optimal params (if applicable)
    print(f"Best parameters (if grid search was applied): {xgb_mono_model.best_params_}")
    print(f"Val score: {xgb_mono_model.best_score_}")
#     print(f"Feature importances:")
#     import_df = pd.concat([pd.Series(X_dev_list[idx].columns,name="feature"), pd.Series(xgb_mono_model.best_estimator_.feature_importances_,name="importance")], axis=1)
#     display(import_df.sort_values(by="importance",ascending=False))

    
    # Predict on train and test data
    y_train_pred_xgb = xgb_mono_model.predict(X_dev_list[idx])
    y_test_pred_xgb = xgb_mono_model.predict(X_test_list[idx])

    # Store results
    train_sizes.append(len(X_dev_list[idx]))
    test_sizes.append(len(X_test_list[idx]))
    train_mapes.append(mean_absolute_percentage_error(y_true=y_dev_list[idx], y_pred=y_train_pred_xgb))
    test_mapes.append(mean_absolute_percentage_error(y_true=y_test_list[idx], y_pred=y_test_pred_xgb))
    

train_mapes = np.array(train_mapes)
train_sizes = np.array(train_sizes)
test_mapes = np.array(test_mapes)
test_sizes = np.array(test_sizes)

# Output results?
print(f"Random Forest with Threshold >{COUNT_THRESH} claims for training set:")
print(f"Train MAPEs: {train_mapes}")
print(f"Train sizes: {train_sizes}")
print(f"Test MAPEs: {test_mapes}")
print(f"Test sizes: {test_sizes}")
print(f"Total train MAPE: {((train_mapes * train_sizes) / (train_sizes.sum())).sum()}")
print(f"Total test MAPE: {((test_mapes * test_sizes) / (test_sizes.sum())).sum()}")

Index is: 0
Best parameters (if grid search was applied): {'colsample_bylevel': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 0, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 1, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': {'site': 1}, 'n_estimators': 125, 'n_jobs': 8, 'num_parallel_tree': 250, 'objective': 'reg:squarederror', 'predictor': 'auto', 'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 0.2, 'scale_pos_weight': 1, 'subsample': 0.8, 'tree_method': 'exact', 'validate_parameters': 1}
Val score: -0.24307053185805377
Index is: 1
Best parameters (if grid search was applied): {'colsample_bylevel': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 0, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 1, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': {'site': 1}, 'n_estimators': 50, 'n_jobs': 8, 'num_parallel_tree': 250,