# Tree-based Models, with Clusters

## Library Imports

In [1]:
# Necessary code to import our helper functions
import sys
sys.path.append("../..")

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Library imports
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from matplotlib import pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from Common_Functions import data_split, add_unique_identifier, data_cleaning, hospital_data_agg, optimal_k

In [3]:
# Method from Shruti's code
def standardize_data(train_data, val_data):
    train_temp = train_data.drop(columns = ['site','cluster','lat','lon'])
    val_temp = val_data.drop(columns = ['site','cluster','lat','lon'])
    
    scaler = MinMaxScaler()
    
    train_data_scaled = scaler.fit_transform(train_temp)
    train_data_scaled = pd.DataFrame(train_data_scaled, columns = train_temp.columns)
    train_data_scaled['cluster'] = train_data['cluster'].to_list()
    train_data_scaled['site'] = train_data['site'].to_list()
    train_data_scaled['lat'] = train_data['lat'].to_list()
    train_data_scaled['lon'] = train_data['lon'].to_list()
    
    val_data_scaled = scaler.transform(val_temp)
    val_data_scaled = pd.DataFrame(val_data_scaled, columns = val_temp.columns)
    val_data_scaled['cluster'] = val_data['cluster'].to_list()
    val_data_scaled['site'] = val_data['site'].to_list()
    val_data_scaled['lat'] = val_data['lat'].to_list()
    val_data_scaled['lon'] = val_data['lon'].to_list()
    
    return train_data_scaled, val_data_scaled

In [4]:
# Method slightly modified from Shruti's code
def impute_knn(train_data, val_data, optimal_k):
    train_data_scaled, val_data_scaled = standardize_data(train_data, val_data)

    knn = KNNImputer(n_neighbors = optimal_k)

    # imputing values
    train_data_scaled[list(train_data_scaled.columns)] = knn.fit_transform(train_data_scaled)
    val_data_scaled[list(val_data_scaled.columns)] = knn.transform(val_data_scaled)
    
    return train_data_scaled, val_data_scaled

## Data Import

In [5]:
data = pd.read_csv("../../Feature Matrix/processed_data.csv")
data.dropna(subset = ['mcare_count'], inplace = True)
data.drop(columns = ['year'], inplace = True)
data.replace([np.inf, -np.inf], 0, inplace=True)

## Model Parameters

In [6]:
COUNT_THRESH = 34
RDM_SEED = 123
TRAIN_TEST_PROPORTION = 0.8
MONOTONE_MODEL = True

## Data Transformation

### One-Hot Categorical Encoding and Dropping NAs

In [7]:
data = data_cleaning(data, dropna = False, one_hot = False)

### Data Split

In [8]:
working_set, predict_set = data_split(data, count_thresh = COUNT_THRESH)

In [9]:
model_data = working_set
model_data.drop(columns = ['num_races',
                            'household_size',
                            'frac_married',
                            'frac_school',
                            'frac_college',
                            'frac_educated',
                            'annual_births',
                            'frac_veteran',
                            'frac_disability',
                            'non_citizen',
                            'employment_rate',
                            'frac_priv_insurance',
                            'frac_mcare_insurance',
                            'frac_no_insurance',
                           'priv_count',
                           'mcare_count',
                           'PctLargeHospital',
                           'Hospitals',
                           'State_Median_Household_Income',
                           'State_Poverty_Percent_All_Ages'], inplace = True)

In [10]:
display(model_data)

Unnamed: 0,site,group,priv_pay_median,mcare_los,mcare_pay_median,CBSA_NAME,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,State_Poverty_Percent_All_Ages,State_Median_Household_Income,income_pc,cluster
40,1,breast reconstruction,24289.900,2.549296,8794.190,"Dallas-Fort Worth-Arlington, TX",-96.920913,32.707875,114.0,0.105263,0.052632,0.807018,7759615.0,35.6,97.9,14.9,60632.0,57562.0,0
70,1,breast reconstruction,21408.000,3.543210,10395.160,"Houston-The Woodlands-Sugar Land, TX",-95.622552,29.598443,181.0,0.088398,0.060773,0.823204,7206841.0,35.3,99.0,14.9,60632.0,57875.0,0
112,1,breast reconstruction,29757.100,3.918699,14174.100,"New York-Newark-Jersey City, NY-NJ-PA",-74.005954,40.712776,143.0,0.552448,0.230769,0.643357,19768458.0,39.5,95.1,9.5,81777.0,75166.0,0
219,1,breast reconstruction,25240.905,3.241935,10144.445,"Dallas-Fort Worth-Arlington, TX",-96.920913,32.707875,114.0,0.105263,0.052632,0.807018,7759615.0,35.6,97.9,13.6,64044.0,59409.0,0
275,1,breast reconstruction,34963.900,3.262295,14008.190,"New York-Newark-Jersey City, NY-NJ-PA",-74.005954,40.712776,143.0,0.552448,0.230769,0.643357,19768458.0,39.5,95.1,9.1,85786.0,77911.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44974,0,ant_cerv_fusion,17137.490,,6934.483,"Atlanta-Sandy Springs-Alpharetta, GA",-84.294090,34.075380,80.0,0.162500,0.050000,0.725000,6144970.0,37.2,93.8,14.5,58634.0,53354.0,2
44991,0,ant_cerv_fusion,11807.250,,6905.252,"Charlotte-Concord-Gastonia, NC-SC",-80.721440,35.122320,26.0,0.115385,0.076923,0.576923,2701046.0,38.1,94.8,14.1,53922.0,52232.0,2
45038,0,ant_cerv_fusion,18421.250,,6724.500,"Memphis, TN-MS-AR",-89.850500,35.038720,28.0,0.357143,0.142857,0.571429,1336438.0,36.6,91.8,16.8,47094.0,46481.0,2
45100,0,ant_cerv_fusion,13926.250,,7334.787,"Atlanta-Sandy Springs-Alpharetta, GA",-84.294090,34.075380,80.0,0.162500,0.050000,0.725000,6144970.0,37.2,93.8,13.5,61950.0,55668.0,2


## Split Model Data by Cluster

In [11]:
X_input = model_data.drop(columns=["priv_pay_median"])
y_input = model_data["priv_pay_median"]

X_list = []
y_list = []

X_dev_list = []
y_dev_list = []
X_test_list = []
y_test_list = []
optimal_k_list = []

for cluster_label in model_data["cluster"].unique():
    
    X_clu = X_input[X_input["cluster"] == cluster_label]
    y_clu = y_input[X_input["cluster"] == cluster_label]
    
    X_dev_clu, X_test_clu, y_dev_clu, y_test_clu = train_test_split(X_clu,
                                                                    y_clu,
                                                                    train_size = TRAIN_TEST_PROPORTION,
                                                                    random_state = RDM_SEED)
    
    # Target encoding group variable
    te_group = TargetEncoder(min_samples_leaf=1)
    X_dev_clu['group_encoded'] = te_group.fit_transform(X_dev_clu['group'],y_dev_clu)
    X_dev_clu.drop(columns = 'group', inplace = True)
    X_test_clu['group_encoded'] = te_group.transform(X_test_clu['group'])
    X_test_clu.drop(columns = 'group', inplace = True)

    # Target encoding CBSA Name
    te_CBSA_NAME = TargetEncoder(min_samples_leaf=1)
    X_dev_clu['CBSA_NAME_encoded'] = te_CBSA_NAME.fit_transform(X_dev_clu['CBSA_NAME'],y_dev_clu)
    X_dev_clu.drop(columns = 'CBSA_NAME', inplace = True)
    X_test_clu['CBSA_NAME_encoded'] = te_CBSA_NAME.transform(X_test_clu['CBSA_NAME'])
    X_test_clu.drop(columns = 'CBSA_NAME', inplace = True)
    
    # KNN Imputation
    knn_data = X_dev_clu.copy()
    knn_data['priv_pay_median'] = y_dev_clu

    optimal_k_list.append(optimal_k(knn_data))
    print(optimal_k_list)
    
    display(X_dev_clu)
    display(y_dev_clu)
    
    X_dev_clu, X_test_clu = impute_knn(X_dev_clu, X_test_clu, optimal_k_list[-1])
    display(X_dev_clu)
    display(y_dev_clu)
    
    X_dev_list.append(X_dev_clu)
    y_dev_list.append(y_dev_clu)
    X_test_list.append(X_test_clu)
    y_test_list.append(y_test_clu)
    
    print(X_dev_clu.shape[0] / X_test_clu.shape[0])

[16]


Unnamed: 0,site,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,State_Poverty_Percent_All_Ages,State_Median_Household_Income,income_pc,cluster,group_encoded,CBSA_NAME_encoded
28279,0,0.000000,3304.18,-97.516430,35.467560,45.0,0.244444,0.088889,0.466667,1441647.0,36.1,97.4,15.5,51914.0,48300.0,0,8863.333441,13553.604373
29046,0,0.000000,3885.29,-77.368320,39.134970,46.0,0.304348,0.065217,0.739130,6358652.0,37.9,96.6,9.2,79154.0,76771.0,0,8863.333441,10391.683833
12238,0,0.000000,4083.44,-86.580447,36.214401,38.0,0.210526,0.105263,0.710526,2013506.0,37.2,96.5,15.2,52366.0,59105.0,0,11400.881828,11288.210417
20946,0,0.000000,4131.70,-77.436048,37.540725,18.0,0.277778,0.111111,0.555556,1317525.0,39.2,93.9,10.7,72600.0,56101.0,0,6894.100586,12391.668148
23044,0,0.000000,3623.27,-93.298280,44.840798,51.0,0.333333,0.156863,0.470588,3690512.0,38.0,99.5,9.6,70295.0,62695.0,0,7981.427595,11729.762414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21326,0,0.000000,8952.18,-76.641271,39.372594,34.0,0.529412,0.058824,0.558824,2838327.0,38.9,94.0,9.0,88589.0,66695.0,0,6894.100586,10154.722407
28977,0,0.000000,4531.95,-117.455600,33.973340,54.0,0.259259,0.018519,0.666667,4653105.0,35.4,100.5,11.5,83001.0,45365.0,0,8863.333441,10446.533183
25271,1,2.333333,9176.30,-97.516428,35.467560,45.0,0.244444,0.088889,0.466667,1441647.0,36.1,97.4,15.5,51914.0,48300.0,0,21915.405847,13553.604373
21119,0,0.000000,4397.70,-96.920913,32.707875,58.0,0.103448,0.051724,0.758621,7759615.0,35.6,97.9,13.6,64044.0,59409.0,0,6894.100586,14310.724485


28279     8775.520
29046     7735.450
12238     6499.730
20946     6689.040
23044     7592.500
           ...    
21326    13071.220
28977     7360.065
25271    29200.000
21119    19701.670
22294     4035.960
Name: priv_pay_median, Length: 2243, dtype: float64

Unnamed: 0,mcare_los,mcare_pay_median,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,State_Poverty_Percent_All_Ages,State_Median_Household_Income,income_pc,group_encoded,CBSA_NAME_encoded,cluster,site,lat,lon
0,0.000000,0.165275,0.216749,0.244444,0.266667,0.466667,0.068996,0.380597,0.388889,0.656,0.163607,0.225056,0.310483,0.544544,0.0,0.0,35.467560,-97.516430
1,0.000000,0.194343,0.221675,0.304348,0.195652,0.739130,0.318781,0.447761,0.344444,0.152,0.784830,0.525980,0.310483,0.343607,0.0,0.0,39.134970,-77.368320
2,0.000000,0.204254,0.182266,0.210526,0.315789,0.710526,0.098047,0.421642,0.338889,0.632,0.173915,0.339259,0.407036,0.400580,0.0,0.0,36.214401,-86.580447
3,0.000000,0.206668,0.083744,0.277778,0.333333,0.555556,0.062691,0.496269,0.194444,0.272,0.635362,0.307509,0.235554,0.470704,0.0,0.0,37.540725,-77.436048
4,0.000000,0.181236,0.246305,0.333333,0.470588,0.470588,0.183239,0.451493,0.505556,0.184,0.582796,0.377204,0.276927,0.428641,0.0,0.0,44.840798,-93.298280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2238,0.000000,0.447789,0.162562,0.529412,0.176471,0.558824,0.139948,0.485075,0.200000,0.136,1.000000,0.419482,0.235554,0.328548,0.0,0.0,39.372594,-76.641271
2239,0.000000,0.226689,0.261084,0.259259,0.055556,0.666667,0.232139,0.354478,0.561111,0.336,0.872563,0.194035,0.310483,0.347093,0.0,0.0,33.973340,-117.455600
2240,0.337349,0.458999,0.216749,0.244444,0.266667,0.466667,0.068996,0.380597,0.388889,0.656,0.163607,0.225056,0.807109,0.544544,0.0,1.0,35.467560,-97.516428
2241,0.000000,0.219973,0.280788,0.103448,0.155172,0.758621,0.389950,0.361940,0.416667,0.504,0.440238,0.342472,0.235554,0.592658,0.0,0.0,32.707875,-96.920913


28279     8775.520
29046     7735.450
12238     6499.730
20946     6689.040
23044     7592.500
           ...    
21326    13071.220
28977     7360.065
25271    29200.000
21119    19701.670
22294     4035.960
Name: priv_pay_median, Length: 2243, dtype: float64

3.998217468805704
[16, 2]


Unnamed: 0,site,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,State_Poverty_Percent_All_Ages,State_Median_Household_Income,income_pc,cluster,group_encoded,CBSA_NAME_encoded
15905,1,2.030801,10505.97,-77.436048,37.540725,18.0,0.277778,0.111111,0.555556,1317525.0,39.2,93.9,10.7,72600.0,56101.0,2,26517.856809,23169.997957
35165,0,0.000000,19313.93,-122.195900,47.607680,38.0,0.368421,0.026316,0.421053,,,,9.8,78674.0,,2,35280.013531,32718.468328
17230,0,0.000000,10132.42,-80.721442,35.122317,26.0,0.115385,0.076923,0.576923,2701046.0,38.1,94.8,12.9,59616.0,56682.0,2,26517.856809,33990.282600
27146,1,8.025641,18607.01,-117.455639,33.973343,54.0,0.259259,0.018519,0.666667,4653105.0,35.4,100.5,11.8,80423.0,40988.0,2,29304.413101,22763.947997
35357,0,0.000000,18122.40,-94.578570,39.099730,55.0,0.327273,0.072727,0.581818,2199544.0,37.9,97.9,10.6,63214.0,58057.0,2,35280.013531,21015.733281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31083,0,0.000000,2113.09,-89.850500,35.038720,28.0,0.357143,0.142857,0.571429,1336438.0,36.6,91.8,16.8,47094.0,46481.0,2,9429.352539,17434.585019
35121,0,0.000000,18089.26,-111.841300,33.306160,74.0,0.229730,0.067568,0.702703,4946145.0,37.6,99.5,13.5,62027.0,48082.0,2,35280.013531,20850.922917
35474,0,0.000000,20976.79,-122.195900,47.607680,38.0,0.368421,0.026316,0.421053,,,,9.5,80319.0,,2,35280.013531,32718.468328
35400,0,0.000000,19802.65,-75.165240,39.952630,31.0,0.290323,0.032258,0.774193,6228601.0,39.3,94.8,10.9,64898.0,69705.0,2,35280.013531,26813.206290


15905    31901.00
35165    50517.94
17230    30589.53
27146    33943.99
35357    48426.48
           ...   
31083     5098.88
35121    23340.30
35474    52450.20
35400    23183.31
37315    15439.72
Name: priv_pay_median, Length: 1330, dtype: float64

Unnamed: 0,mcare_los,mcare_pay_median,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,State_Poverty_Percent_All_Ages,State_Median_Household_Income,income_pc,group_encoded,CBSA_NAME_encoded,cluster,site,lat,lon
0,0.207956,0.321493,0.083744,0.416667,0.333333,0.555556,0.059259,0.375000,0.168,0.272,0.635362,0.261635,0.570249,0.321325,2.0,1.0,37.540725,-77.436048
1,0.000000,0.591025,0.182266,0.552632,0.078947,0.421053,0.120094,0.369565,0.624,0.200,0.773883,0.346467,0.862646,0.579485,2.0,0.0,47.607680,-122.195900
2,0.000000,0.310062,0.123153,0.173077,0.230769,0.576923,0.129799,0.315217,0.240,0.448,0.339255,0.269215,0.570249,0.613870,2.0,0.0,35.122317,-80.721442
3,0.821832,0.569393,0.261084,0.388889,0.055556,0.666667,0.229327,0.168478,0.696,0.360,0.813770,0.064453,0.663238,0.310347,2.0,1.0,33.973343,-117.455639
4,0.000000,0.554563,0.266010,0.490909,0.218182,0.581818,0.104229,0.304348,0.488,0.264,0.421309,0.287155,0.862646,0.263081,2.0,0.0,39.099730,-94.578570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1325,0.000000,0.064663,0.133005,0.535714,0.428571,0.571429,0.060223,0.233696,0.000,0.760,0.053684,0.136121,0.000000,0.166259,2.0,0.0,35.038720,-89.850500
1326,0.000000,0.553549,0.359606,0.344595,0.202703,0.702703,0.244268,0.288043,0.616,0.496,0.394239,0.157010,0.862646,0.258625,2.0,0.0,33.306160,-111.841300
1327,0.000000,0.641910,0.182266,0.552632,0.078947,0.421053,0.120094,0.369565,0.624,0.176,0.811398,0.346467,0.862646,0.579485,2.0,0.0,47.607680,-122.195900
1328,0.000000,0.605980,0.147783,0.435484,0.096774,0.774193,0.309655,0.380435,0.240,0.288,0.459714,0.439128,0.862646,0.419826,2.0,0.0,39.952630,-75.165240


15905    31901.00
35165    50517.94
17230    30589.53
27146    33943.99
35357    48426.48
           ...   
31083     5098.88
35121    23340.30
35474    52450.20
35400    23183.31
37315    15439.72
Name: priv_pay_median, Length: 1330, dtype: float64

3.993993993993994
[16, 2, 14]


Unnamed: 0,site,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,State_Poverty_Percent_All_Ages,State_Median_Household_Income,income_pc,cluster,group_encoded,CBSA_NAME_encoded
7083,1,4.311111,25148.94,-98.493067,29.424921,56.0,0.178571,0.071429,0.785714,2601788.0,35.7,98.9,13.4,66048.0,50022.0,1,66897.314722,68477.311905
6817,1,4.377902,29665.28,-77.368316,39.134974,46.0,0.304348,0.065217,0.739130,6358652.0,37.9,96.6,9.9,76471.0,73059.0,1,66897.314722,66386.050468
29109,1,7.739464,13183.08,-80.721440,35.122320,26.0,0.115385,0.076923,0.576923,2701046.0,38.1,94.8,14.1,53922.0,52232.0,1,35408.531923,64010.257632
7131,1,3.769767,30769.68,-83.079087,42.810536,30.0,0.433333,0.033333,0.733333,4365205.0,40.3,96.2,12.6,61352.0,58356.0,1,66897.314722,49584.403792
29311,1,8.603825,15019.04,-77.368320,39.134970,46.0,0.304348,0.065217,0.739130,6358652.0,37.9,96.6,10.7,72600.0,71615.0,1,35408.531923,66386.050468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5648,1,6.035629,25427.27,-74.005954,40.712776,143.0,0.552448,0.230769,0.643357,19768458.0,39.5,95.1,9.5,81777.0,75166.0,1,73789.525185,88389.000253
6650,1,3.685921,26604.20,-81.655651,30.332184,22.0,0.227273,0.136364,0.909091,1637666.0,39.5,95.6,12.7,59198.0,52184.0,1,66897.314722,70880.852846
6447,1,4.047431,22103.60,-98.493067,29.424921,56.0,0.178571,0.071429,0.785714,2601788.0,35.7,98.9,14.9,60632.0,48192.0,1,66897.314722,68477.311905
6816,1,3.662609,30915.10,-83.079087,42.810536,30.0,0.433333,0.033333,0.733333,4365205.0,40.3,96.2,12.9,59522.0,54213.0,1,66897.314722,49584.403792


7083     73633.900
6817     73606.685
29109    33876.090
7131     64336.070
29311    36645.570
           ...    
5648     64024.590
6650     73466.860
6447     66670.000
6816     54020.910
6719     54390.000
Name: priv_pay_median, Length: 168, dtype: float64

Unnamed: 0,mcare_los,mcare_pay_median,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,State_Poverty_Percent_All_Ages,State_Median_Household_Income,income_pc,group_encoded,CBSA_NAME_encoded,cluster,site,lat,lon
0,0.247995,0.382339,0.252525,0.267857,0.198020,0.760000,0.113221,0.053333,0.651376,0.443478,0.485940,0.301984,0.58142,0.679991,1.0,1.0,29.424921,-98.493067
1,0.256727,0.497705,0.202020,0.456522,0.174774,0.681739,0.307289,0.346667,0.440367,0.139130,0.723643,0.799823,0.58142,0.646382,1.0,1.0,39.134974,-77.368316
2,0.696224,0.076683,0.101010,0.173077,0.218583,0.409231,0.118348,0.373333,0.275229,0.504348,0.209400,0.349743,0.00000,0.608199,1.0,1.0,35.122320,-80.721440
3,0.177218,0.525916,0.121212,0.650000,0.055446,0.672000,0.204314,0.666667,0.403670,0.373913,0.378846,0.482085,0.58142,0.376355,1.0,1.0,42.810536,-83.079087
4,0.809232,0.123581,0.202020,0.456522,0.174774,0.681739,0.307289,0.346667,0.440367,0.208696,0.635362,0.768617,0.00000,0.646382,1.0,1.0,39.134970,-77.368320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,0.473462,0.389449,0.691919,0.828671,0.794364,0.520839,1.000000,0.560000,0.302752,0.104348,0.844649,0.845356,0.70868,1.000000,1.0,1.0,40.712776,-74.005954
164,0.166256,0.419513,0.080808,0.340909,0.441044,0.967273,0.063417,0.560000,0.348624,0.382609,0.329722,0.348706,0.58142,0.718619,1.0,1.0,30.332184,-81.655651
165,0.213521,0.304549,0.252525,0.267857,0.198020,0.760000,0.113221,0.053333,0.651376,0.573913,0.362426,0.262437,0.58142,0.679991,1.0,1.0,29.424921,-98.493067
166,0.163208,0.529630,0.121212,0.650000,0.055446,0.672000,0.204314,0.666667,0.403670,0.400000,0.337111,0.392553,0.58142,0.376355,1.0,1.0,42.810536,-83.079087


7083     73633.900
6817     73606.685
29109    33876.090
7131     64336.070
29311    36645.570
           ...    
5648     64024.590
6650     73466.860
6447     66670.000
6816     54020.910
6719     54390.000
Name: priv_pay_median, Length: 168, dtype: float64

4.0


## Run XGBoost model

In [12]:
train_mapes = []
train_sizes = []
test_mapes = []
test_sizes = []

# Train test split
for idx in range(0,len(X_dev_list)):
    print(f"Index is: {idx}")
    
    # Parameterization
    mono = {'site': 1}

    param_grid = {
#         'booster':['gbtree','dart'],
        'colsample_bylevel':[1],
        'colsample_bytree':[1],
        'enable_categorical':[False],
        'gamma':[0],
        'gpu_id':[-1],
        'interaction_constraints':[''],
        'max_delta_step':[0],
        'min_child_weight':[1],
        'missing':[np.nan],
#         'n_estimators':[175,200,225,250],
#         'n_estimators':[175,225,250],
        'n_jobs':[8],
        'predictor':['auto'],
        'reg_alpha':[0],
        'scale_pos_weight':[1],
        'tree_method':['exact'],
        'validate_parameters':[1],
        'learning_rate':[1],
#         'max_depth':[10,15,20,25],
#         'max_depth':[10,20,25],
        'num_parallel_tree':[250],
        'objective':['reg:squarederror'],
        'subsample':[0.8],
        'random_state':[RDM_SEED],
#         'reg_lambda':[0,0.1,0.15,0.2,0.25,0.3]
#         'reg_lambda':[0,0.1,0.2],
#         'tree_method':['approx','hist']

    }
    if MONOTONE_MODEL:
        param_grid['monotone_constraints'] = [mono]

    # Create, run, and tune (if applicable) model
    xgb_param_tuning_model = xgb.XGBRFRegressor(
                                                n_estimators = 250,
                                                max_depth=25,
                                                reg_lambda=0,
                                                booster = 'gbtree',
        tree_method = 'hist'
                                               )
    
    xgb_mono_model = GridSearchCV(xgb_param_tuning_model, param_grid, scoring='neg_mean_absolute_percentage_error')
    xgb_mono_model.fit(X_dev_list[idx], y_dev_list[idx])
    
    # Output optimal params (if applicable)
    print(f"Best parameters (if grid search was applied): {xgb_mono_model.best_params_}")
    print(f"Feature importances:")
    import_df = pd.concat([pd.Series(X_dev_list[idx].columns,name="feature"), pd.Series(xgb_mono_model.best_estimator_.feature_importances_,name="importance")], axis=1)
    display(import_df.sort_values(by="importance",ascending=False))

    
    # Predict on train and test data
    y_train_pred_xgb = xgb_mono_model.predict(X_dev_list[idx])
    y_test_pred_xgb = xgb_mono_model.predict(X_test_list[idx])

    # Store results
    train_sizes.append(len(X_dev_list[idx]))
    test_sizes.append(len(X_test_list[idx]))
    train_mapes.append(mean_absolute_percentage_error(y_true=y_dev_list[idx], y_pred=y_train_pred_xgb))
    test_mapes.append(mean_absolute_percentage_error(y_true=y_test_list[idx], y_pred=y_test_pred_xgb))
    

train_mapes = np.array(train_mapes)
train_sizes = np.array(train_sizes)
test_mapes = np.array(test_mapes)
test_sizes = np.array(test_sizes)

# Output results?
print(f"Random Forest with Threshold >{COUNT_THRESH} claims for training set:")
print(f"Train MAPEs: {train_mapes}")
print(f"Train sizes: {train_sizes}")
print(f"Test MAPEs: {test_mapes}")
print(f"Test sizes: {test_sizes}")
print(f"Total train MAPE: {((train_mapes * train_sizes) / (train_sizes.sum())).sum()}")
print(f"Total test MAPE: {((test_mapes * test_sizes) / (test_sizes.sum())).sum()}")

Index is: 0
Best parameters (if grid search was applied): {'colsample_bylevel': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 0, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 1, 'max_delta_step': 0, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': {'site': 1}, 'n_jobs': 8, 'num_parallel_tree': 250, 'objective': 'reg:squarederror', 'predictor': 'auto', 'random_state': 123, 'reg_alpha': 0, 'scale_pos_weight': 1, 'subsample': 0.8, 'tree_method': 'exact', 'validate_parameters': 1}
Feature importances:


Unnamed: 0,feature,importance
15,site,0.548739
13,CBSA_NAME_encoded,0.277373
12,group_encoded,0.035035
17,lon,0.026276
6,total_population,0.019983
16,lat,0.014089
1,mcare_pay_median,0.011567
8,sex_ratio,0.011231
7,median_age,0.010608
0,mcare_los,0.010235


Index is: 1
Best parameters (if grid search was applied): {'colsample_bylevel': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 0, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 1, 'max_delta_step': 0, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': {'site': 1}, 'n_jobs': 8, 'num_parallel_tree': 250, 'objective': 'reg:squarederror', 'predictor': 'auto', 'random_state': 123, 'reg_alpha': 0, 'scale_pos_weight': 1, 'subsample': 0.8, 'tree_method': 'exact', 'validate_parameters': 1}
Feature importances:


Unnamed: 0,feature,importance
13,CBSA_NAME_encoded,0.479143
15,site,0.1432
1,mcare_pay_median,0.051157
6,total_population,0.051079
16,lat,0.04412
12,group_encoded,0.035806
5,PctPrivate,0.031864
17,lon,0.026013
8,sex_ratio,0.025965
4,PctLargeHospital,0.021449


Index is: 2
Best parameters (if grid search was applied): {'colsample_bylevel': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 0, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 1, 'max_delta_step': 0, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': {'site': 1}, 'n_jobs': 8, 'num_parallel_tree': 250, 'objective': 'reg:squarederror', 'predictor': 'auto', 'random_state': 123, 'reg_alpha': 0, 'scale_pos_weight': 1, 'subsample': 0.8, 'tree_method': 'exact', 'validate_parameters': 1}
Feature importances:


Unnamed: 0,feature,importance
13,CBSA_NAME_encoded,0.365903
6,total_population,0.15595
12,group_encoded,0.095545
1,mcare_pay_median,0.071324
5,PctPrivate,0.054627
3,PctTeaching,0.046135
16,lat,0.044022
17,lon,0.035502
8,sex_ratio,0.033118
7,median_age,0.021109


Random Forest with Threshold >34 claims for training set:
Train MAPEs: [0.03297291 0.03055936 0.02433861]
Train sizes: [2243 1330  168]
Test MAPEs: [0.29277351 0.14776086 0.13259656]
Test sizes: [561 333  42]
Total train MAPE: 0.03172710066767522
Total test MAPE: 0.2339950422562902
