# Tree-based Models, with Clusters

## Library Imports

In [1]:
# Necessary code to import our helper functions
import sys
sys.path.append("../..")

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Library imports
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from matplotlib import pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from Common_Functions import data_split, add_unique_identifier, data_cleaning, hospital_data_agg, optimal_k

In [3]:
# Method from Shruti's code
def standardize_data(train_data, val_data):
    train_temp = train_data.drop(columns = ['site','cluster','lat','lon'])
    val_temp = val_data.drop(columns = ['site','cluster','lat','lon'])
    
    scaler = MinMaxScaler()
    
    train_data_scaled = scaler.fit_transform(train_temp)
    train_data_scaled = pd.DataFrame(train_data_scaled, columns = train_temp.columns)
    train_data_scaled['cluster'] = train_data['cluster'].to_list()
    train_data_scaled['site'] = train_data['site'].to_list()
    train_data_scaled['lat'] = train_data['lat'].to_list()
    train_data_scaled['lon'] = train_data['lon'].to_list()
    
    val_data_scaled = scaler.transform(val_temp)
    val_data_scaled = pd.DataFrame(val_data_scaled, columns = val_temp.columns)
    val_data_scaled['cluster'] = val_data['cluster'].to_list()
    val_data_scaled['site'] = val_data['site'].to_list()
    val_data_scaled['lat'] = val_data['lat'].to_list()
    val_data_scaled['lon'] = val_data['lon'].to_list()
    
    return train_data_scaled, val_data_scaled

In [4]:
# Method slightly modified from Shruti's code
def impute_knn(train_data, val_data, optimal_k):
    train_data_scaled, val_data_scaled = standardize_data(train_data, val_data)

    knn = KNNImputer(n_neighbors = optimal_k)

    # imputing values
    train_data_scaled[list(train_data_scaled.columns)] = knn.fit_transform(train_data_scaled)
    val_data_scaled[list(val_data_scaled.columns)] = knn.transform(val_data_scaled)
    
    return train_data_scaled, val_data_scaled

## Data Import

In [5]:
data = pd.read_csv("../../Feature Matrix/processed_data.csv")
data.dropna(subset = ['mcare_count'], inplace = True)
data.drop(columns = ['year'], inplace = True)
data.replace([np.inf, -np.inf], 0, inplace=True)

## Model Parameters

In [6]:
COUNT_THRESH = 34
RDM_SEED = 123
TRAIN_TEST_PROPORTION = 0.8
MONOTONE_MODEL = True

## Data Transformation

### One-Hot Categorical Encoding and Dropping NAs

In [7]:
data = data_cleaning(data, dropna = False, one_hot = False)

### Data Split

In [8]:
working_set, predict_set = data_split(data, count_thresh = COUNT_THRESH)

In [9]:
model_data = working_set

In [10]:
CBSAs = model_data['CBSA_NAME']
model_data.drop('CBSA_NAME',axis = 1,inplace=True)
model_data = pd.get_dummies(model_data)
model_data['CBSA_NAME'] = CBSAs

In [11]:
display(model_data)

Unnamed: 0,site,priv_count,priv_pay_median,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,...,group_radius/ulna internal fixation,group_robotic_assisted_surgery,group_rtc_slap_bank,group_septoplasty,group_tha,group_thoracic,group_tka,group_tpa,group_tsa,CBSA_NAME
40,1,63,24289.900,2.549296,8794.190,-96.920913,32.707875,114.0,0.105263,0.052632,...,0,0,0,0,0,0,0,0,0,"Dallas-Fort Worth-Arlington, TX"
70,1,51,21408.000,3.543210,10395.160,-95.622552,29.598443,181.0,0.088398,0.060773,...,0,0,0,0,0,0,0,0,0,"Houston-The Woodlands-Sugar Land, TX"
112,1,64,29757.100,3.918699,14174.100,-74.005954,40.712776,143.0,0.552448,0.230769,...,0,0,0,0,0,0,0,0,0,"New York-Newark-Jersey City, NY-NJ-PA"
219,1,66,25240.905,3.241935,10144.445,-96.920913,32.707875,114.0,0.105263,0.052632,...,0,0,0,0,0,0,0,0,0,"Dallas-Fort Worth-Arlington, TX"
275,1,45,34963.900,3.262295,14008.190,-74.005954,40.712776,143.0,0.552448,0.230769,...,0,0,0,0,0,0,0,0,0,"New York-Newark-Jersey City, NY-NJ-PA"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44974,0,54,17137.490,,6934.483,-84.294090,34.075380,80.0,0.162500,0.050000,...,0,0,0,0,0,0,0,0,0,"Atlanta-Sandy Springs-Alpharetta, GA"
44991,0,39,11807.250,,6905.252,-80.721440,35.122320,26.0,0.115385,0.076923,...,0,0,0,0,0,0,0,0,0,"Charlotte-Concord-Gastonia, NC-SC"
45038,0,35,18421.250,,6724.500,-89.850500,35.038720,28.0,0.357143,0.142857,...,0,0,0,0,0,0,0,0,0,"Memphis, TN-MS-AR"
45100,0,51,13926.250,,7334.787,-84.294090,34.075380,80.0,0.162500,0.050000,...,0,0,0,0,0,0,0,0,0,"Atlanta-Sandy Springs-Alpharetta, GA"


## Split Model Data by Cluster

In [12]:
X_input = model_data.drop(columns=["priv_pay_median"])
y_input = model_data["priv_pay_median"]

X_list = []
y_list = []

X_dev_list = []
y_dev_list = []
X_test_list = []
y_test_list = []
optimal_k_list = []

for cluster_label in model_data["cluster"].unique():
    
    X_clu = X_input[X_input["cluster"] == cluster_label]
    y_clu = y_input[X_input["cluster"] == cluster_label]
    
    X_dev_clu, X_test_clu, y_dev_clu, y_test_clu = train_test_split(X_clu,
                                                                    y_clu,
                                                                    train_size = TRAIN_TEST_PROPORTION,
                                                                    random_state = RDM_SEED)
    
#     # Target encoding group variable
#     te_group = TargetEncoder(min_samples_leaf=1)
#     X_dev_clu['group_encoded'] = te_group.fit_transform(X_dev_clu['group'],y_dev_clu)
#     X_dev_clu.drop(columns = 'group', inplace = True)
#     X_test_clu['group_encoded'] = te_group.transform(X_test_clu['group'])
#     X_test_clu.drop(columns = 'group', inplace = True)

    # Target encoding CBSA Name
    te_CBSA_NAME = TargetEncoder(min_samples_leaf=1)
    X_dev_clu['CBSA_NAME_encoded'] = te_CBSA_NAME.fit_transform(X_dev_clu['CBSA_NAME'],y_dev_clu)
    X_dev_clu.drop(columns = 'CBSA_NAME', inplace = True)
    X_test_clu['CBSA_NAME_encoded'] = te_CBSA_NAME.transform(X_test_clu['CBSA_NAME'])
    X_test_clu.drop(columns = 'CBSA_NAME', inplace = True)
    
    # KNN Imputation
    knn_data = X_dev_clu.copy()
    knn_data['priv_pay_median'] = y_dev_clu

    optimal_k_list.append(optimal_k(knn_data))
    print(optimal_k_list)
    
    display(X_dev_clu)
    display(y_dev_clu)
    
    X_dev_clu, X_test_clu = impute_knn(X_dev_clu, X_test_clu, optimal_k_list[-1])
    display(X_dev_clu)
    display(y_dev_clu)
    
    X_dev_list.append(X_dev_clu)
    y_dev_list.append(y_dev_clu)
    X_test_list.append(X_test_clu)
    y_test_list.append(y_test_clu)
    
    print(X_dev_clu.shape[0] / X_test_clu.shape[0])

[5]


Unnamed: 0,site,priv_count,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,...,group_radius/ulna internal fixation,group_robotic_assisted_surgery,group_rtc_slap_bank,group_septoplasty,group_tha,group_thoracic,group_tka,group_tpa,group_tsa,CBSA_NAME_encoded
2303,0,190,0.0,4627.61,-74.005954,40.712776,29.0,0.655172,0.137931,0.827586,...,0,0,0,0,0,0,0,0,0,13456.646389
33187,0,680,0.0,5986.97,-80.721440,35.122320,26.0,0.115385,0.076923,0.576923,...,0,0,0,0,0,0,0,0,0,15663.336786
8031,0,84,0.0,3796.60,-88.039891,30.695366,8.0,0.500000,0.125000,0.500000,...,0,0,1,0,0,0,0,0,0,5748.503439
22696,0,66,0.0,4455.43,-88.011847,41.743507,12.0,0.083333,0.000000,0.416667,...,0,0,0,0,0,0,0,0,0,11803.174615
12074,0,63,0.0,6882.37,-76.641271,39.372594,34.0,0.529412,0.058824,0.558824,...,1,0,0,0,0,0,0,0,0,10053.227037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21321,0,121,0.0,4571.07,-84.294090,34.075376,80.0,0.162500,0.050000,0.725000,...,0,0,0,0,0,0,0,0,0,12728.470571
22099,0,41,0.0,3583.78,-80.721442,28.263933,9.0,0.000000,0.111111,0.888889,...,0,0,0,0,0,0,0,0,0,17441.352816
22544,0,35,0.0,4063.72,-114.627692,32.692651,1.0,1.000000,0.000000,1.000000,...,0,0,0,0,0,0,0,0,0,15924.785221
2456,0,78,0.0,3644.60,-86.811378,33.405387,24.0,0.250000,0.166667,0.583333,...,0,0,0,0,0,0,0,0,0,5845.387469


2303      9381.090
33187    22569.920
8031      5262.500
22696    12125.260
12074    10410.410
           ...    
21321     7809.500
22099    13604.360
22544    15539.320
2456      5073.275
33455     5627.230
Name: priv_pay_median, Length: 2243, dtype: float64

Unnamed: 0,priv_count,mcare_los,mcare_pay_median,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,...,group_tha,group_thoracic,group_tka,group_tpa,group_tsa,CBSA_NAME_encoded,cluster,site,lat,lon
0,0.057836,0.0,0.231473,0.137931,0.655172,0.413793,0.827586,1.000000,0.507463,0.288770,...,0.0,0.0,0.0,0.0,0.0,0.459571,0.0,0.0,40.712776,-74.005954
1,0.240672,0.0,0.299469,0.123153,0.115385,0.230769,0.576923,0.132974,0.455224,0.272727,...,0.0,0.0,0.0,0.0,0.0,0.580810,0.0,0.0,35.122320,-80.721440
2,0.018284,0.0,0.189906,0.034483,0.500000,0.375000,0.500000,0.017641,0.485075,0.171123,...,0.0,0.0,0.0,0.0,0.0,0.036076,0.0,0.0,30.695366,-88.039891
3,0.011567,0.0,0.222861,0.054187,0.083333,0.000000,0.416667,0.478889,0.470149,0.401070,...,0.0,0.0,0.0,0.0,0.0,0.368727,0.0,0.0,41.743507,-88.011847
4,0.010448,0.0,0.344257,0.162562,0.529412,0.176471,0.558824,0.139948,0.485075,0.229947,...,0.0,0.0,0.0,0.0,0.0,0.272583,0.0,0.0,39.372594,-76.641271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2238,0.032090,0.0,0.228645,0.389163,0.162500,0.150000,0.725000,0.307925,0.421642,0.219251,...,0.0,0.0,0.0,0.0,0.0,0.419564,0.0,0.0,34.075376,-84.294090
2239,0.002239,0.0,0.179261,0.039409,0.000000,0.333333,0.888889,0.027085,0.787313,0.390374,...,0.0,0.0,0.0,0.0,0.0,0.678497,0.0,0.0,28.263933,-80.721442
2240,0.000000,0.0,0.203268,0.000000,1.000000,0.000000,1.000000,0.006275,0.354478,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.595174,0.0,0.0,32.692651,-114.627692
2241,0.016045,0.0,0.182303,0.113300,0.250000,0.500000,0.583333,0.052365,0.503731,0.208556,...,0.0,0.0,0.0,0.0,0.0,0.041398,0.0,0.0,33.405387,-86.811378


2303      9381.090
33187    22569.920
8031      5262.500
22696    12125.260
12074    10410.410
           ...    
21321     7809.500
22099    13604.360
22544    15539.320
2456      5073.275
33455     5627.230
Name: priv_pay_median, Length: 2243, dtype: float64

3.998217468805704
[5, 3]


Unnamed: 0,site,priv_count,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,...,group_radius/ulna internal fixation,group_robotic_assisted_surgery,group_rtc_slap_bank,group_septoplasty,group_tha,group_thoracic,group_tka,group_tpa,group_tsa,CBSA_NAME_encoded
39072,1,40,5.509346,19355.010,-74.005950,40.712780,143.0,0.552448,0.230769,0.643357,...,0,0,0,0,0,0,0,1,0,33370.705833
37159,0,116,0.000000,17975.160,-87.984550,42.106700,96.0,0.427083,0.156250,0.427083,...,0,0,0,0,0,0,0,0,0,21153.753226
36885,0,70,0.000000,16039.300,-83.079090,42.810540,51.0,0.411765,0.137255,0.431372,...,0,0,0,0,0,0,0,0,0,20081.019833
27885,0,52,0.000000,751.240,-83.079087,42.810536,30.0,0.433333,0.033333,0.733333,...,0,0,0,0,0,0,0,0,0,20081.019833
13190,1,61,3.948819,12373.395,-77.608846,43.156578,19.0,0.473684,0.157895,0.631579,...,0,0,0,0,1,0,0,0,0,22987.159480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34631,0,81,0.000000,18075.960,-75.165240,39.952630,11.0,0.636364,0.272727,0.909091,...,0,0,0,0,0,0,0,0,0,23363.829928
36141,0,41,0.000000,19029.720,-111.841300,33.306160,74.0,0.229730,0.067568,0.702703,...,0,0,0,0,0,0,0,0,0,21666.063971
31612,0,53,0.000000,2410.610,-81.655650,30.332180,22.0,0.227273,0.136364,0.909091,...,0,0,0,0,0,0,0,0,0,27133.297608
14417,0,91,0.000000,9932.630,-77.436048,37.540725,18.0,0.277778,0.111111,0.555556,...,0,0,0,0,1,0,0,0,0,24724.878713


39072    34200.00
37159    35186.17
36885    24567.33
27885     4441.70
13190    22870.39
           ...   
34631    20471.10
36141    29104.76
31612     8886.98
14417    18110.87
15890    17016.53
Name: priv_pay_median, Length: 1330, dtype: float64

Unnamed: 0,priv_count,mcare_los,mcare_pay_median,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,...,group_tha,group_thoracic,group_tka,group_tpa,group_tsa,CBSA_NAME_encoded,cluster,site,lat,lon
0,0.007215,0.564162,0.592282,0.699507,0.828671,0.692308,0.643357,1.000000,0.391304,0.264000,...,0.0,0.0,0.0,1.0,0.0,0.610217,2.0,1.0,40.712780,-74.005950
1,0.116883,0.000000,0.550057,0.467980,0.640625,0.468750,0.427083,0.206962,0.336957,0.389333,...,0.0,0.0,0.0,0.0,0.0,0.297964,2.0,0.0,42.106700,-87.984550
2,0.050505,0.000000,0.490818,0.246305,0.617647,0.411765,0.431372,0.214648,0.434783,0.352000,...,0.0,0.0,0.0,0.0,0.0,0.270546,2.0,0.0,42.810540,-83.079090
3,0.024531,0.000000,0.022989,0.142857,0.650000,0.100000,0.733333,0.214648,0.434783,0.352000,...,0.0,0.0,0.0,0.0,0.0,0.270546,2.0,0.0,42.810536,-83.079087
4,0.037518,0.404362,0.378638,0.088670,0.710526,0.473684,0.631579,0.047402,0.456522,0.336000,...,1.0,0.0,0.0,0.0,0.0,0.344824,2.0,1.0,43.156578,-77.608846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1325,0.066378,0.000000,0.553142,0.049261,0.954545,0.818182,0.909091,0.309655,0.380435,0.240000,...,0.0,0.0,0.0,0.0,0.0,0.354451,2.0,0.0,39.952630,-75.165240
1326,0.008658,0.000000,0.582328,0.359606,0.344595,0.202703,0.702703,0.244268,0.288043,0.616000,...,0.0,0.0,0.0,0.0,0.0,0.311058,2.0,0.0,33.306160,-111.841300
1327,0.025974,0.000000,0.073767,0.103448,0.340909,0.409091,0.909091,0.075581,0.391304,0.304000,...,0.0,0.0,0.0,0.0,0.0,0.450795,2.0,0.0,30.332180,-81.655650
1328,0.080808,0.000000,0.303948,0.083744,0.416667,0.333333,0.555556,0.059259,0.375000,0.168000,...,1.0,0.0,0.0,0.0,0.0,0.389238,2.0,0.0,37.540725,-77.436048


39072    34200.00
37159    35186.17
36885    24567.33
27885     4441.70
13190    22870.39
           ...   
34631    20471.10
36141    29104.76
31612     8886.98
14417    18110.87
15890    17016.53
Name: priv_pay_median, Length: 1330, dtype: float64

3.993993993993994
[5, 3, 4]


Unnamed: 0,site,priv_count,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,...,group_radius/ulna internal fixation,group_robotic_assisted_surgery,group_rtc_slap_bank,group_septoplasty,group_tha,group_thoracic,group_tka,group_tpa,group_tsa,CBSA_NAME_encoded
5843,1,47,6.163218,28955.290,-74.005954,40.712776,143.0,0.552448,0.230769,0.643357,...,0,0,0,0,0,0,0,0,0,82561.324914
7065,1,68,4.200581,31754.425,-77.436048,37.540725,18.0,0.277778,0.111111,0.555556,...,0,0,0,0,0,0,0,0,0,62298.775506
6346,1,47,3.832298,24064.910,-88.011847,41.743507,12.0,0.083333,0.000000,0.416667,...,0,0,0,0,0,0,0,0,0,62298.775506
29647,1,51,9.219512,17672.640,-95.622550,29.598440,181.0,0.088398,0.060773,0.823204,...,0,0,0,0,0,0,0,0,0,54268.775579
6476,1,35,3.127551,29335.980,-122.527463,47.169828,9.0,0.333333,0.111111,0.555556,...,0,0,0,0,0,0,0,0,0,62298.775506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6796,1,129,4.141243,27336.005,-82.388578,28.274347,49.0,0.285714,0.081633,0.836735,...,0,0,0,0,0,0,0,0,0,66409.182391
7113,1,68,4.232210,28678.360,-82.388578,28.274347,49.0,0.285714,0.081633,0.836735,...,0,0,0,0,0,0,0,0,0,66409.182391
29371,1,57,7.699801,15381.860,-96.920910,32.707880,114.0,0.105263,0.052632,0.807018,...,0,0,0,0,0,0,0,0,0,79156.725156
6962,1,104,4.059078,30032.010,-95.622552,29.598443,181.0,0.088398,0.060773,0.823204,...,0,0,0,0,0,0,0,0,0,54268.775579


5843      82650.050
7065     111025.245
6346      60000.280
29647     29225.000
6476      73536.030
            ...    
6796      97492.210
7113      97049.600
29371     54822.360
6962      78694.205
6549      69144.890
Name: priv_pay_median, Length: 168, dtype: float64

Unnamed: 0,priv_count,mcare_los,mcare_pay_median,Hospitals,PctTeaching,PctLargeHospital,PctPrivate,total_population,median_age,sex_ratio,...,group_tha,group_thoracic,group_tka,group_tpa,group_tsa,CBSA_NAME_encoded,cluster,site,lat,lon
0,0.039344,0.429521,0.461422,0.691919,0.828671,0.807692,0.520839,1.000000,0.560000,0.302752,...,0.0,0.0,0.0,0.0,0.0,0.989558,1.0,1.0,40.712776,-74.005954
1,0.108197,0.204659,0.530218,0.060606,0.416667,0.388889,0.373333,0.046880,0.520000,0.192661,...,0.0,0.0,0.0,0.0,0.0,0.635229,1.0,1.0,37.540725,-77.436048
2,0.039344,0.162464,0.341229,0.030303,0.125000,0.000000,0.140000,0.470099,0.426667,0.495413,...,0.0,0.0,0.0,0.0,0.0,0.635229,1.0,1.0,41.743507,-88.011847
3,0.052459,0.779686,0.184123,0.883838,0.132597,0.212707,0.822983,0.351104,0.000000,0.660550,...,0.0,0.0,0.0,0.0,0.0,0.494809,1.0,1.0,29.598440,-95.622550
4,0.000000,0.081720,0.470779,0.015152,0.500000,0.388889,0.373333,0.133365,0.350000,0.752294,...,0.0,0.0,0.0,0.0,0.0,0.635229,1.0,1.0,47.169828,-122.527463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,0.308197,0.197860,0.421624,0.217172,0.428571,0.285714,0.845714,0.145131,0.933333,0.376147,...,0.0,0.0,0.0,0.0,0.0,0.707107,1.0,1.0,28.274347,-82.388578
164,0.108197,0.208283,0.454616,0.217172,0.428571,0.285714,0.845714,0.145131,0.933333,0.376147,...,0.0,0.0,0.0,0.0,0.0,0.707107,1.0,1.0,28.274347,-82.388578
165,0.072131,0.605570,0.127822,0.545455,0.157895,0.184211,0.795789,0.379659,0.040000,0.559633,...,0.0,0.0,0.0,0.0,0.0,0.930022,1.0,1.0,32.707880,-96.920910
166,0.226230,0.188447,0.487885,0.883838,0.132597,0.212707,0.822983,0.351104,0.000000,0.660550,...,0.0,0.0,0.0,0.0,0.0,0.494809,1.0,1.0,29.598443,-95.622552


5843      82650.050
7065     111025.245
6346      60000.280
29647     29225.000
6476      73536.030
            ...    
6796      97492.210
7113      97049.600
29371     54822.360
6962      78694.205
6549      69144.890
Name: priv_pay_median, Length: 168, dtype: float64

4.0


## Run XGBoost model

In [13]:
train_mapes = []
train_sizes = []
test_mapes = []
test_sizes = []

# Train test split
for idx in range(0,len(X_dev_list)):
    print(f"Index is: {idx}")
    
    # Parameterization
    mono = {'site': 1}

    param_grid = {
#         'subsample':[0.5,0.6,0.7,0.9,1.0],
#         'colsample_bytree':[0.6,0.75,0.8,0.9,1.0],
#         'n_estimators':[10,25,100, 150, 200],
        'max_depth':[2,4,6,8],#,16],
        'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2],
#         'gamma':[0, 0.001],
        'min_child_weight':[1, 2, 3, 4, 5, 6],
        'reg_lambda':[0,0.25,0.5,0.75],
        'tree_method': ['hist']
    }
    
    if MONOTONE_MODEL:
        param_grid['monotone_constraints'] = [mono]

    if idx == 0:
        # Create, run, and tune (if applicable) model
        xgb_param_tuning_model = xgb.XGBRegressor(n_estimators = 100,
#                                                   max_depth=6,
#                                                   reg_lambda=0,
                                                  gamma=0
                                                 )
    elif idx == 1:
        xgb_param_tuning_model = xgb.XGBRegressor(n_estimators = 100,
#                                                   max_depth=6,
#                                                   reg_lambda=0.25,
                                                  gamma=0
                                                 )
    else:
        xgb_param_tuning_model = xgb.XGBRegressor(n_estimators = 100,
#                                                   max_depth=6,
#                                                   reg_lambda=0.5,
                                                  gamma=0
                                                 )
    
    xgb_mono_model = GridSearchCV(xgb_param_tuning_model,
                                  param_grid,
                                  cv=5,
                                  scoring='neg_mean_absolute_percentage_error')
    xgb_mono_model.fit(X_dev_list[idx], y_dev_list[idx])
    
    # Output optimal params (if applicable)
    print(f"Best parameters (if grid search was applied): {xgb_mono_model.best_params_}")
    print(f"Validation score: {xgb_mono_model.best_score_}")
    
    # Predict on train and test data
    y_train_pred_xgb = xgb_mono_model.predict(X_dev_list[idx])
    y_test_pred_xgb = xgb_mono_model.predict(X_test_list[idx])

    # Store results
    train_sizes.append(len(X_dev_list[idx]))
    test_sizes.append(len(X_test_list[idx]))
    train_mapes.append(mean_absolute_percentage_error(y_true=y_dev_list[idx], y_pred=y_train_pred_xgb))
    test_mapes.append(mean_absolute_percentage_error(y_true=y_test_list[idx], y_pred=y_test_pred_xgb))
    

train_mapes = np.array(train_mapes)
train_sizes = np.array(train_sizes)
test_mapes = np.array(test_mapes)
test_sizes = np.array(test_sizes)

# Output results?
print(f"XGB with Threshold >{COUNT_THRESH} claims for training set:")
print(f"Train MAPEs: {train_mapes}")
print(f"Train sizes: {train_sizes}")
print(f"Test MAPEs: {test_mapes}")
print(f"Test sizes: {test_sizes}")
print(f"Total train MAPE: {((train_mapes * train_sizes) / (train_sizes.sum())).sum()}")
print(f"Total test MAPE: {((test_mapes * test_sizes) / (test_sizes.sum())).sum()}")

Index is: 0
Best parameters (if grid search was applied): {'learning_rate': 0.2, 'max_depth': 6, 'min_child_weight': 1, 'monotone_constraints': {'site': 1}, 'reg_lambda': 0.25, 'tree_method': 'hist'}
Validation score: -0.16659473271929953
Index is: 1
Best parameters (if grid search was applied): {'learning_rate': 0.05, 'max_depth': 8, 'min_child_weight': 3, 'monotone_constraints': {'site': 1}, 'reg_lambda': 0.5, 'tree_method': 'hist'}
Validation score: -0.15517303679128153
Index is: 2
Best parameters (if grid search was applied): {'learning_rate': 0.05, 'max_depth': 6, 'min_child_weight': 3, 'monotone_constraints': {'site': 1}, 'reg_lambda': 0.5, 'tree_method': 'hist'}
Validation score: -0.13610541512847785
XGB with Threshold >34 claims for training set:
Train MAPEs: [0.05635951 0.04333163 0.01768632]
Train sizes: [2243 1330  168]
Test MAPEs: [0.29745276 0.13442393 0.10992348]
Test sizes: [561 333  42]
Total train MAPE: 0.04999111407098536
Total test MAPE: 0.2310373455807232
