# Tree-based Models, with Clusters

## Library Imports

In [1]:
# Necessary code to import our helper functions
import sys
sys.path.append("../..")

In [2]:
# Library imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from Common_Functions import add_unique_identifier, data_cleaning, hospital_data_agg

  from pandas import MultiIndex, Int64Index


In [3]:
def data_split(data, count_col_name = 'priv_count', count_thresh = 50):
    """
    This function splits the data into model set and future set. Model set is the data used to train, evaluate
    test the model. Future data is what the model needs to predict on.
    
    Args:
    data (pandas: DataFrame) - a pandas data frame with at least 3 columns - "priv_pay_mean", "priv_pay_median" and count_col_name
    count_col_name (str) - name of the column which is thresholded to make the split
    count_thresh (int) - threshold value used to split data on count_col_name
    
    Returns:
    model_data (pandas: DataFrame) - data frame with observations that will be used to train and test model
    future_data (pandas: DataFrame) - data frame with all observations on which model will make predictions
    """
    data = data[(data['priv_pay_median'] > 0) | (data['priv_pay_median'].isnull())]
    future_data = data[(data[count_col_name] <= count_thresh) | (data[count_col_name].isnull())]
    model_data = data[data[count_col_name] > count_thresh]
    model_data = model_data[model_data.priv_pay_median.notnull()]
    return model_data, future_data

In [4]:
# Method from Shruti's code
def standardize_data(train_data, val_data):
    train_temp = train_data.drop(columns = ['site','cluster','lat','lon'])
    val_temp = val_data.drop(columns = ['site','cluster','lat','lon'])
    
    scaler = MinMaxScaler()
    
    train_data_scaled = scaler.fit_transform(train_temp)
    train_data_scaled = pd.DataFrame(train_data_scaled, columns = train_temp.columns)
    train_data_scaled['cluster'] = train_data['cluster'].to_list()
    train_data_scaled['site'] = train_data['site'].to_list()
    train_data_scaled['lat'] = train_data['lat'].to_list()
    train_data_scaled['lon'] = train_data['lon'].to_list()
    
    val_data_scaled = scaler.transform(val_temp)
    val_data_scaled = pd.DataFrame(val_data_scaled, columns = val_temp.columns)
    val_data_scaled['cluster'] = val_data['cluster'].to_list()
    val_data_scaled['site'] = val_data['site'].to_list()
    val_data_scaled['lat'] = val_data['lat'].to_list()
    val_data_scaled['lon'] = val_data['lon'].to_list()
    
    return train_data_scaled, val_data_scaled

In [5]:
# Method slightly modified from Shruti's code
def impute_knn(train_data, val_data, optimal_k): 
    train_data_scaled, val_data_scaled = standardize_data(train_data, val_data)

    knn = KNNImputer(n_neighbors = optimal_k)

    # imputing values
    train_data_imputed = knn.fit_transform(train_data_scaled.values)
    train_data_imputed = pd.DataFrame(train_data_imputed, columns = train_data_scaled.columns)
    val_data_imputed = knn.transform(val_data_scaled.values)
    val_data_imputed = pd.DataFrame(val_data_imputed, columns = val_data_scaled.columns)
    
    return train_data_imputed, val_data_imputed

## Data Import

In [6]:
data = pd.read_csv("../../Feature Matrix/processed_data.csv")

## Model Parameters

In [7]:
COUNT_THRESH = 34
RDM_SEED = 123
TRAIN_TEST_PROPORTION = 0.8

## Data Transformation

### One-Hot Categorical Encoding and Dropping NAs

In [8]:
data.drop(columns = ['CBSA_NAME'], inplace=True)
data = data_cleaning(data, dropna = False)

### Data Split

In [9]:
working_set, predict_set = data_split(data, count_thresh = COUNT_THRESH)

In [10]:
model_data = working_set
predict_data = predict_set

In [11]:
display(model_data)

Unnamed: 0,year,site,priv_count,priv_pay_median,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,...,group_revision_tha,group_revision_tka,group_robotic_assisted_surgery,group_rtc_slap_bank,group_septoplasty,group_tha,group_thoracic,group_tka,group_tpa,group_tsa
40,2018,1,63,24289.900,2.549296,8794.190,-96.920913,32.707875,114.0,0.105263,...,0,0,0,0,0,0,0,0,0,0
70,2018,1,51,21408.000,3.543210,10395.160,-95.622552,29.598443,181.0,0.088398,...,0,0,0,0,0,0,0,0,0,0
112,2018,1,64,29757.100,3.918699,14174.100,-74.005954,40.712776,143.0,0.552448,...,0,0,0,0,0,0,0,0,0,0
219,2019,1,66,25240.905,3.241935,10144.445,-96.920913,32.707875,114.0,0.105263,...,0,0,0,0,0,0,0,0,0,0
275,2019,1,45,34963.900,3.262295,14008.190,-74.005954,40.712776,143.0,0.552448,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46235,2020,0,156,5909.230,,,-80.101620,26.469680,21.0,0.238095,...,0,0,0,1,0,0,0,0,0,0
46237,2020,0,128,4582.570,,,-97.330050,37.687180,21.0,0.190476,...,0,0,0,1,0,0,0,0,0,0
46238,2020,0,267,5316.990,,,-75.165240,39.952630,8.0,0.375000,...,0,0,0,1,0,0,0,0,0,0
46241,2020,0,36,5378.645,,,-80.244220,36.099860,11.0,0.363636,...,0,0,0,1,0,0,0,0,0,0


## Dev/Test Split

In [12]:
X_input = model_data.drop(columns=["priv_pay_median"])
y_input = model_data["priv_pay_median"]

display(X_input)
display(y_input)

X_dev, X_test, y_dev, y_test = train_test_split(X_input,
                                                y_input,
                                                train_size = TRAIN_TEST_PROPORTION,
                                                random_state = RDM_SEED)

Unnamed: 0,year,site,priv_count,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,PctLargeHospital,...,group_revision_tha,group_revision_tka,group_robotic_assisted_surgery,group_rtc_slap_bank,group_septoplasty,group_tha,group_thoracic,group_tka,group_tpa,group_tsa
40,2018,1,63,2.549296,8794.190,-96.920913,32.707875,114.0,0.105263,0.052632,...,0,0,0,0,0,0,0,0,0,0
70,2018,1,51,3.543210,10395.160,-95.622552,29.598443,181.0,0.088398,0.060773,...,0,0,0,0,0,0,0,0,0,0
112,2018,1,64,3.918699,14174.100,-74.005954,40.712776,143.0,0.552448,0.230769,...,0,0,0,0,0,0,0,0,0,0
219,2019,1,66,3.241935,10144.445,-96.920913,32.707875,114.0,0.105263,0.052632,...,0,0,0,0,0,0,0,0,0,0
275,2019,1,45,3.262295,14008.190,-74.005954,40.712776,143.0,0.552448,0.230769,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46235,2020,0,156,,,-80.101620,26.469680,21.0,0.238095,0.095238,...,0,0,0,1,0,0,0,0,0,0
46237,2020,0,128,,,-97.330050,37.687180,21.0,0.190476,0.142857,...,0,0,0,1,0,0,0,0,0,0
46238,2020,0,267,,,-75.165240,39.952630,8.0,0.375000,0.125000,...,0,0,0,1,0,0,0,0,0,0
46241,2020,0,36,,,-80.244220,36.099860,11.0,0.363636,0.181818,...,0,0,0,1,0,0,0,0,0,0


40       24289.900
70       21408.000
112      29757.100
219      25240.905
275      34963.900
           ...    
46235     5909.230
46237     4582.570
46238     5316.990
46241     5378.645
46243     3665.000
Name: priv_pay_median, Length: 5364, dtype: float64

## Split Model Data by Cluster

In [13]:
X_dev_list = []
y_dev_list = []
X_test_list = []
y_test_list = []

for cluster_label in model_data["cluster"].unique():
    X_dev_list.append(X_dev[X_dev["cluster"] == cluster_label])
    y_dev_list.append(y_dev[X_dev["cluster"] == cluster_label])
    X_test_list.append(X_test[X_test["cluster"] == cluster_label])
    y_test_list.append(y_test[X_test["cluster"] == cluster_label])
    
    print(X_dev_list[-1].shape[0] / X_test_list[-1].shape[0])

4.107692307692307
3.8518518518518516
3.6666666666666665


## Run XGBoost model

In [14]:
train_mapes = []
train_sizes = []
test_mapes = []
test_sizes = []

# Train test split
for idx in range(1,len(X_dev_list)):

    X_train, X_val, y_train, y_val = train_test_split(X_dev_list[idx],
                                                        y_dev_list[idx],
                                                        train_size = TRAIN_TEST_PROPORTION,
                                                        random_state = RDM_SEED)
    # KNN Here!!
    
    # Parameterization
    mono = {'site': 1}

    param_grid = {
        'booster':['gbtree','dart'],
        'colsample_bylevel':[1],
        'colsample_bytree':[1],
        'enable_categorical':[False],
        'gamma':[0],
        'gpu_id':[-1],
        'interaction_constraints':[''],
        'max_delta_step':[0],
        'min_child_weight':[1],
        'missing':[np.nan],
        'n_estimators':[100,175,250],
        'n_jobs':[8],
        'predictor':['auto'],
        'reg_alpha':[0],
        'scale_pos_weight':[1],
        'tree_method':['exact'],
        'validate_parameters':[1],
        'learning_rate':[1],
        'max_depth':[10,17,25],
        'num_parallel_tree':[250],
        'objective':['reg:squarederror'],
        'subsample':[0.8],
        'random_state':[RDM_SEED],
        'reg_lambda':[0,0.25,0.5]
    }
    
    # Create, run, and tune (if applicable) model
    xgb_param_tuning_model = xgb.XGBRFRegressor(monotone_constraints = mono#,
#                                                 n_estimators = 250,
#                                                 max_depth=25
                                               )
    
    xgb_mono_model = GridSearchCV(xgb_param_tuning_model, param_grid, scoring='neg_mean_absolute_percentage_error')
    xgb_mono_model.fit(X_train, y_train)
    
    # Output optimal params (if applicable)
    print(f"Best parameters (if grid search was applied): {xgb_mono_model.best_params_}")
    
    # Predict on train and test data
    y_train_pred_xgb = xgb_mono_model.predict(X_train)
    y_test_pred_xgb = xgb_mono_model.predict(X_test)

    # Store results
    train_sizes.append(len(X_train))
    test_sizes.append(len(X_test))
    train_mapes.append(mean_absolute_percentage_error(y_true=y_train, y_pred=y_train_pred_xgb))
    test_mapes.append(mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred_xgb))
    

train_mapes = np.array(train_mapes)
train_sizes = np.array(train_sizes)
test_mapes = np.array(test_mapes)
test_sizes = np.array(test_sizes)

# Output results?
print(f"Monotonic XGBoost with Threshold >{COUNT_THRESH} claims for training set:")
print(f"Train MAPEs: {train_mapes}")
print(f"Train sizes: {train_sizes}")
print(f"Test MAPEs: {test_mapes}")
print(f"Test sizes: {test_sizes}")
print(f"Total train MAPE: {((train_mapes * train_sizes) / (train_sizes.sum())).sum()}")
print(f"Total test MAPE: {((test_mapes * test_sizes) / (test_sizes.sum())).sum()}")

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


ValueError: 
All the 270 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
39 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\rogmo\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 1533, in fit
    super().fit(**args)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 761, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 286, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 775, in <lambda>
    create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 616, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 707, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 299, in _from_pandas_df
    return _from_numpy_array(data, missing, nthread, feature_names,
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 178, in _from_numpy_array
    _check_call(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 218, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [22:56:07] C:\Windows\Temp\abs_557yfx631l\croots\recipe\xgboost-split_1659548953302\work\src\data\data.cc:981: Check failed: valid: Input data contains `inf` or `nan`

--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\rogmo\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 1533, in fit
    super().fit(**args)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 761, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 286, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 775, in <lambda>
    create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 616, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 707, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 299, in _from_pandas_df
    return _from_numpy_array(data, missing, nthread, feature_names,
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 178, in _from_numpy_array
    _check_call(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 218, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [22:56:08] C:\Windows\Temp\abs_557yfx631l\croots\recipe\xgboost-split_1659548953302\work\src\data\data.cc:981: Check failed: valid: Input data contains `inf` or `nan`

--------------------------------------------------------------------------------
95 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\rogmo\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 1533, in fit
    super().fit(**args)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 761, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 286, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 775, in <lambda>
    create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 616, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 707, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 299, in _from_pandas_df
    return _from_numpy_array(data, missing, nthread, feature_names,
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 178, in _from_numpy_array
    _check_call(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 218, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [22:56:09] C:\Windows\Temp\abs_557yfx631l\croots\recipe\xgboost-split_1659548953302\work\src\data\data.cc:981: Check failed: valid: Input data contains `inf` or `nan`

--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\rogmo\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 1533, in fit
    super().fit(**args)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 761, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 286, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\sklearn.py", line 775, in <lambda>
    create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 616, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 707, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 299, in _from_pandas_df
    return _from_numpy_array(data, missing, nthread, feature_names,
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\data.py", line 178, in _from_numpy_array
    _check_call(
  File "C:\Users\rogmo\anaconda3\lib\site-packages\xgboost\core.py", line 218, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [22:56:10] C:\Windows\Temp\abs_557yfx631l\croots\recipe\xgboost-split_1659548953302\work\src\data\data.cc:981: Check failed: valid: Input data contains `inf` or `nan`
