# Tree-based Models, with Clusters

## Library Imports

In [1]:
# Necessary code to import our helper functions
import sys
sys.path.append("../..")

In [2]:
# Library imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error
from Common_Functions import add_unique_identifier, data_cleaning, hospital_data_agg

In [3]:
def data_split(data, count_col_name = 'priv_count', count_thresh = 50):
    """
    Partition the observations into a model set and a future set.

    The model set is used to train, evaluate and test the model; the future set holds the
    observations the model has to predict on.

    Rows whose "priv_pay_median" is present but not strictly positive are discarded up front.
    Of the remaining rows:
      * model set  - count strictly above the threshold AND "priv_pay_median" present
      * future set - count at or below the threshold, or count missing
    A row with a count above the threshold but a missing "priv_pay_median" lands in neither set.

    Args:
    data (pandas: DataFrame) - a pandas data frame containing at least "priv_pay_median" and count_col_name
    count_col_name (str) - name of the column which is thresholded to make the split
    count_thresh (int) - threshold value used to split data on count_col_name

    Returns:
    model_data (pandas: DataFrame) - data frame with observations that will be used to train and test model
    future_data (pandas: DataFrame) - data frame with all observations on which model will make predictions
    """
    # Keep rows whose target is either unknown or strictly positive.
    median_pay = data['priv_pay_median']
    usable = data[median_pay.isnull() | (median_pay > 0)]

    # Build the masks once on the filtered frame so they share its index.
    counts = usable[count_col_name]
    above_thresh = counts > count_thresh
    has_target = usable['priv_pay_median'].notnull()

    model_data = usable[above_thresh & has_target]
    future_data = usable[counts.isnull() | (counts <= count_thresh)]
    return model_data, future_data

## Data Import

In [4]:
# Load the pre-processed feature matrix. The path is relative, so it is resolved
# against the kernel's working directory.
data = pd.read_csv("../../Feature Matrix/processed_data.csv")

## Model Parameters

In [5]:
# Cut-off passed to data_split as count_thresh: rows whose count column is strictly
# greater than this value go to the modelling set.
COUNT_THRESH = 34
# Seed given to train_test_split (random_state) so the train/test partition is repeatable.
RDM_SEED = 123
# Fraction of each cluster's rows placed in the training split (train_size).
TRAIN_TEST_PROPORTION = 0.8

## Data Transformation

### One-Hot Categorical Encoding and Dropping NAs

In [6]:
# Drop the 'CBSA_NAME' and 'Unnamed: 0' columns, then pass the frame through the
# data_cleaning helper imported from Common_Functions (presumably the one-hot encoding
# and NA handling named in the section heading -- confirm against the helper's source).
# NOTE(review): this cell is not idempotent. The drop mutates `data` in place and `data`
# is then rebound, so running the cell a second time without re-running the data import
# is expected to raise KeyError because the dropped columns are no longer present.
data.drop(columns = ['CBSA_NAME', 'Unnamed: 0'], inplace=True)
data = data_cleaning(data)

### Data Split

In [7]:
# working_set: rows used to train and test the model; predict_set: rows the model must
# predict on. The split uses data_split's default count column with COUNT_THRESH as cut-off.
working_set, predict_set = data_split(data, count_thresh = COUNT_THRESH)

In [8]:
# Aliases only: plain assignment makes no copy, so each pair of names refers to the same
# DataFrame object. model_data is what the cluster split below consumes.
model_data = working_set
predict_data = predict_set

In [9]:
# Preview the modelling set (the rendered output is truncated, showing the first and last rows).
display(model_data)

Unnamed: 0,year,site,priv_count,priv_pay_median,mcare_los,mcare_pay_median,lon,lat,Hospitals,PctTeaching,...,group_revision_tha,group_revision_tka,group_robotic_assisted_surgery,group_rtc_slap_bank,group_septoplasty,group_tha,group_thoracic,group_tka,group_tpa,group_tsa
40,2018,1,63,24289.900,2.549296,8794.190,-96.920913,32.707875,114.0,0.105263,...,0,0,0,0,0,0,0,0,0,0
70,2018,1,51,21408.000,3.543210,10395.160,-95.622552,29.598443,181.0,0.088398,...,0,0,0,0,0,0,0,0,0,0
112,2018,1,64,29757.100,3.918699,14174.100,-74.005954,40.712776,143.0,0.552448,...,0,0,0,0,0,0,0,0,0,0
219,2019,1,66,25240.905,3.241935,10144.445,-96.920913,32.707875,114.0,0.105263,...,0,0,0,0,0,0,0,0,0,0
275,2019,1,45,34963.900,3.262295,14008.190,-74.005954,40.712776,143.0,0.552448,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41969,2019,0,55,8614.250,0.000000,6291.420,-83.079090,42.810540,30.0,0.433333,...,0,0,0,0,0,0,0,0,0,0
41993,2020,0,62,11590.770,0.000000,6628.030,-84.294090,34.075380,80.0,0.162500,...,0,0,0,0,0,0,0,0,0,0
42065,2020,0,39,20492.920,0.000000,7015.710,-95.622550,29.598440,181.0,0.088398,...,0,0,0,0,0,0,0,0,0,0
42110,2020,0,44,13777.100,0.000000,8517.130,-74.005950,40.712780,143.0,0.552448,...,0,0,0,0,0,0,0,0,0,0


## Split Model Data by Cluster

In [10]:
# Build one DataFrame per cluster in a single groupby pass instead of re-filtering the
# whole frame once per label.
#   * sort=False keeps the clusters in order of first appearance, i.e. the same order
#     that iterating over model_data["cluster"].unique() produced.
#   * observed=True avoids empty groups should "cluster" ever be a categorical column.
#   * Rows whose cluster label is NaN are left out; filtering with `== label` would have
#     produced an empty frame for them (NaN never compares equal), which would make the
#     per-cluster train/test split fail.
cluster_data_list = [
    cluster_rows
    for _, cluster_rows in model_data.groupby("cluster", sort=False, observed=True)
]

## Run LightGBM model

In [11]:
# Per-cluster results, appended in the same order as cluster_data_list.
train_mapes = []
train_sizes = []
test_mapes = []
test_sizes = []

# Fit and evaluate one independent LightGBM model per cluster.
for cluster_dataset in cluster_data_list:

    # The target is the "priv_pay_median" column; every other column is used as a feature.
    X_input = cluster_dataset.drop(columns=["priv_pay_median"])
    y_input = cluster_dataset["priv_pay_median"]

    # Seeded split, so each cluster's train/test partition is repeatable across runs.
    X_train, X_test, y_train, y_test = train_test_split(X_input,
                                                        y_input,
                                                        train_size = TRAIN_TEST_PROPORTION,
                                                        random_state = RDM_SEED)
    # Parameterization
    # The whole search space is commented out, so the grid is empty: GridSearchCV then
    # cross-validates only the single fixed configuration defined below and best_params_
    # prints as {}. Uncomment entries to re-enable tuning.
    param_grid = {
#         'boosting_type':['gbdt','dart'],
#         'n_estimators':[150,200,250,300],
#         'num_leaves':[60,70,80,90],
#         'learning_rate': [0.2,0.3,0.4],
#         'min_child_weight':[0,1],
#         'reg_lambda':[0,0.25,0.5]
    }
    # Monotone-constraint vector aligned with the feature columns: 1 for "site", 0 for
    # every other column. In LightGBM, 1 requests a non-decreasing relationship with the
    # prediction and 0 leaves the feature unconstrained.
    mono = np.array((pd.Series(X_train.columns) == "site").astype(int))
    
    # Create, run, and tune (if applicable) model
    lgb_param_tuning_model = lgb.LGBMRegressor(boosting_type = 'dart',
                                               monotone_constraints = mono,
                                               learning_rate = 0.3,
                                               n_estimators=300,
                                               num_leaves = 80,
                                               reg_lambda = 0.25,
                                               min_child_weight=0
                                             )
    # The scoring metric only ranks grid candidates; with an empty grid there is a single
    # candidate, which is what the search object uses for the predictions below.
    lgb_mono_model = GridSearchCV(lgb_param_tuning_model, param_grid, scoring='neg_mean_absolute_percentage_error')
    lgb_mono_model.fit(X_train, y_train)
    
    # Output optimal params (if applicable)
    print(f"Best parameters (if grid search was applied): {lgb_mono_model.best_params_}")
    
    # Predict on train and test data
    y_train_pred_lgb = lgb_mono_model.predict(X_train)
    y_test_pred_lgb = lgb_mono_model.predict(X_test)

    # Store results
    # Split sizes are kept alongside the errors so the totals below can be weighted by them.
    train_sizes.append(len(X_train))
    test_sizes.append(len(X_test))
    train_mapes.append(mean_absolute_percentage_error(y_true=y_train, y_pred=y_train_pred_lgb))
    test_mapes.append(mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred_lgb))
    

# Convert to arrays so the weighted totals below can use element-wise arithmetic.
train_mapes = np.array(train_mapes)
train_sizes = np.array(train_sizes)
test_mapes = np.array(test_mapes)
test_sizes = np.array(test_sizes)

# Report the per-cluster metrics and the overall totals.
print(f"Monotonic LightGBM with Threshold >{COUNT_THRESH} claims for training set:")
print(f"Train MAPEs: {train_mapes}")
print(f"Train sizes: {train_sizes}")
print(f"Test MAPEs: {test_mapes}")
print(f"Test sizes: {test_sizes}")
# Totals are the size-weighted mean of the per-cluster MAPEs, which equals the MAPE
# computed over all clusters' rows pooled together.
print(f"Total train MAPE: {((train_mapes * train_sizes) / (train_sizes.sum())).sum()}")
print(f"Total test MAPE: {((test_mapes * test_sizes) / (test_sizes.sum())).sum()}")

Best parameters (if grid search was applied): {}
Best parameters (if grid search was applied): {}
Best parameters (if grid search was applied): {}
Monotonic LightGBM with Threshold >34 claims for training set:
Train MAPEs: [0.03096931 0.02497394 0.0322169 ]
Train sizes: [1577 1082  147]
Test MAPEs: [0.14498769 0.13623181 0.12738956]
Test sizes: [395 271  37]
Total train MAPE: 0.0287228416150374
Total test MAPE: 0.14068616073085063
