In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

In [2]:
# Load the preprocessed medical-cost dataset.
# Only a missing file is translated into the "run the preprocessing notebook"
# hint; any other failure (parse error, permissions, ...) surfaces with its own
# traceback. The error is re-raised so that execution stops here instead of
# failing later with an unrelated NameError on `df`.
try:
    df = pd.read_csv("../../../Data/Medical-Cost-Data/medical_cost.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Data not found, make sure to run the medical_cost_preprocessing.ipynb file in its entirety to retrieve the data"
    ) from err

In [3]:
# Regression setup: the continuous 'charges' column is the target (y) and
# every remaining column of df is used as a predictor (X).
y = df['charges']
X = df.drop('charges', axis=1)

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Exhaustive hyperparameter search for a gradient boosting regressor,
# scored by R2 over a shuffled 5-fold split of the training data.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

gradient_boost = GradientBoostingRegressor(random_state=0)

# 2 * 3 * 3 * 3 * 3 * 3 = 486 candidate combinations, each fitted once per fold.
param_grid = dict(
    n_estimators=[200, 500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 8],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", 0.5],
)

# n_jobs=-1 uses all available cores; verbose=1 prints the fit count summary.
grid = GridSearchCV(
    gradient_boost,
    param_grid,
    scoring="r2",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


In [None]:
# Report the winning hyperparameters. The search was run with scoring="r2",
# so grid.best_score_ is the mean cross-validated R2 of the best candidate,
# not an accuracy.
print("Best params:", grid.best_params_)
print("Best CV R2:", round(grid.best_score_, 4))

best_model = grid.best_estimator_

# Evaluate the refitted best estimator on the held-out test split.
y_pred = best_model.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)  # RMSE is in the same units as the target

print("R-squared (R2):")
print(test_r2)
print("Root Mean Squared Error (RMSE):")
print(test_rmse)

# 10-fold R2 of the selected configuration on the training data.
# NOTE(review): the hyperparameters were tuned on this same training set, so
# this estimate is likely somewhat optimistic; the test-set score above is the
# unbiased figure.
scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring="r2")

print("Mean R2:")
print(scores.mean())

print("R2 Standard deviation:")
print(scores.std())

print("Saving model")
model_path = "../Saved-Models/medical_cost_gradient_boost_regression_model.pkl"
# Create the target directory if needed so the (expensive) search result is
# not lost to a missing-folder error at save time.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)

Best params: {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 500}
Best CV accuracy: 0.85
R-squared (R2):
0.9018050130034795
Root Mean Squared Error (RMSE):
3952.9437867212228
Mean R2:
0.8485107017605523
R2 Standard deviation:
0.04012571123345827
Saving model


['../Saved-Models/medical_cost_gradient_boost_regression_model.pkl']