# Health Insurance Claim Prediction

### 1. Importing Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')

### 2. Importing Dataset

In [2]:
df = pd.read_csv("Medicalpremium.csv")

In [3]:
df.head()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice
0,45,0,0,0,0,155,57,0,0,0,25000
1,60,1,0,0,0,180,73,0,0,0,29000
2,36,1,1,0,0,158,59,0,0,1,23000
3,52,1,1,0,1,183,93,0,0,2,28000
4,38,0,0,0,1,166,88,0,0,1,23000


### 3. Extracting BMI Feature

In [4]:
df['BMI'] = df['Weight'] / ( (df['Height']/100)**2 )

In [5]:
df.head()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice,BMI
0,45,0,0,0,0,155,57,0,0,0,25000,23.725286
1,60,1,0,0,0,180,73,0,0,0,29000,22.530864
2,36,1,1,0,0,158,59,0,0,1,23000,23.634033
3,52,1,1,0,1,183,93,0,0,2,28000,27.770313
4,38,0,0,0,1,166,88,0,0,1,23000,31.934969


### 4. Dataset exploration

In [6]:
df.describe()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice,BMI
count,986.0,986.0,986.0,986.0,986.0,986.0,986.0,986.0,986.0,986.0,986.0,986.0
mean,41.745436,0.419878,0.46856,0.055781,0.180527,168.182556,76.950304,0.21501,0.117647,0.667343,24336.713996,27.460709
std,13.963371,0.493789,0.499264,0.229615,0.384821,10.098155,14.265096,0.411038,0.322353,0.749205,6248.184382,5.878671
min,18.0,0.0,0.0,0.0,0.0,145.0,51.0,0.0,0.0,0.0,15000.0,15.156281
25%,30.0,0.0,0.0,0.0,0.0,161.0,67.0,0.0,0.0,0.0,21000.0,23.393392
50%,42.0,0.0,0.0,0.0,0.0,168.0,75.0,0.0,0.0,1.0,23000.0,27.156602
75%,53.0,1.0,1.0,0.0,0.0,176.0,87.0,0.0,0.0,1.0,28000.0,30.75987
max,66.0,1.0,1.0,1.0,1.0,188.0,132.0,1.0,1.0,3.0,40000.0,50.0


In [7]:
df.corr()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice,BMI
Age,1.0,0.210908,0.244888,-0.008549,0.051072,0.039879,-0.01859,-0.024416,-0.027623,0.429181,0.69754,-0.042027
Diabetes,0.210908,1.0,0.127727,-0.036652,-0.089428,-0.003783,-0.024563,-0.080102,-0.055527,0.122722,0.076209,-0.022913
BloodPressureProblems,0.244888,0.127727,1.0,-0.024538,0.045424,-0.037926,-0.061016,-0.01155,0.048239,0.251568,0.167097,-0.038028
AnyTransplants,-0.008549,-0.036652,-0.024538,1.0,0.035285,-0.031543,0.002087,0.001876,-0.020171,-0.004154,0.289056,0.023508
AnyChronicDiseases,0.051072,-0.089428,0.045424,0.035285,1.0,0.047419,-0.033318,-0.027418,0.008666,0.014835,0.20861,-0.05698
Height,0.039879,-0.003783,-0.037926,-0.031543,0.047419,1.0,0.066946,-0.0102,0.010549,0.037289,0.02691,-0.504947
Weight,-0.01859,-0.024563,-0.061016,0.002087,-0.033318,0.066946,1.0,0.037492,0.003481,-0.006108,0.141507,0.820679
KnownAllergies,-0.024416,-0.080102,-0.01155,0.001876,-0.027418,-0.0102,0.037492,1.0,0.115383,0.103923,0.012103,0.040437
HistoryOfCancerInFamily,-0.027623,-0.055527,0.048239,-0.020171,0.008666,0.010549,0.003481,0.115383,1.0,0.212657,0.083139,0.00239
NumberOfMajorSurgeries,0.429181,0.122722,0.251568,-0.004154,0.014835,0.037289,-0.006108,0.103923,0.212657,1.0,0.26425,-0.027225


In [8]:
df.shape

(986, 12)

### 5. Train-Test Split

In [9]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['PremiumPrice'])  # Independent features
y = df['PremiumPrice']  # Dependent variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
y_train = y_train.ravel()
y_test = y_test.ravel()

### 6. Evaluating Ensemble Models

In [11]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [12]:
# Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [13]:
# Beginning Model Training
models = {
    "XGBoost Regressor": XGBRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

XGBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 112.4317
- Mean Absolute Error: 71.2485
- R2 Score: 0.9997
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 2386.0513
- Mean Absolute Error: 1280.8626
- R2 Score: 0.8665


Gradient Boosting Regressor
Model performance for Training set
- Root Mean Squared Error: 2030.0182
- Mean Absolute Error: 1177.8247
- R2 Score: 0.8917
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 2372.9243
- Mean Absolute Error: 1519.1369
- R2 Score: 0.8680


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 1150.3812
- Mean Absolute Error: 464.6954
- R2 Score: 0.9652
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 2069.4677
- Mean Absolute Error: 953.4848
- R2 Score: 0.8996




### 7. Hyperparameter Tuning using Bayesian Optimization

In [26]:
# Bayesian Optimization on Gradient Boosting
import optuna
from sklearn.model_selection import cross_val_score

def objective_gbr(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    }

    model = GradientBoostingRegressor(**params)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()
    return score

study_gbr = optuna.create_study(direction='maximize')  # maximize negative MSE = minimize MSE
study_gbr.optimize(objective_gbr, n_trials=50, n_jobs=-1)

print("Best GBR Params:", study_gbr.best_params)
print("Best GBR Score:", study_gbr.best_value)

[I 2025-04-04 20:04:20,141] A new study created in memory with name: no-name-02ebcf10-b7a0-4c05-97d5-37a08a2c5601
[I 2025-04-04 20:04:23,413] Trial 2 finished with value: -10912787.377654077 and parameters: {'n_estimators': 114, 'learning_rate': 0.07994707940129563, 'max_depth': 4, 'min_samples_split': 9, 'min_samples_leaf': 5, 'subsample': 0.8460668265816481, 'max_features': 'sqrt'}. Best is trial 2 with value: -10912787.377654077.
[I 2025-04-04 20:04:24,834] Trial 0 finished with value: -11450856.46951915 and parameters: {'n_estimators': 159, 'learning_rate': 0.01568487025430545, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 1, 'subsample': 0.702887773434642, 'max_features': 'log2'}. Best is trial 2 with value: -10912787.377654077.
[I 2025-04-04 20:04:25,098] Trial 3 finished with value: -12921520.811848404 and parameters: {'n_estimators': 176, 'learning_rate': 0.012618958916527796, 'max_depth': 4, 'min_samples_split': 6, 'min_samples_leaf': 5, 'subsample': 0.8703273804

Best GBR Params: {'n_estimators': 489, 'learning_rate': 0.026716324774950354, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 2, 'subsample': 0.904389658423455, 'max_features': 'log2'}
Best GBR Score: -9793388.595389893


In [77]:
# Use best parameters from Optuna study
best_gbr_params = study_gbr.best_params

# Train model using best parameters
gbr_best_model = GradientBoostingRegressor(**best_gbr_params)
gbr_best_model.fit(X_train, y_train)

# Predict on test set
y_pred = gbr_best_model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", round(mae, 4))
print("R² Score:", round(r2, 4))

Mean Absolute Error (MAE): 1432.3006
R² Score: 0.8727


In [16]:
# Bayesian Optimization on Random Forest
import optuna
from sklearn.model_selection import cross_val_score
def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'criterion': trial.suggest_categorical('criterion', ['squared_error', 'absolute_error', 'friedman_mse']),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    }

    model = RandomForestRegressor(**params)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()
    return score

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=50, n_jobs=-1)

print("Best RF Params:", study_rf.best_params)
print("Best RF Score:", study_rf.best_value)

[32m[I 2025-04-07 13:35:27,734][0m A new study created in memory with name: no-name-5a28702e-775f-4d3b-ad15-257623cde681[0m
[32m[I 2025-04-07 13:35:38,503][0m Trial 0 finished with value: -10877592.825608071 and parameters: {'n_estimators': 287, 'criterion': 'friedman_mse', 'max_depth': 19, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: -10877592.825608071.[0m
[32m[I 2025-04-07 13:35:51,004][0m Trial 3 finished with value: -12887415.013838667 and parameters: {'n_estimators': 261, 'criterion': 'absolute_error', 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: -10877592.825608071.[0m
[32m[I 2025-04-07 13:36:01,534][0m Trial 5 finished with value: -10895255.0580046 and parameters: {'n_estimators': 319, 'criterion': 'squared_error', 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: -10877592.825

Best RF Params: {'n_estimators': 390, 'criterion': 'friedman_mse', 'max_depth': 19, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None}
Best RF Score: -9002651.313008616


In [17]:
# Use best parameters from Optuna study
best_rf_params = study_rf.best_params

# Train model using best parameters
rf_best_model = RandomForestRegressor(**best_rf_params)
rf_best_model.fit(X_train, y_train)

# Predict on test set
y_pred_rf = rf_best_model.predict(X_test)

# Evaluate
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest - Mean Absolute Error (MAE):", round(mae_rf, 4))
print("Random Forest - R² Score:", round(r2_rf, 4))

Random Forest - Mean Absolute Error (MAE): 1014.5223
Random Forest - R² Score: 0.9032


### 8. Saving the trained Random Forest Model

In [18]:
rf_best_model.feature_importances_

array([6.75264228e-01, 9.21055836e-04, 5.54416133e-03, 1.05622642e-01,
       3.85730690e-02, 1.50950809e-02, 7.20295031e-02, 4.10028934e-04,
       2.31847508e-02, 3.13895608e-02, 3.19659188e-02])

In [19]:
import joblib
joblib.dump(rf_best_model, 'new_rf_model.pkl')

['new_rf_model.pkl']

### 9. Identifying feature importances from the trained model

In [21]:
importances = rf_best_model.feature_importances_
feature_names = rf_best_model.feature_names_in_

feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

print(feature_importance_df)

                    Feature  Importance
0                       Age    0.675264
3            AnyTransplants    0.105623
6                    Weight    0.072030
4        AnyChronicDiseases    0.038573
10                      BMI    0.031966
9    NumberOfMajorSurgeries    0.031390
8   HistoryOfCancerInFamily    0.023185
5                    Height    0.015095
2     BloodPressureProblems    0.005544
1                  Diabetes    0.000921
7            KnownAllergies    0.000410
