In [1]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# data preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
# Modelling
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
# Model evaluation
from src.model_performance import evaluate_model, print_evaluated_results
# ensemble modelling
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
# save model
import joblib


In [3]:
df = pd.read_csv("data/insurance.csv")

In [4]:
df.shape

(1338, 7)

In [5]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


#### Data Preprocessing
- Handling duplicates by dropping 
- Get features and target variable
- Train-test split 
- Encoding categorical variables  
- Feature scaling  


#### Dropping the duplicate row

In [6]:
df.drop_duplicates(inplace=True)

#### Getting the features(X) and target variable(y)

In [7]:
target = 'charges'
X = df.drop(target, axis=1)
y = df[target]

In [8]:
# Define which columns should be encoded and which should be scaled
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns


#### Train - test split

In [9]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
 #Define transformers
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

#### Model Training

In [11]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_names = []
test_r2_scores = []
test_rmse_scores = []

for name, model in models.items():
    # Build pipeline with preprocessing + model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Train on raw data 
    pipeline.fit(X_train, y_train)

    # Predict on train and test
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    # Evaluate performance
    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"{name}")
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(train_mae))
    print("- R2 Score: {:.4f}".format(train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(test_mae))
    print("- R2 Score: {:.4f}".format(test_r2))

    model_names.append(name)
    test_r2_scores.append(test_r2)
    test_rmse_scores.append(test_rmse)
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 6081.1069
- Mean Absolute Error: 4181.9015
- R2 Score: 0.7299
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5956.3429
- Mean Absolute Error: 4177.0456
- R2 Score: 0.8069


Lasso
Model performance for Training set
- Root Mean Squared Error: 6081.1094
- Mean Absolute Error: 4182.0241
- R2 Score: 0.7299
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5957.6178
- Mean Absolute Error: 4177.8750
- R2 Score: 0.8068


Ridge
Model performance for Training set
- Root Mean Squared Error: 6081.1696
- Mean Absolute Error: 4185.9403
- R2 Score: 0.7299
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5964.2765
- Mean Absolute Error: 4185.4070
- R2 Score: 0.8064


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 4834.7794
- Mean Absolute Error: 2918.5554


### Results

In [12]:
df_results = pd.DataFrame(
    list(zip(model_names, test_r2_scores, test_rmse_scores)),
    columns=['Model Name', 'R2_Score', 'RMSE']
).sort_values(by=["R2_Score"], ascending=False)

df_results


Unnamed: 0,Model Name,R2_Score,RMSE
7,CatBoosting Regressor,0.884436,4608.206251
5,Random Forest Regressor,0.879616,4703.322329
6,XGBRegressor,0.860713,5059.140897
8,AdaBoost Regressor,0.860252,5067.506165
0,Linear Regression,0.806929,5956.342894
1,Lasso,0.806846,5957.617816
2,Ridge,0.806414,5964.276526
4,Decision Tree,0.791415,6191.01854
3,K-Neighbors Regressor,0.78159,6335.152462


### Hyperparameter Tuning

In [13]:
# Creating the hyperparameter grid for CatBoost
cat_param_grid = {
    'model__depth': [4, 5, 6, 7, 8, 9, 10],
    'model__learning_rate': [0.01, 0.02, 0.03, 0.04],
    'model__iterations': [300, 400, 500, 600]
}

# Define the pipeline
cat_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', CatBoostRegressor(verbose=False))
])

# RandomizedSearchCV with pipeline
cat_search = RandomizedSearchCV(
    estimator=cat_pipeline,
    param_distributions=cat_param_grid,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    scoring='r2',
    random_state=42
)

# Fit search
cat_search.fit(X_train, y_train)

# Results
print("Best CatBoost Params:", cat_search.best_params_)
print("Best CatBoost R2_score:", cat_search.best_score_)


Best CatBoost Params: {'model__learning_rate': 0.01, 'model__iterations': 400, 'model__depth': 4}
Best CatBoost R2_score: 0.8429302962243813


In [14]:
# Selecting best model
best_cbr = cat_search.best_estimator_

# Evaluate Train and Test dataset
print_evaluated_results(best_cbr,X_train,y_train,X_test,y_test)

Model performance for Training set
- Root Mean Squared Error: 4431.4391
- Mean Absolute Error: 2535.9612
- R2 Score: 0.8566
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4358.4370
- Mean Absolute Error: 2575.4553
- R2 Score: 0.8966


#### Tuning XGBRegressor

In [15]:
# Hyperparameter grid (note the 'model__' prefix)
xgb_param_grid = {
    'model__learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'model__max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
    'model__min_child_weight': [1, 3, 5, 7],
    'model__gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
    'model__colsample_bytree': [0.3, 0.4, 0.5, 0.7],
    'model__n_estimators': [300, 400, 500, 600]
}

# Pipeline including preprocessing
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42))
])

# Randomized SearchCV using pipeline
xgb_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=xgb_param_grid,
    n_iter=100,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

# Fit the search
xgb_search.fit(X_train, y_train)

# Results
print("Best XGBRegressor Params:", xgb_search.best_params_)
print("Best XGBRegressor R2_score:", xgb_search.best_score_)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best XGBRegressor Params: {'model__n_estimators': 500, 'model__min_child_weight': 7, 'model__max_depth': 3, 'model__learning_rate': 0.05, 'model__gamma': 0.3, 'model__colsample_bytree': 0.5}
Best XGBRegressor R2_score: 0.8322124061148763


In [16]:
# Selecting best xgb model
best_xgb = xgb_search.best_estimator_

# Evaluate Train and Test dataset
print_evaluated_results(best_xgb,X_train,y_train,X_test,y_test)

Model performance for Training set
- Root Mean Squared Error: 3956.9107
- Mean Absolute Error: 2199.1965
- R2 Score: 0.8856
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4420.8725
- Mean Absolute Error: 2666.8002
- R2 Score: 0.8936


#### Random forest hyperparameter tuning

In [17]:
rf_params = {
    'model__n_estimators': [100, 200, 300, 400, 500],
    'model__max_depth': [None, 5, 10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt']
}

# Define pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# RandomizedSearchCV
rf_search = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=rf_params,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    scoring='r2',
    random_state=42,
    verbose=1
)

# Fit the search
rf_search.fit(X_train, y_train)

# Output best results
print("Best Random Forest Params:", rf_search.best_params_)
print("Best Random Forest R2_score:", rf_search.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Random Forest Params: {'model__n_estimators': 300, 'model__min_samples_split': 2, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20}
Best Random Forest R2_score: 0.826439413908659


In [18]:
# Selecting best rf model
best_rf = rf_search.best_estimator_

# Evaluate Train and Test dataset
print_evaluated_results(best_rf,X_train,y_train,X_test,y_test)

Model performance for Training set
- Root Mean Squared Error: 1788.3521
- Mean Absolute Error: 1018.2724
- R2 Score: 0.9766
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4787.5756
- Mean Absolute Error: 2766.5365
- R2 Score: 0.8753


In [19]:
ada_params = {
    'model__n_estimators': [50, 100, 200, 300, 500],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 1.0],
    'model__loss': ['linear', 'square', 'exponential']
}

# Define pipeline
ada_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', AdaBoostRegressor(random_state=42))
])

# Randomized search
ada_search = RandomizedSearchCV(
    estimator=ada_pipeline,
    param_distributions=ada_params,
    n_iter=100,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

# Fit search
ada_search.fit(X_train, y_train)

# Results
print("Best AdaBoost Params:", ada_search.best_params_)
print("Best AdaBoost R2_score:", ada_search.best_score_)


Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best AdaBoost Params: {'model__n_estimators': 50, 'model__loss': 'exponential', 'model__learning_rate': 0.01}
Best AdaBoost R2_score: 0.8311618999296911


In [20]:
# Selecting best adaboost model
best_ada = ada_search.best_estimator_

# Evaluate Train and Test dataset
print_evaluated_results(best_ada,X_train,y_train,X_test,y_test)

Model performance for Training set
- Root Mean Squared Error: 4548.8813
- Mean Absolute Error: 2858.2712
- R2 Score: 0.8489
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4462.1888
- Mean Absolute Error: 2936.6425
- R2 Score: 0.8916


In [None]:
joblib.dump(best_cbr, '/content/drive/MyDrive/insurance_pricing_ml/models/catboost_model.pkl')
joblib.dump(best_xgb, '/content/drive/MyDrive/insurance_pricing_ml/models/xgboost_model.pkl')
joblib.dump(best_ada, '/content/drive/MyDrive/insurance_pricing_ml/models/adaboost_model.pkl')


['/content/drive/MyDrive/insurance_pricing_ml/models/adaboost_model.pkl']

#### Improving model performance with Ensemble technique

In [22]:
# Extract raw trained models from their pipelines
catboost_model = best_cbr.named_steps['model']
xgb_model = best_xgb.named_steps['model']
ada_model = best_ada.named_steps['model']

# Define the ensemble voting regressor with trained models
voting_reg = VotingRegressor(
    estimators=[
        ('catboost', catboost_model),
        ('xgboost', xgb_model),
        ('adaboost', ada_model)
    ]
)

# Wrap the entire ensemble in a pipeline with a single shared preprocessor
voting_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', voting_reg)
])

# Fit the ensemble pipeline
voting_pipeline.fit(X_train, y_train)

# Evaluate performance
print_evaluated_results(voting_pipeline, X_train, y_train, X_test, y_test)

Model performance for Training set
- Root Mean Squared Error: 4218.5835
- Mean Absolute Error: 2417.6043
- R2 Score: 0.8700
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4285.7039
- Mean Absolute Error: 2610.3511
- R2 Score: 0.9000


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression


# Extract trained models (already tuned) from their pipelines
catboost_model = best_cbr.named_steps['model']
xgb_model = best_xgb.named_steps['model']
ada_model = best_ada.named_steps['model']

stacking_model = StackingRegressor(
    estimators=[
        ('catboost', catboost_model),
        ('xgboost', xgb_model),
        ('adaboost', ada_model)
    ],
    final_estimator=LinearRegression(),
    cv=5,
    n_jobs=-1
)

# Meta-model
meta_model = LinearRegression()



# Wrap in a pipeline (this is now deployable)
stacking_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', stacking_model)
])

# Fit the full pipeline
stacking_pipeline.fit(X_train, y_train)

# Evaluate
print_evaluated_results(stacking_pipeline, X_train, y_train, X_test, y_test)


Model performance for Training set
- Root Mean Squared Error: 4335.6435
- Mean Absolute Error: 2394.1836
- R2 Score: 0.8627
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4209.5625
- Mean Absolute Error: 2454.5471
- R2 Score: 0.9036


In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Helper function
def evaluate_all_models(model, name, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae = mean_absolute_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)

    return {
        "Model": name,
        "Test RMSE": rmse,
        "Test MAE": mae,
        "Test R2 Score": r2
    }

# List of all models to compare
model_results = [
    evaluate_all_models(best_cbr, "CatBoost Regressor", X_train, y_train, X_test, y_test),
    evaluate_all_models(best_xgb, "XGBoost Regressor", X_train, y_train, X_test, y_test),
    evaluate_all_models(best_rf, "Random Forest Regressor", X_train, y_train, X_test, y_test),
    evaluate_all_models(best_ada, "AdaBoost Regressor", X_train, y_train, X_test, y_test),
    evaluate_all_models(voting_pipeline, "Voting Regressor", X_train, y_train, X_test, y_test),
    evaluate_all_models(stacking_pipeline, "Stacking Regressor", X_train, y_train, X_test, y_test)
]

# Create DataFrame
summary_df = pd.DataFrame(model_results).sort_values(by="Test R2 Score", ascending=False)

# Display
print(summary_df)

# Optionally export to CSV
#summary_df.to_csv("/content/drive/MyDrive/insurance_pricing_ml/models/model_comparison_summary.csv", index=False)


                     Model    Test RMSE     Test MAE  Test R2 Score
5       Stacking Regressor  4209.562460  2454.547060       0.903566
4         Voting Regressor  4285.703878  2610.351109       0.900045
0       CatBoost Regressor  4358.436979  2575.455341       0.896624
1        XGBoost Regressor  4420.872535  2666.800160       0.893641
3       AdaBoost Regressor  4462.188801  2936.642492       0.891644
2  Random Forest Regressor  4787.575587  2766.536526       0.875265


OSError: Cannot save file into a non-existent directory: '\content\drive\MyDrive\insurance_pricing_ml\models'

### 7. Model Comparison
- Compare top perfoming models
- Select best-performing model  


In [28]:
import os

In [31]:
# Define the directory and create it if it doesn't exist
save_dir = 'models'
os.makedirs(save_dir, exist_ok=True)

#  Save the stacking model
joblib.dump(stacking_pipeline, 'models/stacking_pipeline.pkl')

# Save the voting model
joblib.dump(voting_pipeline, 'models/voting_pipeline.pkl')

['models/voting_pipeline.pkl']

In [None]:
import joblib

# Assume pipeline is your full pipeline object with preprocessing + model
joblib.dump(pipeline, 'final_pipeline.pkl')
