In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import pickle
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the insurance dataset CSV from the mounted Google Drive project folder.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/insurance_premium_prediction/data/insurance_c.csv"
)

## 5. Feature Engineering

In [None]:
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [None]:
# Preview the first five target values.
y.head(n=5)

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, y_test.shape

((1069, 6), (268,))

Column Transformer

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns with object dtype are treated as categorical; all others as numeric.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

one_hot = OneHotEncoder()
standard_scaling = StandardScaler()

# The transformed columns follow the order of this list:
# one-hot encoded categoricals first, then the standardized numerics.
preprocessing_steps = [
    ("OneHotEncoder", one_hot, cat_features),
    ("StandardScaler", standard_scaling, num_features),
]

# remainder="passthrough" keeps any column not selected above unchanged.
ct = ColumnTransformer(transformers=preprocessing_steps, remainder="passthrough")

In [None]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split (the tuple is evaluated left to right).
X_train_t, X_test_t = ct.fit_transform(X_train), ct.transform(X_test)

In [None]:
# Inspect the first transformed training row.
X_train_t[0, :]

array([ 0.        ,  1.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        , -1.1576804 , -0.99692768,
       -0.90790804])

In [None]:
# Inspect the first transformed test row.
X_test_t[0, :]

array([ 0.        ,  1.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.70051832, -1.3267337 ,
       -0.90790804])

## 6. Model building

In [None]:
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions aligned with ``true``.

    Returns:
        A 4-tuple ``(mae, mse, rmse, r2_square)`` holding the mean absolute
        error, mean squared error, root mean squared error and R2 score.
    """
    # Order matters: callers unpack the result positionally.
    metric_fns = (
        mean_absolute_error,
        mean_squared_error,
        root_mean_squared_error,
        r2_score,
    )
    return tuple(metric(true, predicted) for metric in metric_fns)

In [None]:
# Baseline models with default hyperparameters. Estimators that rely on
# randomness get a fixed seed so the scores printed below are reproducible
# across kernel restarts (the same seed is used in the tuning section).
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNN": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
}

r2_scores = {}       # model name -> R2 on the test split
trained_models = {}  # model name -> fitted estimator

for name, model in models.items():
    model.fit(X_train_t, y_train)
    trained_models[name] = model

    y_train_pred = model.predict(X_train_t)
    y_test_pred = model.predict(X_test_t)

    # evaluate_model returns (mae, mse, rmse, r2); only RMSE and R2 are printed.
    train_mae, train_mse, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_mse, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)
    r2_scores[name] = test_r2

    # A large gap between train and test scores indicates overfitting.
    print(f"- {name}")
    print(f"Train R2: {train_r2:.4f} | Test R2: {test_r2:.4f}")
    print(f"Train RMSE: {train_rmse:.4f} | Test RMSE: {test_rmse:.4f}")
    print("-" * 35)

- Linear Regression
Train R2: 0.7299 | Test R2: 0.8069
Train RMSE: 6081.1069 | Test RMSE: 5956.3429
-----------------------------------
- Lasso
Train R2: 0.7299 | Test R2: 0.8068
Train RMSE: 6081.1094 | Test RMSE: 5957.6161
-----------------------------------
- Ridge
Train R2: 0.7299 | Test R2: 0.8064
Train RMSE: 6081.1696 | Test RMSE: 5964.2765
-----------------------------------
- KNN
Train R2: 0.8293 | Test R2: 0.7816
Train RMSE: 4834.7794 | Test RMSE: 6335.1525
-----------------------------------
- Decision Tree
Train R2: 1.0000 | Test R2: 0.8043
Train RMSE: 0.0000 | Test RMSE: 5996.9350
-----------------------------------
- Random Forest
Train R2: 0.9748 | Test R2: 0.8800
Train RMSE: 1855.8825 | Test RMSE: 4696.5778
-----------------------------------
- XGBoost
Train R2: 0.9951 | Test R2: 0.8600
Train RMSE: 822.5030 | Test RMSE: 5072.3697
-----------------------------------
- AdaBoost
Train R2: 0.8041 | Test R2: 0.8470
Train RMSE: 5179.4337 | Test RMSE: 5302.3927
-----------------

In [None]:
# Rank the baseline models by test-set R2, best first.
(
    pd.DataFrame(r2_scores.items(), columns=["Model Name", "R2 Score"])
    .sort_values(by="R2 Score", ascending=False)
)

Unnamed: 0,Model Name,R2 Score
5,Random Forest,0.879961
6,XGBoost,0.859983
7,AdaBoost,0.846996
0,Linear Regression,0.806929
1,Lasso,0.806846
2,Ridge,0.806414
4,Decision Tree,0.804288
3,KNN,0.78159


Random Forest Regressor is the top performer on the test set, followed by XGBRegressor and AdaBoost Regressor. Simpler models like Linear Regression, Lasso, and Ridge offer decent performance with easier interpretability, while Decision Tree and KNN (K-Neighbors Regressor) are the least effective on this dataset.

Hyperparameter Tuning

In [None]:
# Candidate models for tuning, each seeded so its fits are reproducible.
# Note: this rebinds `models`, replacing the baseline dictionary defined earlier.
from scipy.stats import randint, uniform

models = {
    "Random_Forest_Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "AdaBoost_Regressor": AdaBoostRegressor(random_state=42),
}

# Search distributions, keyed by the same names as `models`.
# scipy's randint(low, high) draws integers in [low, high);
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_grids = {
    "Random_Forest_Regressor": dict(
        n_estimators=randint(100, 1000),
        max_depth=randint(10, 50),
        min_samples_split=randint(2, 20),
        min_samples_leaf=randint(1, 10),
    ),
    "XGBRegressor": dict(
        n_estimators=randint(100, 1000),
        learning_rate=uniform(0.01, 0.3),
        max_depth=randint(3, 10),
    ),
    "AdaBoost_Regressor": dict(
        n_estimators=randint(50, 500),
        learning_rate=uniform(0.01, 1.5),
    ),
}

In [None]:
# Run a small randomized search for each candidate (5 sampled configurations,
# 3-fold cross-validation, scored by R2) and keep each search's best estimator.
best_models = {}
for name, estimator in models.items():
    print(f"Training and tuning {name}...")
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_grids[name],
        n_iter=5,
        scoring="r2",
        cv=3,
        random_state=42,
    )
    search.fit(X_train_t, y_train)
    best_model = search.best_estimator_
    best_models[name] = best_model

Training and tuning Random_Forest_Regressor...
Training and tuning XGBRegressor...
Training and tuning AdaBoost_Regressor...


In [None]:
# Show the tuned estimator kept for each candidate model.
best_models

{'Random_Forest_Regressor': RandomForestRegressor(max_depth=28, min_samples_leaf=7, min_samples_split=12,
                       n_estimators=558, random_state=42),
 'XGBRegressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None,
              learning_rate=np.float64(0.027425083650459835), max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=199, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...),
 'AdaBoost_Regressor': A

In [None]:
# Track the tuned model with the highest R2 on the test split.
# NOTE(review): picking the winner on the test split makes its reported test
# score optimistic; a validation split or the CV score would avoid that.
best_model_final = {"model_name": None, "model": None, "r2": -float("inf")}

print("\nFinal Evaluation on Test Set:")
for name, model in best_models.items():
    y_pred = model.predict(X_test_t)
    mae, mse, rmse, r2 = evaluate_model(y_test, y_pred)
    print(f"{name} - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.2f}")

    # Strict comparison: on a tie, the model seen first is kept.
    if r2 > best_model_final["r2"]:
        best_model_final.update(model_name=name, model=model, r2=r2)


Final Evaluation on Test Set:
Random_Forest_Regressor - RMSE: 4255.24, MAE: 2415.41, R²: 0.90
XGBRegressor - RMSE: 4565.85, MAE: 2452.09, R²: 0.89
AdaBoost_Regressor - RMSE: 5388.20, MAE: 4682.89, R²: 0.84


### Creating Pickle File

In [None]:
# NOTE: despite its name, this variable holds whichever tuned model scored best.
model_name = best_model_final["model_name"]
Random_Forest_Regressor = best_model_final["model"]

# The pickle file is named after the winning model.
model_pkl_path = "/content/drive/MyDrive/insurance_premium_prediction/pkl/" + model_name + ".pkl"
with open(model_pkl_path, "wb") as file:
    pickle.dump(Random_Forest_Regressor, file)

In [None]:
# Persist the fitted ColumnTransformer so new data can be preprocessed the same
# way as the training data.
transformer_pkl_path = "/content/drive/MyDrive/insurance_premium_prediction/pkl/transformer.pkl"
with open(transformer_pkl_path, "wb") as file:
    pickle.dump(ct, file)

## 7. Testing Model

In [None]:
# Reload the model that was saved above. The file name is built from
# `model_name` (the same value used when writing the pickle) rather than being
# hard-coded, so this cell still loads the right file if a model other than
# Random Forest wins the comparison.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from an untrusted file.
pickle_file_path = '/content/drive/MyDrive/insurance_premium_prediction/pkl/' + model_name + '.pkl'
with open(pickle_file_path, 'rb') as file:
    # The variable keeps its original name because a later cell references it.
    RandomForest_Regressor = pickle.load(file)

In [None]:
# Reload the fitted preprocessing transformer from disk.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from an untrusted file.
transformer_file_path = "/content/drive/MyDrive/insurance_premium_prediction/pkl/transformer.pkl"
with open(transformer_file_path, mode="rb") as file:
    transformer = pickle.load(file)

In [None]:
# Show the first test row as a one-row DataFrame (the raw, untransformed input).
X_test.iloc[:1]

Unnamed: 0,age,sex,bmi,children,smoker,region
899,49,male,22.515,0,no,northeast


In [None]:
# Preprocess the first test row with the reloaded transformer.
# NOTE(review): the name `input` shadows Python's builtin input(); it is kept
# here because the prediction cell below reads this variable.
first_test_row = X_test.iloc[[0]]
input = transformer.transform(first_test_row)
input

array([[ 0.        ,  1.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.70051832, -1.3267337 ,
        -0.90790804]])

In [None]:
# Compare the reloaded model's prediction with the true target for the same row.
(RandomForest_Regressor.predict(input), y_test.iloc[0])

(array([10220.72152168]), np.float64(8688.85885))