In [1]:
import numpy as np
import pandas as pd

#Training
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
x = df.drop(columns=['charges'], axis=1)

In [5]:
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [6]:
y = df['charges']

In [7]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [8]:
# Create Column Transformer 

num_features = x.select_dtypes(exclude="object").columns
cat_features = x.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [9]:
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder()

In [10]:
processor = ColumnTransformer(
    [
        ("StandardScaler", num_transformer, num_features),
        ("OneHotEncoder", cat_transformer, cat_features)
    ]
)

In [11]:
x = processor.fit_transform(x)

In [12]:
x.shape

(1338, 11)

In [13]:
# Seprate dataset for train and test split

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
x_train.shape, x_test.shape

((936, 11), (402, 11))

#### Create an Evaluate Function to give all metrics after model Training

In [14]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [15]:
models = {
    "Linear Regression" : LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 6142.5198
- Mean Absolute Error: 4251.2558
- R2 Score: 0.7424
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5812.1003
- Mean Absolute Error: 4145.4506
- R2 Score: 0.7696


Lasso
Model performance for Training set
- Root Mean Squared Error: 6142.5223
- Mean Absolute Error: 4251.2480
- R2 Score: 0.7424
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5812.1939
- Mean Absolute Error: 4145.7578
- R2 Score: 0.7696


Ridge
Model performance for Training set
- Root Mean Squared Error: 6142.6014
- Mean Absolute Error: 4256.0118
- R2 Score: 0.7424
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5813.4693
- Mean Absolute Error: 4151.3837
- R2 Score: 0.7695


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 4954.3565
- Mean Absolute Error: 2988.7511


In [16]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=["Model Name", "R2_Score"]).sort_values(by=["R2_Score"], ascending=False)

Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.856166
7,CatBoosting Regressor,0.854802
6,XGBRegressor,0.83167
8,AdaBoost Regressor,0.826498
0,Linear Regression,0.769612
1,Lasso,0.769604
2,Ridge,0.769503
3,K-Neighbors Regressor,0.763999
4,Decision Tree,0.72906
