In [3]:
#Importing required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

In [4]:
#Loading dataset
df = pd.read_csv("dataset/insurance.csv")

In [5]:
#top 5 records of dataset
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [6]:
#Shape of dataset
df.shape

(1338, 7)

In [7]:
#Check duplicated values in dataset
df.duplicated().sum()

1

In [8]:
#Here we will drop one duplicated value from dataset.
df.drop_duplicates(inplace=True)

In [9]:
df.duplicated().sum()

0

In [10]:
#Check null values in dataset
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [11]:
#There is no missing values in dataset

In [12]:
#Columns of dataset
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'expenses'], dtype='object')

In [13]:
df.select_dtypes(include='object').columns

Index(['sex', 'smoker', 'region'], dtype='object')

In [14]:
#Independent and Dependent Variables
X= df.drop('expenses', axis = 1)
y= df[['expenses']]

In [15]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest


In [16]:
y.head()

Unnamed: 0,expenses
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86


In [17]:
X.shape, y.shape

((1337, 6), (1337, 1))

In [18]:
#Separate categorical features and numerical features
categorical_columns= X.dtypes[X.dtypes=='object'].index
numerical_columns= X.dtypes[X.dtypes!='object'].index

print("Categorical Features: ", categorical_columns)
print("Numerical Features: ", numerical_columns)

Categorical Features:  Index(['sex', 'smoker', 'region'], dtype='object')
Numerical Features:  Index(['age', 'bmi', 'children'], dtype='object')


In [19]:
#Unique values in categorical features
for col in categorical_columns:
  print(col, df[col].unique())
  print("="*25)


sex ['female' 'male']
smoker ['yes' 'no']
region ['southwest' 'southeast' 'northwest' 'northeast']


In [20]:
#Numerical pipeline
#Numerical pipeline
num_pipeline= Pipeline(
    steps= [
        ('scaler', StandardScaler()) 
    ]
)


#Categorical Pipeline
cat_pipeline= Pipeline(
    steps= [
        ('one_hot_encoder', OneHotEncoder()),
        ('scaler', StandardScaler(with_mean=False))
    ]
)

preprocessor= ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns)
])

In [23]:
#Train-test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(X,y, test_size= 0.2, random_state= 42)

In [24]:
x_train.shape, x_test.shape

((1069, 6), (268, 6))

In [25]:
y_train.shape, y_test.shape

((1069, 1), (268, 1))

In [26]:
#Transforming dataset
x_train= pd.DataFrame(preprocessor.fit_transform(x_train), columns= preprocessor.get_feature_names_out())
x_test= pd.DataFrame(preprocessor.transform(x_test), columns= preprocessor.get_feature_names_out())

### Model Training

In [27]:
#Create function for evaluating metrics of the models

def evalute_models(true, predicted):
  mae= mean_absolute_error(true, predicted)
  mse = mean_squared_error(true, predicted)
  rmse= np.sqrt(mean_squared_error(true, predicted))
  score= r2_score(true, predicted)
  return mae, rmse, score

In [28]:
#Training multiple models

models= {
    "Linear Regression": LinearRegression(),
    'Ridge': Ridge(),
    "Lasso": Lasso(),
    "KNieghborsRegressor": KNeighborsRegressor(),
    "DecisionTree Regressor": DecisionTreeRegressor(),
    "RandomForest Regressor": RandomForestRegressor(),
    "XGB Regressor": XGBRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor()
}

model_list= []
r2_score_list= []

for i in range(len(list(models))):
  model= list(models.values())[i]
  #Train the model
  model.fit(x_train, y_train)
  #Make Prediction
  y_predict = model.predict(x_test)
  
  mae, rmse, score= evalute_models(y_test, y_predict)
  print(list(models.keys())[i])

  model_list.append(list(models.keys())[i])
  r2_score_list.append(score)
  print("Model Training Performance")
  print("RMSE: ", rmse*100)
  print("MAE: ", mae*100)
  print("R2 Score: ", score*100)
  print("="*30)

Linear Regression
Model Training Performance
RMSE:  592151.1252587889
MAE:  414703.43656716414
R2 Score:  80.91802039485573
Ridge
Model Training Performance
RMSE:  595831.8836352848
MAE:  417847.2061010102
R2 Score:  80.68005936059993
Lasso
Model Training Performance
RMSE:  595720.8475870077
MAE:  417746.77259615954
R2 Score:  80.68725941160602
KNieghborsRegressor
Model Training Performance
RMSE:  551134.0913722184
MAE:  349434.4276119403
R2 Score:  83.46999970390272
DecisionTree Regressor
Model Training Performance
RMSE:  637761.5134862764
MAE:  293567.0298507462
R2 Score:  77.8652345251563
RandomForest Regressor
Model Training Performance
RMSE:  474729.3232816422
MAE:  267363.09600746265
R2 Score:  87.73548366471795
XGB Regressor
Model Training Performance
RMSE:  509077.67336905457
MAE:  298977.5964947601
R2 Score:  85.89651598244689
GradientBoostingRegressor
Model Training Performance
RMSE:  432599.69196550176
MAE:  257043.2389249326
R2 Score:  89.81571084886923
AdaBoostRegressor
Mo

In [29]:
r2_score_list

[0.8091802039485573,
 0.8068005936059992,
 0.8068725941160603,
 0.8346999970390271,
 0.778652345251563,
 0.8773548366471795,
 0.8589651598244689,
 0.8981571084886922,
 0.8683382928765038]

In [30]:
pd.DataFrame({"Model_name":model_list,"R2_scores": r2_score_list}).sort_values(by= 'R2_scores' , ascending= False)

Unnamed: 0,Model_name,R2_scores
7,GradientBoostingRegressor,0.898157
5,RandomForest Regressor,0.877355
8,AdaBoostRegressor,0.868338
6,XGB Regressor,0.858965
3,KNieghborsRegressor,0.8347
0,Linear Regression,0.80918
2,Lasso,0.806873
1,Ridge,0.806801
4,DecisionTree Regressor,0.778652


In [31]:
#From the above we can say that, out of all above regression models, the GradientBoostingRegressor moels gives better accuracy 89% with good RMSE value.