In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

In [16]:
df=pd.read_csv('insurance.csv',encoding='latin1')

In [17]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [19]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [20]:
## Data Cleaning
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
categorical_features = [feature for feature in df.columns if df[feature].dtypes == 'O']
categorical_features

['sex', 'smoker', 'region']

In [23]:
numerical_features=[feature for feature in df.columns if df[feature].dtypes != 'O']
numerical_features

['age', 'bmi', 'children', 'charges']

In [26]:
## Indpendent and dependent features
from sklearn.model_selection import train_test_split
X = df.drop(['charges'], axis=1)
y = df['charges']

In [27]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
onehot_columns = ['sex','smoker','region']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features)
        
    ],remainder='passthrough'
    
)

In [28]:
X=preprocessor.fit_transform(X)

In [29]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((1070, 11), (268, 11))

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [32]:
##Create a Function to Evaluate Model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [37]:
## Beginning Model Training
models = {
    "Linear Regression": LinearRegression(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Adaboost":AdaBoostRegressor(),
    "XGboost":XGBRegressor()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 6106.1214
- Mean Absolute Error: 4216.1593
- R2 Score: 0.7417
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5800.6789
- Mean Absolute Error: 4194.3274
- R2 Score: 0.7833


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 4446.0204
- Mean Absolute Error: 2679.2378
- R2 Score: 0.8630
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5552.9885
- Mean Absolute Error: 3512.4474
- R2 Score: 0.8014


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 494.2060
- Mean Absolute Error: 29.5725
- R2 Score: 0.9983
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6686.4342
- Mean Absolute Error: 3209.1698
- R2 Score: 0.7120


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 1882.3792
- Mean Abs

In [41]:
##random forest and xgboost is performing well
rf_params = {"max_depth": [5, 8, None, 10],
             "max_features": [5, 7, "auto", 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 300,500]}
xg_params={"max_depth": [5, 8, None, 10],
           "n_estimators": [100, 300,500],
           "reg_alpha": [0, 0.01, 0.1],
           "reg_lambda": [0.1, 0.5, 1.0]}           

In [42]:
# Models list for Hyperparameter tuning
randomcv_models = [("RF", RandomForestRegressor(), rf_params),
                   ("XGB", XGBRegressor(),xg_params)
                   ]

In [43]:
##Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for RF -------------------
{'n_estimators': 300, 'min_samples_split': 15, 'max_features': 7, 'max_depth': 5}
---------------- Best Params for XGB -------------------
{'reg_lambda': 1.0, 'reg_alpha': 0.01, 'n_estimators': 100, 'max_depth': None}


In [44]:
## Retraining the models with best parameters
models = {
    "Random Forest Regressor": RandomForestRegressor(n_estimators=300, min_samples_split=15, max_features=7, max_depth=5),
     "XGBoost Regressor": XGBRegressor(reg_lambda=1.0,reg_alpha=0.01,n_estimators=100,max_depth=None)
    
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    
    print('='*35)
    print('\n')

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 4137.6973
- Mean Absolute Error: 2345.7882
- R2 Score: 0.8814
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4350.7607
- Mean Absolute Error: 2508.2687
- R2 Score: 0.8781


XGBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 875.6809
- Mean Absolute Error: 458.3725
- R2 Score: 0.9947
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4902.7338
- Mean Absolute Error: 2748.2658
- R2 Score: 0.8452




In [45]:
##xgboost is performing same
##rf has working well on test data bt on training data its performance is being sacrificed