In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("insurance_pre.csv")
df

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 62.8+ KB


In [7]:
dataset=pd.get_dummies(df,drop_first=True,dtype="int" #smoker(Yes-1,No-0), sex (male-1, female-0)

In [9]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [11]:
dataset.info() # Checking the datatype once we created the columns using get_dummies

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         1338 non-null   int64  
 1   bmi         1338 non-null   float64
 2   children    1338 non-null   int64  
 3   charges     1338 non-null   float64
 4   sex_male    1338 non-null   int32  
 5   smoker_yes  1338 non-null   int32  
dtypes: float64(2), int32(2), int64(2)
memory usage: 52.4 KB


In [13]:
dataset.isnull().sum() # Checking whether we have any null values or not

age           0
bmi           0
children      0
charges       0
sex_male      0
smoker_yes    0
dtype: int64

In [15]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [17]:
x = dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']]
y = dataset['charges'].values

x,y

(      age     bmi  children  sex_male  smoker_yes
 0      19  27.900         0         0           1
 1      18  33.770         1         1           0
 2      28  33.000         3         1           0
 3      33  22.705         0         1           0
 4      32  28.880         0         1           0
 ...   ...     ...       ...       ...         ...
 1333   50  30.970         3         1           0
 1334   18  31.920         0         0           0
 1335   18  36.850         0         0           0
 1336   21  25.800         0         0           0
 1337   61  29.070         0         0           1
 
 [1338 rows x 5 columns],
 array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
        29141.3603]))

In [19]:
#split into training set and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 0)

In [21]:
X_train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
1163,18,28.215,0,0,0
196,39,32.800,0,0,0
438,52,46.750,5,0,0
183,44,26.410,0,0,0
1298,33,27.455,2,1,0
...,...,...,...,...,...
763,27,26.030,0,1,0
835,42,35.970,2,1,0
1216,40,25.080,0,1,0
559,19,35.530,0,1,0


In [23]:
# Performing standard scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# !pip install xgboost

In [25]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

In [27]:
# Defining Algorithms and Their Parameter Grids
algorithms ={
    'LinearRegression' :{
        'model' : LinearRegression(),
        'param_grid' : {} #No Hyperparmeter to tune
    },
    'SVM' : {
        'model' : SVR(),
        'param_grid' : {
            'kernel' :['linear','rbf','poly'], #Determines the shape of the decision boundary.
            'C': [0.1,1,10,100,1000], #Regularization parameter to control training and test error
            'gamma':['auto','scale'], # Gamma for kernel coefficient.
            'epsilon': [0.1, 0.2, 0.5] # Controls the tolerance for errors in regression.
        }
    },
    'DecisionTree' : {
        'model' : DecisionTreeRegressor(),
        'param_grid' :{
            'criterion' :['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],  #To measure the quality of a split.
            'max_features': ['sqrt', 'log2'], #The number of features to consider when looking for the best split
            'max_depth':[3, 5, 10, None], # maximum depth of the tree
            'min_samples_split':[2,5,10], # The minimum number of samples required to split an internal node.
            'splitter' : ['best', 'random']
        }
    },
    'RandomForest' :{ # Bagging Technique
        'model' : RandomForestRegressor(),
        'param_grid' :{
            'n_estimators' :[100, 200,300],
            'criterion' :['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'max_features': ['sqrt', 'log2'],
            'max_depth':[3, 5, 10, None],
            'ccp_alpha': [0.0, 0.01, 0.1, 1.0] #U sed for pruning the decision trees within the random forest to reduce overfitting.(post-pruning)
        }        
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(),
        'param_grid': {
            'n_estimators': [100, 200, 300],            # Number of boosting stages
            'learning_rate': [0.01, 0.05, 0.1],         # Controls step size
            'max_depth': [3, 5, 10],                    # Maximum tree depth
            'min_samples_split': [2, 5, 10]             # Minimum samples to split a node
        }
    },
    'XGBoost' : {
        'model': XGBRegressor(),
        'param_grid' : {
            'n_estimators': [100, 200, 300],           # Number of trees
            'learning_rate': [0.01, 0.05, 0.1],         # Step size shrinkage
            'max_depth': [3, 5, 10],                    # Maximum depth of trees
            'min_child_weight': [1, 3, 5]               # To ensures the node has enough importance or confidence to exist.          
        }
    },
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'n_neighbors': [3, 5, 10, 20],              # Number of neighbors
            'weights': ['uniform', 'distance'],         # Weighting method
            'algorithm': ['auto', 'ball_tree', 'kd_tree'],  # Search algorithm
            'p': [1, 2]                                  # Distance metric (1=Manhattan, 2=Euclidean)
        }
    }    
}

In [29]:
# Perform GridSearch for each algorithm
results = {}
for algo_name,config_name in algorithms.items():
    print(f"Training {algo_name} : ")
    grid = GridSearchCV(estimator = config_name['model'], 
                        param_grid = config_name['param_grid'],
                        scoring='r2',                        
                        refit=True,
                        verbose=3,
                        n_jobs=-1)
    grid.fit(X_train,y_train)

    # Predict on test data
    y_pred = grid.predict(X_test)

    # Calculate r2 score
    r2score = r2_score(y_test,y_pred)

    # Store results

    results[algo_name] = {
        'best_params' : grid.best_params_,
        'r2_score' : r2score,
        'best_model': grid.best_estimator_
    }



Training LinearRegression : 
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training SVM : 
Fitting 5 folds for each of 90 candidates, totalling 450 fits
Training DecisionTree : 
Fitting 5 folds for each of 192 candidates, totalling 960 fits
Training RandomForest : 
Fitting 5 folds for each of 384 candidates, totalling 1920 fits
Training GradientBoosting : 
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Training XGBoost : 
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Training KNeighborsRegressor : 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [41]:
results.items()

dict_items([('LinearRegression', {'best_params': {}, 'r2_score': 0.7894790349867009, 'best_model': LinearRegression()}), ('SVM', {'best_params': {'C': 1000, 'epsilon': 0.5, 'gamma': 'auto', 'kernel': 'poly'}, 'r2_score': 0.856675173876146, 'best_model': SVR(C=1000, epsilon=0.5, gamma='auto', kernel='poly')}), ('DecisionTree', {'best_params': {'criterion': 'friedman_mse', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'splitter': 'best'}, 'r2_score': 0.2499092557445668, 'best_model': DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,
                      max_features='sqrt')}), ('RandomForest', {'best_params': {'ccp_alpha': 0.0, 'criterion': 'absolute_error', 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 200}, 'r2_score': 0.8869141774113195, 'best_model': RandomForestRegressor(criterion='absolute_error', max_depth=10,
                      max_features='log2', n_estimators=200)}), ('GradientBoosting', {'best_params': {'learning_rate': 0.05, 'max_de

In [43]:
# Print Results
for algo_name, result in results.items():
    print(f"\n{algo_name}:")
    print(f"Best Parameters: {result['best_params']}")
    print(f"R2 Score: {result['r2_score']}")


LinearRegression:
Best Parameters: {}
R2 Score: 0.7894790349867009

SVM:
Best Parameters: {'C': 1000, 'epsilon': 0.5, 'gamma': 'auto', 'kernel': 'poly'}
R2 Score: 0.856675173876146

DecisionTree:
Best Parameters: {'criterion': 'friedman_mse', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'splitter': 'best'}
R2 Score: 0.2499092557445668

RandomForest:
Best Parameters: {'ccp_alpha': 0.0, 'criterion': 'absolute_error', 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 200}
R2 Score: 0.8869141774113195

GradientBoosting:
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 100}
R2 Score: 0.8913563537931856

XGBoost:
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100}
R2 Score: 0.8927017681759811

KNeighborsRegressor:
Best Parameters: {'algorithm': 'ball_tree', 'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}
R2 Score: 0.8654160885764687


In [45]:
# Finding the best algorithm

best_algo_name = max(results, key=lambda k: results[k]['r2_score'])
best_model = results[best_algo_name]['best_model']

In [47]:
import pickle

# Save the best model using pickle
filename="insurance_final_model.pkl"

with open(filename,'wb') as file: #using with we can close the file automatically.
    pickle.dump(best_model, file)

print(f"Best model ({best_algo_name}) saved to {filename}")


Best model (XGBoost) saved to insurance_final_model.pkl
