In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score

In [None]:
df=pd.read_csv("insurance.csv")

In [None]:
df.head(5)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [None]:
df.shape

(1338, 7)

In [None]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Assuming 'X' contains features and 'y' contains target variable
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [None]:
# Encode categorical features
categorical_features = ['sex', 'smoker', 'region']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ], remainder='passthrough')


In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### **Linear regrassion**

In [None]:
# Create and train regression model
regression_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
regression_model.fit(X_train, y_train)

In [None]:

# Evaluate model performance
y_pred = regression_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2score = r2_score(y_test,y_pred)
print("R2 Score", r2score)

Mean Squared Error: 33596915.851361476
R2 Score 0.7835929767120722


### **Ridge**

In [None]:
# create and train regression model
from sklearn.linear_model import Ridge

regression_model_ridge= Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("regressor",Ridge())
])

regression_model_ridge.fit(X_train,y_train)

### **Lasso**

In [None]:
# Create and train regression model
from sklearn.linear_model import Lasso

regression_model_lasso = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=28))
])
regression_model_lasso.fit(X_train, y_train)

In [None]:
# Evaluate model Performance

y_pred_lasso=regression_model_lasso.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
print("mean squared error: ", mse)
r2score= r2_score(y_test,y_pred)
print("R2 score: ",r2score)

mean squared error:  33596915.851361476
R2 score:  0.7835929767120722


### **Elastic Net**

In [None]:
from sklearn.linear_model import ElasticNet

regression_model_elastic = Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("regressor",ElasticNet(alpha=0.1,l1_ratio=0.5))
])
regression_model_elastic.fit(X_train,y_train)

In [None]:
y_pred_elastic_net=regression_model_elastic.predict(X_test)

In [None]:
mse=mean_squared_error(y_test,y_pred_elastic_net)
print("mean squared error: ",mse)
r2score=r2_score(y_test,y_pred_elastic_net)
print("R2 score: ",r2score)

mean squared error:  36213757.726911016
R2 score:  0.766737174732858


### **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeRegressor

regression_model_decision_tree = Pipeline( steps=[
    ("preprocessor",preprocessor),
    ("regressor",DecisionTreeRegressor(max_depth=3))
])

regression_model_decision_tree.fit(X_train,y_train)

In [None]:
y_pred_decision_tree=regression_model_decision_tree.predict(X_test)

In [None]:
mse=mean_squared_error(y_test,y_pred_decision_tree)
print("Mean Squared Error: ",mse)
r2score=r2_score(y_test,y_pred_decision_tree)
print("R2 score: ",r2score)

Mean Squared Error:  22812669.85234084
R2 score:  0.8530572866320028


### **Random forest regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

regression_model_random_forest=Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("regressor", RandomForestRegressor())

])

regression_model_random_forest.fit(X_train,y_train)

In [None]:
y_pred_random_forest=regression_model_random_forest.predict(X_test)

In [None]:
mse=mean_squared_error(y_test,y_pred_random_forest)
print("mean squared error: ", mse)
r2score=r2_score(y_test,y_pred_random_forest)
print(r2score)

mean squared error:  21684857.565979283
0.8603218373663275


### **Gradient Boosting**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

regression_model_gradient = Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("regressor",GradientBoostingRegressor(learning_rate=0.1,n_estimators=100,max_depth=3))
])

regression_model_gradient.fit(X_train,y_train)

In [None]:
y_pred_gradient=regression_model_gradient.predict(X_test)

In [None]:
mse=mean_squared_error(y_test,y_pred_gradient)
print("mean squred error: ",mse)
r2score=r2_score(y_test,y_pred_gradient)
print("R2 score: ",r2score)

mean squred error:  18757537.157022573
R2 score:  0.879177517414907


### **AdaBoost**

In [None]:
from sklearn.ensemble import AdaBoostRegressor

regression_model_adaboost=Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("regressor", AdaBoostRegressor(n_estimators=100,learning_rate= 0.1 , loss="linear"))
])

regression_model_adaboost.fit(X_train,y_train)

In [None]:
y_pred_adaboost=regression_model_adaboost.predict(X_test)

In [None]:
mse=mean_squared_error(y_test,y_pred_adaboost)
print("mean squred error: ",mse)
r2score=r2_score(y_test,y_pred_adaboost)
print("R2 score: ",r2score)

mean squred error:  25727610.735556647
R2 score:  0.8342813465311958


### **XGBoost**

In [None]:
from xgboost import XGBRFRegressor

regression_model_xgboost=Pipeline( steps=[
    ("preprocessor",preprocessor),
    ("regressor",XGBRFRegressor(learning_rate=1,n_estimators=100,max_depth=3))
])

regression_model_xgboost.fit(X_train,y_train)

In [None]:
y_pred_xgboost=regression_model_xgboost.predict(X_test)

In [None]:
mse=mean_squared_error(y_test,y_pred_xgboost)
print("mean squred error: ",mse)
r2score=r2_score(y_test,y_pred_xgboost)
print("R2 score: ",r2score)

mean squred error:  20358774.78206341
R2 score:  0.8688635031897687


### **Support Vector Regressor**

In [None]:
from sklearn.svm import SVR

regression_model_svr=Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("regressor",SVR(kernel="rbf",epsilon=0.1,C=1.0))
])

regression_model_svr.fit(X_train,y_train)

In [None]:
y_pred_svr=regression_model_svr.predict(X_test)

In [None]:
mse=mean_squared_error(y_test,y_pred_svr)
print("mean squred error: ",mse)
r2score=r2_score(y_test,y_pred_svr)
print("R2 score: ",r2score)

mean squred error:  166502152.13488975
R2 score:  -0.07248639351177277


### **K Nearest Neighbors Regression**

In [111]:
from sklearn.neighbors import KNeighborsRegressor

regression_model_knn = Pipeline( steps=[
    ("preprocessor",preprocessor),
    ("regressor", KNeighborsRegressor(n_neighbors=3,weights="uniform"))
])

regression_model_knn.fit(X_train,y_train)

In [112]:
y_pred_knn=regression_model_knn.predict(X_test)

In [113]:
mse=mean_squared_error(y_test,y_pred_knn)
print("mean squared error: ",mse)
r2score=r2_score(y_test,y_pred_knn)
print("R2 score: ",r2score)

mean squared error:  109953306.94619767
R2 score:  0.2917603519870048


From all the above regression algorithm ***Gradient Boosting*** has the hieghest R2 score of  0.879177517414907.