In [1]:
import numpy as np
import pandas as pd

In [43]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import StratifiedKFold,GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [10]:
df = pd.read_csv('/content/insurance_data.csv')

In [11]:
df.head(1)

Unnamed: 0,index,PatientID,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
0,0,1,39.0,male,23.2,91,Yes,0,No,southeast,1121.87


In [12]:
df = df.drop(columns=['index','PatientID'])

In [13]:
df.head(1)

Unnamed: 0,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
0,39.0,male,23.2,91,Yes,0,No,southeast,1121.87


In [25]:
X = df.drop(columns=['claim'])
y = df['claim']

In [26]:
X.isna().sum()

Unnamed: 0,0
age,5
gender,0
bmi,0
bloodpressure,0
diabetic,0
children,0
smoker,0
region,3


In [27]:
transformer = ColumnTransformer(
              transformers = [
                              ('num_pipe', Pipeline(
                                              steps = [
                                                      ('impute',SimpleImputer(strategy='mean')),
                                                      ('scaler',StandardScaler()),
                                                      ]
                                                      ),X.select_dtypes(include='number').columns),
                             ('cat_pipe', Pipeline(
                                              steps = [
                                                      ('impute',SimpleImputer(strategy='most_frequent')),
                                                      ('ohe',OneHotEncoder(drop='first'))
                                                      ]
                                                  ),X.select_dtypes(exclude='number').columns)
                              ]
                              )

In [28]:
X = transformer.fit_transform(X)

In [29]:
X

array([[ 8.31689020e-02, -1.22352517e+00, -2.76232718e-01, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-1.27086139e+00, -9.32032684e-02, -6.26175371e-01, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 6.41397593e-16,  4.31003991e-01, -1.06360369e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       ...,
       [-7.29249274e-01,  6.27581714e-01, -2.76232718e-01, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-9.73684706e-02, -4.40588378e-02,  1.03605223e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-7.29249274e-01,  2.74079223e+00,  5.98623916e-01, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00]])

In [30]:
transformer

In [32]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)

# BaggingRegressor

In [39]:
bag = BaggingRegressor(random_state=42,bootstrap=True)
bag.fit(X_train,y_train)
b_pred = bag.predict(X_test)

In [40]:
mean_squared_error(y_test,b_pred), r2_score(y_test,b_pred)

(33000735.70341539, 0.7896476805981306)

# Pasting

In [44]:
paste = BaggingRegressor(estimator = DecisionTreeRegressor(), bootstrap=False, random_state=42)
paste.fit(X_train,y_train)
p_pred = paste.predict(X_test)

In [45]:
mean_squared_error(y_test,p_pred), r2_score(y_test,p_pred)

(50985487.72625783, 0.67500980294982)

# Random Subspaces

In [46]:
subspace = BaggingRegressor(estimator = DecisionTreeRegressor(),max_samples=1,max_features=1,bootstrap_features=True,random_state=42)
subspace.fit(X_train,y_train)
s_pred = subspace.predict(X_test)

In [47]:
mean_squared_error(y_test,s_pred), r2_score(y_test,s_pred)

(78019301.66663477, 0.5026916608409959)

# Random Patches

In [48]:
patch = BaggingRegressor(estimator=DecisionTreeRegressor(),max_samples=200,max_features=0.5,bootstrap_features=True,random_state=42)

In [49]:
patch.fit(X_train,y_train)
patch_pred = patch.predict(X_test)

In [50]:
mean_squared_error(y_test,patch_pred), r2_score(y_test,patch_pred)

(72850758.22325821, 0.5356368385186883)