In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import(
mean_absolute_error as mae,
mean_squared_error as mse,
root_mean_squared_error as rmse,
r2_score as r_sqrd
)


### TASK 01 ---> Data Exploration and Preprocessing

In [4]:
df = pd.read_csv('insurance.csv')
df.sample(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
932,46,male,25.8,5,no,southwest,10096.97
314,27,female,31.4,0,yes,southwest,34838.873
809,25,male,25.84,1,no,northeast,3309.7926
143,29,male,29.735,2,no,northwest,18157.876
1069,54,female,31.9,1,no,southeast,10928.849


In [5]:
df.shape

(1338, 7)

In [6]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [8]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1:]

In [10]:
X_train , X_test , y_train , y_test = train_test_split(X,y, test_size= 0.2,random_state=42)

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1070, 6)
(268, 6)
(1070, 1)
(268, 1)


In [12]:
tnf = ColumnTransformer(
    transformers=[
        ('tnf 1', OneHotEncoder(sparse_output = False, drop = 'first'),['sex','smoker','region']),
        ('tnf 2', MinMaxScaler(),['age','bmi'])
    ]
)

In [13]:
X_train_tnf = pd.DataFrame(tnf.fit_transform(X_train))
X_test_tnf = pd.DataFrame(tnf.transform(X_test))

### TASK 02 ---> Implement Regression Models

In [15]:
lr = LinearRegression()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
gbr = GradientBoostingRegressor()
svr = SVR()

In [16]:
lr.fit(X_train_tnf,y_train)
dtr.fit(X_train_tnf,y_train)
rfr.fit(X_train_tnf,y_train)
gbr.fit(X_train_tnf,y_train)
svr.fit(X_train_tnf,y_train)

  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)


In [17]:
y_pred_lr = lr.predict(X_test_tnf)
y_pred_dtr = dtr.predict(X_test_tnf)
y_pred_rfr = rfr.predict(X_test_tnf)
y_pred_gbr = gbr.predict(X_test_tnf)
y_pred_svr = svr.predict(X_test_tnf)

### TASK 03 ---> Model Evaluation

In [19]:
y_true = y_test
def evaluateing_regression_models(y_true,y_pred):
    metrics = {
     'Mean Squared Error (MSE)': mse(y_true, y_pred),
     'Root Mean Squared Error (RMSE)':rmse(y_true, y_pred),
     'Mean Absolute Error (MAE)': mae(y_true, y_pred),
     'R-squared (R2) Score': r_sqrd(y_true, y_pred)
    }
 
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")
     
print('for Linear Regression:')
evaluateing_regression_models(y_true,y_pred_lr)
print('\nfor Decision Tree Regression:')
evaluateing_regression_models(y_true,y_pred_dtr)
print('\nRandom Forest Regression:')
evaluateing_regression_models(y_true,y_pred_rfr)
print('\nGradient Boosting Regression:')
evaluateing_regression_models(y_true,y_pred_gbr)
print('\nSV Regression:')
evaluateing_regression_models(y_true,y_pred_svr)

for Linear Regression:
Mean Squared Error (MSE): 34142364.8018
Root Mean Squared Error (RMSE): 5843.1468
Mean Absolute Error (MAE): 4222.9084
R-squared (R2) Score: 0.7801

for Decision Tree Regression:
Mean Squared Error (MSE): 45976988.8575
Root Mean Squared Error (RMSE): 6780.6334
Mean Absolute Error (MAE): 3421.6719
R-squared (R2) Score: 0.7038

Random Forest Regression:
Mean Squared Error (MSE): 23668098.6022
Root Mean Squared Error (RMSE): 4864.9870
Mean Absolute Error (MAE): 2766.7421
R-squared (R2) Score: 0.8475

Gradient Boosting Regression:
Mean Squared Error (MSE): 20332745.8887
Root Mean Squared Error (RMSE): 4509.1846
Mean Absolute Error (MAE): 2551.7749
R-squared (R2) Score: 0.8690

SV Regression:
Mean Squared Error (MSE): 166010979.1185
Root Mean Squared Error (RMSE): 12884.5248
Mean Absolute Error (MAE): 8612.8120
R-squared (R2) Score: -0.0693
