## 1. Data Processing  

### a. Handling missing values

In [2]:
import pandas as pd

# Read the insurance dataset (path is relative to the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
df.head(n=5)

Unnamed: 0,ID,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice
0,1,45,0,0,0,0,155,57,0,0,0,25000
1,2,60,1,0,0,0,180,73,0,0,0,29000
2,3,36,1,1,0,0,158,59,0,0,1,23000
3,4,52,1,1,0,1,183,93,0,0,2,28000
4,5,38,0,0,0,1,166,88,0,0,1,23000


In [3]:
# Number of missing entries in each column (isna is the canonical name of isnull).
df.isna().sum()

ID                         0
Age                        0
Diabetes                   0
BloodPressureProblems      0
AnyTransplants             0
AnyChronicDiseases         0
Height                     0
Weight                     0
KnownAllergies             0
HistoryOfCancerInFamily    0
NumberOfMajorSurgeries     0
PremiumPrice               0
dtype: int64

The dataset does not contain any missing values.

In [4]:
from IPython.display import display, HTML
# Inject a CSS rule that sets the width of the page's `.container` element to 75%.
# This only affects how the notebook is rendered, not the analysis.
display(HTML("<style>.container { width:75% !important; }</style>"))

### b. Feature Engineering

In [5]:
# Body-mass index = weight / height². Dividing Height by 100 assumes it is
# recorded in centimetres (and Weight in kilograms) — TODO confirm units.
# Run this before the scaling cell: once Height and Weight are standardized,
# this formula no longer yields a BMI.
df['BMI'] = df['Weight'].div(df['Height'].div(100).pow(2))
df.head()

Unnamed: 0,ID,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice,BMI
0,1,45,0,0,0,0,155,57,0,0,0,25000,23.725286
1,2,60,1,0,0,0,180,73,0,0,0,29000,22.530864
2,3,36,1,1,0,0,158,59,0,0,1,23000,23.634033
3,4,52,1,1,0,1,183,93,0,0,2,28000,27.770313
4,5,38,0,0,0,1,166,88,0,0,1,23000,31.934969


### c. Scaling and Encoding

Since all categorical features are binary, there's no need for additional encoding.

We will standardize the numerical features using StandardScaler from scikit-learn.

In [6]:
!pip install scikit-learn

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

numerical_cols = ['Age', 'Height', 'Weight', 'BMI']

df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

df.head()



Unnamed: 0,ID,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice,BMI
0,1,0.233197,0,0,0,0,-1.306105,-1.39925,0,0,0,25000,-0.635742
1,2,1.307981,1,0,0,0,1.170852,-0.277062,0,0,0,29000,-0.839024
2,3,-0.411674,1,1,0,0,-1.00887,-1.258976,0,0,1,23000,-0.651273
3,4,0.734763,1,1,0,1,1.468086,1.125674,0,0,2,28000,0.052692
4,5,-0.268369,0,0,0,1,-0.216244,0.77499,0,0,1,23000,0.761487


## 2. Model Selection

### a. Linear Regression

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Features are every column except the target.
# NOTE(review): this keeps the row identifier `ID` as a predictor — confirm
# that is intended before relying on the reported scores.
X = df.drop(columns='PremiumPrice')
y = df['PremiumPrice']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible. The same split is reused by every model that follows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear-regression baseline, fitted on the training rows only.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions, in MSE / MAE / R² order.
mse, mae, r2 = (
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print('MSE:', mse)
print('MAE:', mae)
print('R²:', r2)

MSE: 12211007.939350655
MAE: 2586.1256274419584
R²: 0.7136442644826895


### b. Tree-based Models

### Decision Trees

In [8]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree with default hyperparameters and a fixed seed, trained on
# the same split as the linear baseline.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)

# Score the held-out predictions, in MSE / MAE / R² order.
dt_mse, dt_mae, dt_r2 = (
    score(y_test, dt_y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print('Decision Tree Metrics\n')
print('MSE:', dt_mse)
print('MAE:', dt_mae)
print('R²:', dt_r2)

Decision Tree Metrics

MSE: 11055555.555555556
MAE: 924.2424242424242
R²: 0.7407403419613248


### Random Forests

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters and a fixed seed, trained on the
# same split as the other models.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

# Score the held-out predictions, in MSE / MAE / R² order.
rf_mse, rf_mae, rf_r2 = (
    score(y_test, rf_y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print('Random Forest Metrics\n')
print('MSE:', rf_mse)
print('MAE:', rf_mae)
print('R²:', rf_r2)

Random Forest Metrics

MSE: 6115554.545454546
MAE: 1074.4444444444443
R²: 0.8565864399844955


### Gradient Boosting

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees with default hyperparameters and a fixed seed,
# trained on the same split as the other models.
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)

# Score the held-out predictions, in MSE / MAE / R² order.
gb_mse, gb_mae, gb_r2 = (
    score(y_test, gb_y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

print('Gradient Boosting Metrics\n')
print('MSE:', gb_mse)
print('MAE:', gb_mae)
print('R²:', gb_r2)

Gradient Boosting Metrics

MSE: 6624494.7707829075
MAE: 1560.6768615062988
R²: 0.8446514749691506


### Summary of Model Performance  

Linear Regression:  

- MSE: 12,211,007.94  
- MAE: 2,586.13  
- R²: 0.7136  

Decision Tree Regression:  

- MSE: 11,055,555.56  
- MAE: 924.24  
- R²: 0.7407  

Random Forest Regression:  

- MSE: 6,115,554.55  
- MAE: 1,074.44  
- R²: 0.8566  

Gradient Boosting Regression:  

- MSE: 6,624,494.77  
- MAE: 1,560.68  
- R²: 0.8447  

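The Random Forest model shows the best overall performance, with the lowest MSE and the highest R², although the Decision Tree achieves the lowest MAE.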

### Hyperparameter Tuning for Random Forest

In [11]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 values for each of 5 hyperparameters = 243 candidates.
# max_features=1.0 (consider every feature at each split) replaces the 'auto'
# alias, which meant the same thing for RandomForestRegressor but was
# deprecated in scikit-learn 1.1 and removed in 1.3.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive 3-fold cross-validated search on the training rows only.
# Candidates are ranked by negated MSE (higher is better); n_jobs=-1 runs the
# fits in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)

grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the winning configuration
# retrained on the whole training set.
best_rf_model = grid_search.best_estimator_
rf_y_pred = best_rf_model.predict(X_test)

# NOTE: these reuse (and therefore overwrite) the rf_* names that held the
# predictions and metrics of the untuned random forest.
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

print('Optimized Random Forest\n')
print('MSE:', rf_mse)
print('MAE:', rf_mae)
print('R²:', rf_r2)

  warn(


Optimized Random Forest

MSE: 4606915.874001445
MAE: 1041.078687422634
R²: 0.8919649557089544


In [16]:
import joblib

# Serialize the tuned estimator chosen by the grid search to a file in the
# working directory.
with open('optimized_rf_model.pkl', mode='wb') as pkl_out:
    joblib.dump(grid_search.best_estimator_, pkl_out)

In [22]:
!pip install scikit-learn



In [27]:
import sklearn

# Record the library versions used for this run; a pickled model should be
# loaded with the same versions that created it.
for label, module in (("scikit-learn version:", sklearn), ("joblib version:", joblib)):
    print(label, module.__version__)

scikit-learn version: 1.2.2
joblib version: 1.2.0
