In [65]:
import numpy as np
import pandas as pd
import pickle as pkl
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [66]:
# loading the data from csv file to a Pandas Dataset
insurance_dataset = pd.read_csv('insurance.csv')

In [67]:
# Display first 5 rows of the dataset
insurance_dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [68]:
# number of rows and columns
insurance_dataset.shape

(1338, 7)

In [69]:
# getting some informations about the dataset
insurance_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Categorical Values

Sex,
Smoker,
Region

In [70]:
# checking for missing values
insurance_dataset.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [71]:
# statistical summary of the dataset
insurance_dataset.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [72]:
# distribution of age value
sns.set()
plt.figure(figsize=(6,6))
sns.distplot(insurance_dataset['age'])
plt.title('Age Distribution')
plt.show()


`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(insurance_dataset['age'])
  plt.show()


In [73]:
#  distribution of Gender column
plt.figure(figsize=(6,6))
sns.countplot(x='sex', data=insurance_dataset)
plt.title('Sex Distribution')
plt.show()

  plt.show()


In [74]:
insurance_dataset['sex'].value_counts()


sex
male      676
female    662
Name: count, dtype: int64

In [75]:
# distribution of BMI column
plt.figure(figsize=(6,6))
sns.distplot(insurance_dataset['bmi'])
plt.title('BMI Distribution')
plt.show()


`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(insurance_dataset['bmi'])
  plt.show()


Normal BMI Range --> 18.5 to 24.9

In [76]:
# distribution of children column
plt.figure(figsize=(6,6))
sns.countplot(x='children', data=insurance_dataset)
plt.title('Children')
plt.show()

  plt.show()


In [77]:
insurance_dataset['children'].value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [78]:
#  distribution of smoker column
plt.figure(figsize=(6,6))
sns.countplot(x='smoker', data=insurance_dataset)
plt.title('smoker')
plt.show()

  plt.figure(figsize=(6,6))
  plt.show()


In [79]:
insurance_dataset['smoker'].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [80]:
#  distribution of region column
plt.figure(figsize=(6,6))
sns.countplot(x='region', data=insurance_dataset)
plt.title('region')
plt.show()

  plt.show()


In [81]:
insurance_dataset['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [82]:
# distribution of charges value
plt.figure(figsize=(6,6))
sns.distplot(insurance_dataset['charges'])
plt.title('Charges Distribution')
plt.show()


`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(insurance_dataset['charges'])
  plt.show()


Data Pre-Processing

Encoding the categorical columns

In [83]:
# encoding sex column
insurance_dataset.replace({'sex':{'male':0,'female':1}}, inplace=True)

3 # encoding 'smoker' column
insurance_dataset.replace({'smoker':{'yes':1,'no':0}}, inplace=True)

# encoding 'region' column
insurance_dataset.replace({'region':{'southeast':0,'southwest':1,'northeast':2,'northwest':3}}, inplace=True)


  insurance_dataset.replace({'sex':{'male':0,'female':1}}, inplace=True)
  insurance_dataset.replace({'smoker':{'yes':1,'no':0}}, inplace=True)
  insurance_dataset.replace({'region':{'southeast':0,'southwest':1,'northeast':2,'northwest':3}}, inplace=True)


Splitting the Features and Target Variable

In [84]:
X = insurance_dataset.drop(columns='charges', axis=1) # Features(X)
Y = insurance_dataset['charges'] # Target(Y)

In [85]:
print(X)

      age  sex     bmi  children  smoker  region
0      19    1  27.900         0       1       1
1      18    0  33.770         1       0       0
2      28    0  33.000         3       0       0
3      33    0  22.705         0       0       3
4      32    0  28.880         0       0       3
...   ...  ...     ...       ...     ...     ...
1333   50    0  30.970         3       0       3
1334   18    1  31.920         0       0       2
1335   18    1  36.850         0       0       0
1336   21    1  25.800         0       0       1
1337   61    1  29.070         0       1       3

[1338 rows x 6 columns]


In [86]:
print(Y)

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


Splitting the data into training data & testing sets (80% train, 20% test)

In [87]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X.shape, X_train.shape, X_test.shape)

(1338, 6) (1070, 6) (268, 6)


Initialize Random Forest Regressor

In [88]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, Y_train)

In [89]:
# Predict on training and testing data
rf_train_pred = rf_regressor.predict(X_train)
rf_test_pred = rf_regressor.predict(X_test)

 Model Evaluation:
 
  R-squared scores

In [90]:
rf_train_r2 = metrics.r2_score(Y_train, rf_train_pred)
rf_test_r2 = metrics.r2_score(Y_test, rf_test_pred)

print("Random Forest Regressor Performance:")
print(f"Training R^2 Score: {rf_train_r2:.4f}")
print(f"Test R^2 Score: {rf_test_r2:.4f}")

Random Forest Regressor Performance:
Training R^2 Score: 0.9742
Test R^2 Score: 0.8666


Mean Absolute Error (MAE)

In [91]:
mae = metrics.mean_absolute_error(Y_test, rf_test_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

Mean Absolute Error (MAE): 2506.3072


Mean Squared Error (MSE)

In [92]:
mse = metrics.mean_squared_error(Y_test, rf_test_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")


Mean Squared Error (MSE): 20715371.3642


 Root Mean Squared Error (RMSE)

In [93]:
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Root Mean Squared Error (RMSE): 4551.4142


In [94]:
# Visualizing the feature importances
importances = rf_regressor.feature_importances_
features = X.columns

# Sort the feature importances
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.title('Feature Importances (Random Forest)')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()

  plt.show()


Prediction for a Sample input

In [95]:
input_data = (31, 1, 25.74, 0, 1, 0)  # (age, gender, bmi, children, smoker, region)

# Convert input data into numpy array and reshape it
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Predict the premium
premium_prediction = rf_regressor.predict(input_data_reshaped)

# Display the prediction
print(f"The predicted insurance premium is: ${premium_prediction[0]:.2f}")


The predicted insurance premium is: $19479.38




In [96]:
# Save the trained Random Forest model for later 
pkl.dump(rf_regressor, open('rf_model.pkl', 'wb'))
