In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [2]:
# Read the insurance data from the CSV file (path is relative to the working directory)
insurance = pd.read_csv("insurance.csv")

In [3]:
# Display the loaded frame; the recorded output below elides the middle rows.
insurance

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
# Read the insurance CSV into the working DataFrame `df`

df = pd.read_csv("insurance.csv")

In [5]:
# Preview the first rows of the dataset
# (the missing-value check lives in its own cell further down)

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# Check the data type of each column.
# The recorded output shows sex, smoker and region as object columns;
# the remaining columns are int64 or float64.

df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [7]:
# Summary statistics of the numerical variables
# (count, mean, std, min, quartiles and max, as shown in the output below)

df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [8]:
# Count the missing entries in every column

df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [18]:
# One-hot encode the categorical columns, dropping the first level of each

df = pd.get_dummies(data=df, drop_first=True)

In [15]:
# Hold out 20% of the rows for testing; `charges` is the prediction target

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='charges'),
    df['charges'],
    test_size=0.2,
    random_state=0,
)

In [14]:
# Create base regressors
#
# `charges` is a continuous float64 target, so the base estimators must be
# regressors: scikit-learn classifiers reject a continuous target when fit.
# The names rf1 / dt / lr are kept so any cell referring to them still works.

rf1 = RandomForestRegressor(n_estimators=50, random_state=42)
dt = DecisionTreeRegressor(max_depth=5, random_state=42)
lr = LinearRegression()

In [17]:
# Fit a 100-tree random forest regressor (seed 0) on the training split

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=0, n_estimators=100).fit(X_train, y_train)
rf

RandomForestRegressor(random_state=0)

In [20]:
# Score the fitted forest `rf` on the held-out split with mean squared error

from sklearn.metrics import mean_squared_error
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('Random Forest Mean Squared Error:', mse)

Random Forest Mean Squared Error: 19479478.286309913


In [21]:
# Fit another 100-tree random forest regressor, this time seeded with 42

rf_reg = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
rf_reg

RandomForestRegressor(random_state=42)

In [22]:
# Predict charges for the test split with `rf_reg`

y_pred = rf_reg.predict(X=X_test)

In [23]:
# Mean squared error of the latest `y_pred` against the test targets

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Random Forest Mean Squared Error:", mse)

Random Forest Mean Squared Error: 19028348.475794837


In [29]:
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [27]:
# Base estimators: a linear regression and two depth-5 random forests
# that differ only in the number of trees (50 vs 100)

lr = LinearRegression()
rf1 = RandomForestRegressor(max_depth=5, n_estimators=50, random_state=42)
rf2 = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=42)

In [28]:
# Ensembles: a voting regressor over all three base models, and a stack of the
# two forests with the linear regression as final estimator

voting_regressor = VotingRegressor(
    estimators=[('rf1', rf1), ('rf2', rf2), ('lr', lr)],
)
stacking_regressor = StackingRegressor(
    estimators=[('rf1', rf1), ('rf2', rf2)],
    final_estimator=lr,
)

In [31]:
# Train every model on the training split, base models first
for _model in (rf1, rf2, lr, voting_regressor):
    _model.fit(X_train, y_train)

# Fit the stack last so the fitted estimator is the cell's displayed result
stacking_regressor.fit(X_train, y_train)

StackingRegressor(estimators=[('rf1',
                               RandomForestRegressor(max_depth=5,
                                                     n_estimators=50,
                                                     random_state=42)),
                              ('rf2',
                               RandomForestRegressor(max_depth=5,
                                                     random_state=42))],
                  final_estimator=LinearRegression())

In [32]:
# Test-set predictions for each base model, then for each ensemble

y_pred_rf1, y_pred_rf2, y_pred_lr = (
    fitted.predict(X_test) for fitted in (rf1, rf2, lr)
)
y_pred_voting, y_pred_stacking = (
    fitted.predict(X_test) for fitted in (voting_regressor, stacking_regressor)
)

In [33]:
# Calculate the Mean Squared Error of each model on the test set

mse_rf1 = mean_squared_error(y_test, y_pred_rf1)
mse_rf2 = mean_squared_error(y_test, y_pred_rf2)
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_voting = mean_squared_error(y_test, y_pred_voting)
mse_stacking = mean_squared_error(y_test, y_pred_stacking)

# Report the scores side by side, lowest error first. Previously the values
# were computed but never shown, so the model comparison was not visible.
pd.Series(
    {
        'rf1': mse_rf1,
        'rf2': mse_rf2,
        'lr': mse_lr,
        'voting': mse_voting,
        'stacking': mse_stacking,
    },
    name='test_mse',
).sort_values()

In [34]:
from sklearn.metrics import mean_squared_error, r2_score

In [35]:
# Refit a 100-tree random forest (seed 42) and predict on the test split.
# This is the same configuration as `rf_reg`; no hyperparameter search is
# performed in this cell.

model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_test = model.predict(X_test)

In [36]:
# Mean squared error of `model` on the test data

mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
print("Random Forest Mean Squared Error:", mse)

Random Forest Mean Squared Error: 19028348.475794837


In [37]:
# R-squared of `model` on the test data

r2 = r2_score(y_true=y_test, y_pred=y_pred_test)
print("Random Forest R-squared Score:", r2)

Random Forest R-squared Score: 0.8804225917098133
