<a href="https://colab.research.google.com/github/Nilanjan1210/HEALTHCARE-INSURANCE-CHARGES-PREDICTION/blob/main/Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
#  Import Libraries
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [76]:
# Location of the raw insurance CSV in the project's GitHub repository.
url = "https://raw.githubusercontent.com/Nilanjan1210/HEALTHCARE-INSURANCE-CHARGES-PREDICTION/main/insurance.csv"

# Read the remote file directly into a DataFrame (requires network access).
df = pd.read_csv(filepath_or_buffer=url)


In [77]:
# Show column names, non-null counts and dtypes.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


In [78]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [79]:
# Partition the columns by dtype: string (object) columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_features = list(df.select_dtypes(include='object').columns)
numerical_features = list(df.select_dtypes(include=['int64', 'float64']).columns)

# 'charges' is the regression target, so it must not be used as a predictor.
numerical_features.remove('charges')

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")


# Numerical columns are standardised; categorical columns are one-hot encoded
# with the first level of each column dropped.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Route each group of columns through its own transformer. This single
# `preprocessor` object is handed to every model pipeline below.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)



Numerical features: ['age', 'bmi', 'children']
Categorical features: ['sex', 'smoker', 'region']


In [80]:
# Separate the target column from the predictors.
y = df['charges']
X = df.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Data splitting completed.")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

Data splitting completed.
X_train shape: (1070, 6)
X_test shape: (268, 6)
y_train shape: (1070,)
y_test shape: (268,)


# Linear Regression Model

In [81]:
from sklearn.linear_model import LinearRegression

In [82]:
# Baseline model: the shared preprocessing followed by a plain linear regressor.
linear_regression_model = LinearRegression()
model_lr = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', linear_regression_model),
    ]
)

print("Pipeline created successfully.")

Pipeline created successfully.


In [83]:
# Fit the preprocessing steps and the regressor together, on the training split only.
model_lr.fit(X=X_train, y=y_train)
print("Pipeline fitted to training data.")

Pipeline fitted to training data.


In [84]:
# Predict charges for the held-out rows with the fitted linear pipeline.
y_pred = model_lr.predict(X=X_test)

# Compare predictions with the true test targets.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error on Test Data: {mse}")
print(f"R-squared Score on Test Data: {r2}")

Mean Squared Error on Test Data: 33596915.85136146
R-squared Score on Test Data: 0.7835929767120723


# Decision Tree Regressor model

In [85]:
from sklearn.tree import DecisionTreeRegressor

In [86]:
# Decision tree regressor; the fixed seed keeps the fitted tree repeatable.
decision_tree_model = DecisionTreeRegressor(random_state=12)

# Same preprocessing as the linear model, with the tree as the final step.
model_dt = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', decision_tree_model),
    ]
)

print("Decision Tree Pipeline created successfully.")


Decision Tree Pipeline created successfully.


In [87]:
# Train the decision-tree pipeline on the training split.
model_dt.fit(X=X_train, y=y_train)
print("Decision Tree Pipeline fitted to training data.")

Decision Tree Pipeline fitted to training data.


In [88]:
# Predict charges for the held-out rows with the fitted decision-tree pipeline.
y_pred_dt = model_dt.predict(X=X_test)

In [89]:
# Score the decision-tree predictions against the true test targets.
r2_dt = r2_score(y_true=y_test, y_pred=y_pred_dt)
mse_dt = mean_squared_error(y_true=y_test, y_pred=y_pred_dt)

print("\nDecision Tree Regressor Performance:")
print(f"Mean Squared Error on Test Data: {mse_dt}")
print(f"R-squared Score on Test Data: {r2_dt}")


Decision Tree Regressor Performance:
Mean Squared Error on Test Data: 45372795.57668561
R-squared Score on Test Data: 0.707741279811424


# Ridge Regression

In [90]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [91]:
# Ridge regressor; alpha is left at its default here and tuned by the grid search.
ridge_model = Ridge(random_state=12)

# Shared preprocessing followed by the ridge regressor.
model_ridge = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', ridge_model),
    ]
)

print("Ridge Regression Pipeline created successfully.")

Ridge Regression Pipeline created successfully.


In [92]:
# Candidate regularisation strengths; the 'regressor__' prefix targets the
# pipeline step named 'regressor'.
param_grid = dict(regressor__alpha=[0.1, 1.0, 10.0, 100.0])

# 5-fold cross-validated search, ranking candidates by R-squared.
grid_search_ridge = GridSearchCV(
    estimator=model_ridge,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)

# Run the search on the training split only, so the test split stays unseen.
grid_search_ridge.fit(X=X_train, y=y_train)
print("GridSearchCV for Ridge Regression completed.")

GridSearchCV for Ridge Regression completed.


In [93]:
# Winning alpha and its cross-validated R-squared from the search.
best_score_ridge = grid_search_ridge.best_score_
best_params_ridge = grid_search_ridge.best_params_

print(f"\nBest parameters for Ridge Regression: {best_params_ridge}")
print(f"Best R-squared score from GridSearchCV (cross-validation): {best_score_ridge}")


Best parameters for Ridge Regression: {'regressor__alpha': 1.0}
Best R-squared score from GridSearchCV (cross-validation): 0.7331517321187977


In [94]:
# Take the best pipeline found by the search and score it on the held-out split.
best_ridge_model = grid_search_ridge.best_estimator_
y_pred_ridge = best_ridge_model.predict(X=X_test)

r2_ridge = r2_score(y_true=y_test, y_pred=y_pred_ridge)
mse_ridge = mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)

print("\nRidge Regressor Performance on Test Data (with best parameters):")
print(f"Mean Squared Error: {mse_ridge}")
print(f"R-squared Score: {r2_ridge}")


Ridge Regressor Performance on Test Data (with best parameters):
Mean Squared Error: 33645393.49385556
R-squared Score: 0.7832807188145148


# LASSO Regression

In [95]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV


In [96]:
# LASSO regressor; alpha is left at its default here and tuned by the grid search.
lasso_model = Lasso(random_state=12)

# Shared preprocessing followed by the LASSO regressor.
model_lasso = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', lasso_model),
    ]
)

print("LASSO Regression Pipeline created successfully.")

LASSO Regression Pipeline created successfully.


In [97]:
# Candidate regularisation strengths for LASSO; the 'regressor__' prefix targets
# the pipeline step named 'regressor'.
# The grid has its own model-specific name (like `param_grid_rf`) so this cell
# does not silently overwrite the grid that belongs to the Ridge search.
# NOTE(review): the recorded run selected alpha=100.0, the largest candidate in
# this grid, so the optimum may lie beyond it -- consider adding larger values.
param_grid_lasso = {'regressor__alpha': [0.1, 1.0, 10.0, 100.0]}

# 5-fold cross-validated search, ranking candidates by R-squared.
grid_search_lasso = GridSearchCV(model_lasso, param_grid_lasso, cv=5, scoring='r2')

# Run the search on the training split only, so the test split stays unseen.
grid_search_lasso.fit(X_train, y_train)
print("GridSearchCV for LASSO Regression completed.")

GridSearchCV for LASSO Regression completed.


In [98]:
# Winning alpha and its cross-validated R-squared from the search.
best_score_lasso = grid_search_lasso.best_score_
best_params_lasso = grid_search_lasso.best_params_

print(f"\nBest parameters for LASSO Regression: {best_params_lasso}")
print(f"Best R-squared score from GridSearchCV (cross-validation): {best_score_lasso}")


Best parameters for LASSO Regression: {'regressor__alpha': 100.0}
Best R-squared score from GridSearchCV (cross-validation): 0.7342401046625667


In [99]:
# Take the best pipeline found by the search and score it on the held-out split.
best_lasso_model = grid_search_lasso.best_estimator_
y_pred_lasso = best_lasso_model.predict(X=X_test)

r2_lasso = r2_score(y_true=y_test, y_pred=y_pred_lasso)
mse_lasso = mean_squared_error(y_true=y_test, y_pred=y_pred_lasso)

print("\nLASSO Regressor Performance on Test Data (with best parameters):")
print(f"Mean Squared Error: {mse_lasso}")
print(f"R-squared Score: {r2_lasso}")


LASSO Regressor Performance on Test Data (with best parameters):
Mean Squared Error: 34266062.50186862
R-squared Score: 0.7792828181421608


# Random Forest model

In [100]:
from sklearn.ensemble import RandomForestRegressor

In [101]:
# Baseline random forest with library-default hyperparameters and a fixed seed.
random_forest_model = RandomForestRegressor(random_state=42)

# Shared preprocessing followed by the forest.
model_rf = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', random_forest_model),
    ]
)
print("Random Forest Pipeline created successfully.")


Random Forest Pipeline created successfully.


In [102]:
# Train the baseline random-forest pipeline on the training split.
model_rf.fit(X=X_train, y=y_train)
print("Random Forest Pipeline fitted to training data.")

Random Forest Pipeline fitted to training data.


In [103]:
# Predict charges for the held-out rows with the baseline forest.
y_pred_rf = model_rf.predict(X=X_test)

# Score those predictions against the true test targets.
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)

print("\nRandom Forest Regressor Performance:")
print(f"Mean Squared Error on Test Data: {mse_rf}")
print(f"R-squared Score on Test Data: {r2_rf}")


Random Forest Regressor Performance:
Mean Squared Error on Test Data: 20864569.513376206
R-squared Score on Test Data: 0.8656055394920775


In [104]:
# Build a fresh random-forest pipeline to hand to the hyperparameter search.
# Note: this rebinds `random_forest_model` and `model_rf` to new, unfitted
# objects, so the baseline pipeline fitted earlier is no longer reachable
# under these names.
random_forest_model = RandomForestRegressor(random_state=42)
model_rf = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', random_forest_model),
    ]
)

print("Random Forest Pipeline created successfully.")

Random Forest Pipeline created successfully.


In [105]:
# Search space for the random forest: 2 x 3 x 2 x 2 = 24 combinations.
# Each key's 'regressor__' prefix targets the pipeline step named 'regressor'.
param_grid_rf = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[10, 20, None],  # None places no cap on tree depth
    regressor__min_samples_split=[2, 5],
    regressor__min_samples_leaf=[1, 2],
)

In [73]:
# 3-fold cross-validated search over the forest grid, ranked by R-squared;
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_search_rf = GridSearchCV(
    estimator=model_rf,
    param_grid=param_grid_rf,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)

# Run the search on the training split only, so the test split stays unseen.
grid_search_rf.fit(X=X_train, y=y_train)
print("GridSearchCV for Random Forest Regression completed.")

GridSearchCV for Random Forest Regression completed.


In [74]:

# Winning hyperparameters and their cross-validated R-squared from the search.
best_score_rf = grid_search_rf.best_score_
best_params_rf = grid_search_rf.best_params_

print(f"\nBest parameters for Random Forest Regression: {best_params_rf}")
print(f"Best R-squared score from GridSearchCV (cross-validation): {best_score_rf}")


Best parameters for Random Forest Regression: {'regressor__max_depth': 20, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}
Best R-squared score from GridSearchCV (cross-validation): 0.8374127799124705


In [106]:
# Take the best pipeline found by the search and score it on the held-out split.
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf_tuned = best_rf_model.predict(X=X_test)

r2_rf_tuned = r2_score(y_true=y_test, y_pred=y_pred_rf_tuned)
mse_rf_tuned = mean_squared_error(y_true=y_test, y_pred=y_pred_rf_tuned)

print("\nRandom Forest Regressor Performance on Test Data (with best parameters):")
print(f"Mean Squared Error: {mse_rf_tuned}")
print(f"R-squared Score: {r2_rf_tuned}")


Random Forest Regressor Performance on Test Data (with best parameters):
Mean Squared Error: 19440495.946652498
R-squared Score: 0.8747783910383665
