In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import  GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

In [None]:
# Load the raw dataset; the path is relative to the kernel's working directory.
# The trailing expression displays (n_rows, n_columns).
data = pd.read_csv('insurance.csv')
data.shape

In [None]:
# Preview the first rows (head() defaults to five) to inspect columns and sample values.
data.head()

In [None]:
# Print column dtypes, non-null counts, and memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Count missing entries in each column (isna is the canonical name for isnull).
missing_values = data.isna().sum()
missing_values

In [None]:
# Column groups that the preprocessing ColumnTransformer routes to the
# numeric and categorical sub-pipelines respectively.
numeric_features = [
    "age",
    "bmi",
    "children",
]
categorical_features = [
    "sex",
    "smoker",
    "region",
]

# Define features, target, and preprocessing

In [None]:
# Separate the regression target ("charges") from the predictor columns.
y = data["charges"]
X = data.drop(columns="charges")

# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: replace gaps with the literal "missing" category, then
# one-hot encode. Categories unseen at fit time are encoded as all zeros
# instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Stacked model pipeline

In [None]:
# Level-0 learners. The four tree ensembles share the same size and seed;
# SVR is left at its library defaults.
tree_ensemble_kwargs = {"n_estimators": 100, "random_state": 42}
base_models = [
    ("rf", RandomForestRegressor(**tree_ensemble_kwargs)),
    ("gbr", GradientBoostingRegressor(**tree_ensemble_kwargs)),
    ("etr", ExtraTreesRegressor(**tree_ensemble_kwargs)),
    ("xgb", XGBRegressor(objective="reg:squarederror", **tree_ensemble_kwargs)),
    ("svr", SVR()),
]

# Level-1 learner that combines the base learners' predictions.
meta_model = LinearRegression()

# cv=5: the meta-model is trained on 5-fold cross-validated predictions of the
# base learners. n_jobs=-1 fits the base learners in parallel on all cores.
stacked_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
)

# Preprocessing and the stacked model travel together as a single estimator.
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("stacked_regressor", stacked_regressor),
])

# Split the data

In [None]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# partition identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

In [None]:
# Fit the whole pipeline (preprocessing + stacked regressor) on the training split.
model_pipeline.fit(X_train, y_train)

In [None]:
# Predict charges for the held-out test rows with the fitted pipeline.
y_pred = model_pipeline.predict(X_test)

# Check the metrics

In [None]:
# Score the held-out predictions with each metric in turn.
mae, r2, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, r2_score, mean_squared_error)
)
# RMSE is derived from MSE so it is expressed in the target's own units.
rmse = np.sqrt(mse)

print(f"MAE: {mae}, MSE: {mse}, RMSE: {rmse}, R^2: {r2}")

# Visualize

In [None]:
# Residual = actual - predicted, so positive values are under-predictions.
residuals = y_test - y_pred

# Histogram with a KDE overlay to show the shape of the error distribution.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(residuals, kde=True, bins=30, ax=ax)
ax.set(
    title="Distribution of Residuals",
    xlabel="Residuals (Actual - Predicted)",
    ylabel="Frequency",
)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_test, y_pred, alpha=0.7)

# Dashed red y = x reference spanning the observed range of actual charges;
# points lying on it are perfect predictions.
charge_range = [y_test.min(), y_test.max()]
ax.plot(charge_range, charge_range, "r--", lw=2)

ax.set_title("Actual vs Predicted Insurance Charges")
ax.set_xlabel("Actual Insurance Charges")
ax.set_ylabel("Predicted Insurance Charges")
plt.show()

In [None]:
# Cross-validated evaluation: 5-fold CV of the whole pipeline on the training
# split only. scikit-learn scorers follow "higher is better", so RMSE is
# returned negated.
cv_scores = cross_val_score(
    model_pipeline,
    X_train,
    y_train,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

# Flip the sign to recover the per-fold RMSE.
cv_rmse = np.negative(cv_scores)
print(f"CV RMSE: {cv_rmse}\nAverage CV RMSE: {np.mean(cv_rmse)}")

# Fine-tune with GridSearchCV

In [None]:
# Search space: ensemble size for the RF and XGBoost base learners and the SVR
# regularization strength. Keys are <pipeline step>__<base estimator>__<param>.
# 4 x 4 x 4 = 64 candidates, each evaluated with 3-fold CV.
param_grid = {
    "stacked_regressor__rf__n_estimators": [50, 100, 150, 200],
    "stacked_regressor__xgb__n_estimators": [50, 100, 150, 200],
    "stacked_regressor__svr__C": [0.25, 0.5, 0.75, 1.0],
}

grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)

# The score is negated RMSE, so the sign is flipped when reporting it.
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_},\nBest CV RMSE: {-grid_search.best_score_}")

In [None]:
# Evaluate the best pipeline found by the grid search (refit on the full
# training split by GridSearchCV) against the held-out test set.
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

mae_tuned = mean_absolute_error(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
rmse_tuned = np.sqrt(mse_tuned)  # back in the target's own units
# Named consistently with the other *_tuned metrics (previously misspelled `r2_tunest`).
r2_tuned = r2_score(y_test, y_pred_tuned)

print(f"Tuned Test MAE: {mae_tuned:.2f},\nTuned Test MSE: {mse_tuned:.2f},\nTuned Test RMSE: {rmse_tuned:.2f},\nTuned Test R^2: {r2_tuned:.2f}")

In [None]:
# Pick one test row by position. Indexing with a list keeps the result a
# one-row DataFrame rather than collapsing it to a Series.
sample_position = 8
sample_input = X_test.iloc[[sample_position]]
print(f"Sample Input: \n{sample_input}")

In [None]:
# Predict with the tuned pipeline. predict() returns an array with one entry
# per input row, so the single sample's prediction is element 0.
sample_prediction = best_model.predict(sample_input)
sample_prediction[0]

In [None]:
# Show only the actual charge for the sampled row (matched by index label) so it
# can be compared with the sample prediction, instead of dumping the whole
# test target, whose truncated display would not surface that row.
y_test.loc[sample_input.index]