In [8]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/medical_insurance.csv")

# One-hot encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# Features and target
X = df.drop("charges", axis=1)
y = df["charges"]

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# ---- Linear Regression (baseline) ----
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)


# ---- Gradient Boosting with GridSearch ----
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
gbr = GradientBoostingRegressor()
gbr_params = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 4]
}
gbr_grid = GridSearchCV(gbr, gbr_params, cv=3, scoring="r2")
gbr_grid.fit(X_train, y_train)
y_pred_gbr = gbr_grid.best_estimator_.predict(X_test)

# ---- XGBoost with GridSearch ----
from xgboost import XGBRegressor
xgb = XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_params = {
    "n_estimators": [200, 300],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 4]
}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, scoring="r2")
xgb_grid.fit(X_train, y_train)
y_pred_xgb = xgb_grid.best_estimator_.predict(X_test)

# ---- AdaBoost with GridSearch ----
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor()
ada_params = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1, 0.5]
}
ada_grid = GridSearchCV(ada, ada_params, cv=3, scoring="r2")
ada_grid.fit(X_train, y_train)
y_pred_ada = ada_grid.best_estimator_.predict(X_test)

# ---- LightGBM with GridSearch ----
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
lgbm_params = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 4, -1],
    "num_leaves": [20, 31, 40]
}
lgbm_grid = GridSearchCV(lgbm, lgbm_params, cv=3, scoring="r2")
lgbm_grid.fit(X_train, y_train)
y_pred_lgbm = lgbm_grid.best_estimator_.predict(X_test)

# ---- Evaluation ----
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def metrics(y_true, y_pred):
    return {"R2": r2_score(y_true, y_pred),
            "RMSE": np.sqrt(mean_squared_error(y_true, y_pred))}

# LR
m = metrics(y_test, y_pred_lr)
print("Linear Regression   -> R2:", m["R2"], " RMSE:", m["RMSE"], " Params: None")

# GradientBoosting
m = metrics(y_test, y_pred_gbr)
print("GBR   -> R2:", m["R2"], " RMSE:", m["RMSE"], " Params:", gbr_grid.best_params_)

# XGBoost
m = metrics(y_test, y_pred_xgb)
print("XGB   -> R2:", m["R2"], " RMSE:", m["RMSE"], " Params:", xgb_grid.best_params_)

# AdaBoost
m = metrics(y_test, y_pred_ada)
print("Ada   -> R2:", m["R2"], " RMSE:", m["RMSE"], " Params:", ada_grid.best_params_)

# LightGBM
m = metrics(y_test, y_pred_lgbm)
print("LGBM  -> R2:", m["R2"], " RMSE:", m["RMSE"], " Params:", lgbm_grid.best_params_)





[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 1478, number of used features: 8
[LightGBM] [Info] Start training from score 13088.426616
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 316
[LightGBM] [Info] Number of data points in the train set: 1478, number of used features: 8
[LightGBM] [Info] Start training from score 13198.117629
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 316
[LightGBM] [Info] Number of data points in the train s

In [9]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
# ---- Ridge ----
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
m = metrics(y_test, y_pred_ridge)
print("Ridge     -> R2:", m["R2"], " RMSE:", m["RMSE"], " Params: alpha=1.0")

# ---- Lasso ----
lasso = Lasso(alpha=0.01, max_iter=10000)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
m = metrics(y_test, y_pred_lasso)
print("Lasso     -> R2:", m["R2"], " RMSE:", m["RMSE"], " Params: alpha=0.01")

# ---- ElasticNet ----
enet = ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=10000)
enet.fit(X_train, y_train)
y_pred_enet = enet.predict(X_test)
m = metrics(y_test, y_pred_enet)
print("ElasticNet-> R2:", m["R2"], " RMSE:", m["RMSE"], " Params: alpha=0.01, l1_ratio=0.5")


Ridge     -> R2: 0.7301966737672894  RMSE: 6061.38994333822  Params: alpha=1.0
Lasso     -> R2: 0.7301408082287657  RMSE: 6062.017447252088  Params: alpha=0.01
ElasticNet-> R2: 0.7301750485245653  RMSE: 6061.632854358161  Params: alpha=0.01, l1_ratio=0.5


In [3]:
pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-3.4.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.4.0 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-3.4.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.4.0 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_tracing-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastmcp<3,>=2.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading fastmcp-2.12.3-py3-none-any.whl.metadata (17 kB)
Collecting graphene<4 (from mlflow>=2.0.0->lazypredict)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow>=2.0.0->

In [4]:
from lazypredict.Supervised import LazyRegressor

reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

print(models)

  0%|          | 0/42 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 310
[LightGBM] [Info] Number of data points in the train set: 2217, number of used features: 3
[LightGBM] [Info] Start training from score 13371.074037
                               Adjusted R-Squared  R-Squared       RMSE  \
Model                                                                     
ExtraTreesRegressor                          0.71       0.72    6585.80   
ExtraTreeRegressor                           0.67       0.68    7012.92   
RandomForestRegressor                        0.60       0.61    7759.50   
BaggingRegressor                             0.57       0.58    8026.84   
XGBRegressor                                 0.57       0.58    8067.36   
DecisionTreeRegressor                        0.53       0.53    8464.37   
HistGradientBoostingRegressor                0.34       0

In [10]:
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ExtraTreesRegressor,0.71,0.72,6585.8,0.36
ExtraTreeRegressor,0.67,0.68,7012.92,0.02
RandomForestRegressor,0.6,0.61,7759.5,0.95
BaggingRegressor,0.57,0.58,8026.84,0.1
XGBRegressor,0.57,0.58,8067.36,0.56
DecisionTreeRegressor,0.53,0.53,8464.37,0.02
HistGradientBoostingRegressor,0.34,0.35,9999.0,0.38
LGBMRegressor,0.33,0.34,10067.42,0.08
GradientBoostingRegressor,0.22,0.23,10872.09,0.34
PoissonRegressor,0.15,0.16,11322.6,0.02
