In [2]:
import pandas as pd
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw insurance dataset (path is relative to the notebook's directory).
insurance_data = pd.read_csv("../../data/raw/insurance.csv")

# Separate the predictors from the regression target ("charges").
X = insurance_data.drop(columns=["charges"])
y = insurance_data["charges"]

# --- Feature engineering ---
# One-hot encode region; drop_first=True drops one level so the dummy
# columns are not perfectly collinear.
X = pd.get_dummies(X, columns=["region"], drop_first=True, dtype=int)

# Binary-encode the two-level categorical columns.
X["sex"] = X["sex"].map({"female": 1, "male": 0})
X["smoker"] = X["smoker"].map({"yes": 1, "no": 0})

# Series.map() converts any label that is missing from the mapping into NaN.
# That would only show up later as an opaque error inside model.fit(), so
# check it here, right where the encoding happens.
unmapped_counts = X[["sex", "smoker"]].isna().sum()
assert not unmapped_counts.any(), (
    f"sex/smoker contain values outside the encoding maps:\n{unmapped_counts}"
)

# Interaction terms: allow the age and BMI effects to differ for smokers.
X["age_smoker"] = X["age"] * X["smoker"]
X["bmi_smoker"] = X["bmi"] * X["smoker"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Ridge regression with a fixed penalty. `alpha` is the L2 regularization
# strength (the lambda of the penalized least-squares objective).
# With alpha=0 the penalty vanishes; scikit-learn recommends using
# LinearRegression for that case instead of Ridge(alpha=0).
# NOTE(review): no feature scaling is applied before this fit, so the penalty
# acts on coefficients of differently scaled columns — confirm this is intended.
ridge_model = Ridge(alpha=0.5)
ridge_model.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Read alpha back from the estimator so the printed label can never drift
# away from the value the model was actually built with.
print(f"MSE for alpha{ridge_model.alpha}: ", mse)

MSE for alpha0.5:  20851241.02521307


In [5]:
# RidgeCV automatically finds the best regularization strength (alpha) among
# a list of candidates using cross-validation.
# (RidgeCV is imported in the notebook's import cell at the top.)

# Candidate alpha values to search over.
a = [0.001, 0.1, 0.15, 0.5, 0.6, 0.8, 1, 2, 3, 5, 10, 20, 30, 40, 50, 100]

ridge_cv_model = RidgeCV(
    alphas=a,
    cv=5,  # 5-fold cross-validation
)

ridge_cv_model.fit(X_train, y_train)

# alpha_ is the candidate selected by cross-validation.
# NOTE(review): in the recorded run the selected alpha is the smallest value in
# the grid, i.e. on its boundary — consider extending the grid downwards.
print("best alpha: ", ridge_cv_model.alpha_)

# Evaluate the selected model on the held-out test split.
y_pred = ridge_cv_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("mse: ", mse)

# We can also calculate the R^2 score.
r2 = r2_score(y_test, y_pred)
print("r2 =", r2)

best alpha:  0.001
mse:  20922417.541201882
r2 = 0.8652329243520386
