In [79]:
import pandas as pd
import numpy as np

**Importing dataset**

In [80]:
df=pd.read_csv("/content/insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [81]:
df.shape

(1338, 7)

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Categorical features:
- sex
- smoker
- region

In [83]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


There are no null values in the data

In [84]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


converting the categorical data into integers

In [88]:
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552
5,31,0,25.74,0,0,1,3756.6216
6,46,0,33.44,1,0,1,8240.5896
7,37,0,27.74,3,0,2,7281.5056
8,37,1,29.83,2,0,3,6406.4107
9,60,0,25.84,0,0,2,28923.13692


Separating Features and target

In [89]:
X=df.drop(columns='charges',axis=1)
Y=df['charges']
print(X)

      age  sex     bmi  children  smoker  region
0      19    0  27.900         0       1       0
1      18    1  33.770         1       0       1
2      28    1  33.000         3       0       1
3      33    1  22.705         0       0       2
4      32    1  28.880         0       0       2
...   ...  ...     ...       ...     ...     ...
1333   50    1  30.970         3       0       2
1334   18    0  31.920         0       0       3
1335   18    0  36.850         0       0       1
1336   21    0  25.800         0       0       0
1337   61    0  29.070         0       1       2

[1338 rows x 6 columns]


In [90]:
df.to_csv("processed_insurance_dataset.csv", index=False)

FEATURE SELECTION

In [122]:
X = df[['age', 'bmi', 'children', 'sex', 'smoker', 'region']].values
y = df['charges'].values

# Shuffle
idx = np.arange(len(X))
np.random.shuffle(idx)
X, y = X[idx], y[idx]

# Split
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Normalize X (train only)
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)

X_train = (X_train - X_mean) / X_std
X_test = (X_test - X_mean) / X_std

# Add bias
X_train = np.c_[np.ones(len(X_train)), X_train]
X_test = np.c_[np.ones(len(X_test)), X_test]

# Scale y
y_mean = y_train.mean()
y_std = y_train.std()

y_train_s = (y_train - y_mean) / y_std


In [123]:
# ================================
# RIDGE REGRESSION
# ================================
lambda_ridge = 0.1
weights_ridge = np.zeros(X_train.shape[1])

lr = 0.001
epochs = 7000
n = len(y_train_s)

for _ in range(epochs):
    y_pred = X_train.dot(weights_ridge)
    error = y_pred - y_train_s

    gradients = (2/n) * X_train.T.dot(error) + 2 * lambda_ridge * weights_ridge
    weights_ridge -= lr * gradients

# Test prediction
y_ridge_pred = (X_test.dot(weights_ridge)) * y_std + y_mean


In [124]:
# ================================
# LASSO REGRESSION
# ================================
lambda_lasso = 0.05
weights_lasso = np.zeros(X_train.shape[1])

for _ in range(epochs):
    y_pred = X_train.dot(weights_lasso)
    error = y_pred - y_train_s

    gradients = (2/n) * X_train.T.dot(error) + lambda_lasso * np.sign(weights_lasso)
    weights_lasso -= lr * gradients

# Test prediction
y_lasso_pred = (X_test.dot(weights_lasso)) * y_std + y_mean


In [125]:
def evaluate(y_true, y_pred, name):
    mse = np.mean((y_true - y_pred) ** 2)
    rmse = np.sqrt(mse)
    r2 = 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)

    print(f"\n{name} RESULTS")
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R2 Score:", r2)

evaluate(y_test, y_ridge_pred, "RIDGE")
evaluate(y_test, y_lasso_pred, "LASSO")



RIDGE RESULTS
MSE: 41848943.39523794
RMSE: 6469.075930551282
R2 Score: 0.760543931522064

LASSO RESULTS
MSE: 40778331.51571486
RMSE: 6385.791377403028
R2 Score: 0.7666698809663585


In [126]:
features = ['Bias','Age','BMI','Children','Sex','Smoker','Region']

print("\nRIDGE COEFFICIENTS")
for f, w in zip(features, weights_ridge):
    print(f"{f}: {w:.3f}")

print("\nLASSO COEFFICIENTS")
for f, w in zip(features, weights_lasso):
    print(f"{f}: {w:.3f}")



RIDGE COEFFICIENTS
Bias: -0.000
Age: 0.267
BMI: 0.154
Children: 0.045
Sex: -0.013
Smoker: 0.724
Region: 0.025

LASSO COEFFICIENTS
Bias: -0.000
Age: 0.274
BMI: 0.140
Children: 0.024
Sex: -0.000
Smoker: 0.771
Region: 0.001


In [127]:
def predict_insurance_regularized(age, bmi, children, sex, smoker, region, model="ridge"):
    # Create input
    X_new = np.array([[age, bmi, children, sex, smoker, region]])

    # Normalize using training stats
    X_new = (X_new - X_mean) / X_std

    # Add bias
    X_new = np.c_[np.ones(X_new.shape[0]), X_new]

    # Choose model
    if model == "ridge":
        y_scaled = X_new.dot(weights_ridge)
    elif model == "lasso":
        y_scaled = X_new.dot(weights_lasso)
    else:
        raise ValueError("Model must be 'ridge' or 'lasso'")

    # Convert back to original scale
    y_pred = y_scaled * y_std + y_mean

    return y_pred[0]


In [128]:
print("RIDGE Prediction:",
      predict_insurance_regularized(30, 25, 1, 0, 0, 2, model="ridge"))

print("LASSO Prediction:",
      predict_insurance_regularized(30, 25, 1, 0, 0, 2, model="lasso"))


RIDGE Prediction: 5255.452273001343
LASSO Prediction: 4818.705619659082


In [129]:
print("RIDGE Prediction:",
      predict_insurance_regularized(45, 32, 2, 1, 1, 1, model="ridge"))

print("LASSO Prediction:",
      predict_insurance_regularized(45, 32, 2, 1, 1, 1, model="lasso"))


RIDGE Prediction: 32056.299809474243
LASSO Prediction: 33268.04244567515


In [130]:
test_cases = [
    [25, 22, 0, 0, 0, 0],
    [40, 28, 2, 1, 1, 3],
    [55, 30, 3, 0, 1, 1]
]

for t in test_cases:
    print("Ridge:", predict_insurance_regularized(*t, model="ridge"),
          "| Lasso:", predict_insurance_regularized(*t, model="lasso"))


Ridge: 2243.507461125242 | Lasso: 2588.2639221848567
Ridge: 30285.635504401595 | Lasso: 31058.85888915464
Ridge: 34461.08521668407 | Lasso: 35253.234287304345
