In [2]:
import zipfile
import os

# Where the uploaded archive lives and where its contents are unpacked.
zip_path = "/content/archive (1).zip"
extract_path = "/mnt/data/insurance_ridge"

# Unpack every member of the archive; the context manager closes the
# zip file handle once extraction finishes.
with zipfile.ZipFile(zip_path, mode="r") as archive:
    archive.extractall(path=extract_path)

# Trailing expression: the cell output lists the extracted entries.
os.listdir(extract_path)

['insurance.csv']

In [3]:
# Recursively scan the extraction directory and report every CSV file found.
for folder, _subdirs, filenames in os.walk(extract_path):
    for filename in filenames:
        if not filename.endswith(".csv"):
            continue
        print("CSV Found:", os.path.join(folder, filename))


CSV Found: /mnt/data/insurance_ridge/insurance.csv


In [4]:
import pandas as pd

# Build the CSV location from the extraction directory defined in the
# unzip cell, so the directory is specified in exactly one place.
# Change the file name here if the archive contains a different CSV.
csv_path = os.path.join(extract_path, "insurance.csv")
df = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column so the indicators are not perfectly collinear.
categorical_columns = ["sex", "smoker", "region"]
df_encoded = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
)

df_encoded.head()


Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,False,True,False,False,True
1,18,33.77,1,1725.5523,True,False,False,True,False
2,28,33.0,3,4449.462,True,False,False,True,False
3,33,22.705,0,21984.47061,True,False,True,False,False
4,32,28.88,0,3866.8552,True,False,True,False,False


In [6]:
# Predictors used by every model below. The encoded sex_* and region_*
# columns exist in df_encoded but are not selected here.
feature_columns = ["age", "bmi", "children", "smoker_yes"]
X = df_encoded[feature_columns]

# Regression target.
y = df_encoded["charges"]


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [8]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit returns the
# estimator itself), then predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)


In [9]:
from sklearn.linear_model import Ridge

# Regularization strengths to compare.
alphas = [0.1, 1, 10, 100]

# Maps each alpha to the test-set predictions of a Ridge model fitted with it.
# NOTE(review): the features are passed to Ridge unscaled, and the L2 penalty
# depends on feature scale; consider standardizing inside a Pipeline.
ridge_predictions = {}
for a in alphas:
    ridge = Ridge(alpha=a).fit(X_train, y_train)
    ridge_predictions[a] = ridge.predict(X_test)


In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate(y_test, y_pred):
    """Score a set of predictions against the true target values.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        A tuple ``(mae, rmse, r2)``: mean absolute error, root mean squared
        error (square root of the mean squared error), and R² score.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    return (
        mean_absolute_error(y_test, y_pred),
        np.sqrt(squared_error),
        r2_score(y_test, y_pred),
    )


In [11]:
# Metric tuples (MAE, RMSE, R2) keyed by a display label, starting with the
# plain linear regression baseline.
results = {"Linear Regression": evaluate(y_test, y_pred_lr)}

# Add one entry per Ridge alpha, in the same order as `alphas`.
results.update(
    (f"Ridge alpha={a}", evaluate(y_test, ridge_predictions[a]))
    for a in alphas
)

# Build with metrics as rows, then transpose so each model is a row.
metric_names = ["MAE", "RMSE", "R2 Score"]
results_df = pd.DataFrame(results, index=metric_names).T
results_df


Unnamed: 0,MAE,RMSE,R2 Score
Linear Regression,4213.798595,5829.378522,0.781115
Ridge alpha=0.1,4214.995041,5829.68038,0.781092
Ridge alpha=1,4225.750279,5832.609778,0.780872
Ridge alpha=10,4330.79636,5880.507334,0.777258
Ridge alpha=100,5204.424227,6951.187294,0.688764


In [12]:
# idxmin returns the index label (the model name) of the smallest RMSE.
# NOTE(review): the ranking uses test-set RMSE, so the winning score is an
# optimistic estimate; a validation split or cross-validation avoids that.
rmse_by_model = results_df["RMSE"]
best_model = rmse_by_model.idxmin()
print("Best Model based on Lowest RMSE:", best_model)


Best Model based on Lowest RMSE: Linear Regression


Linear Regression and Ridge Regression were trained on the insurance dataset.
Ridge Regression was tested with alpha values 0.1, 1, 10, and 100.
The models were evaluated using MAE, RMSE, and R² score.
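The best alpha is the one giving the lowest RMSE and the highest R²; among the values tested, that is alpha = 0.1 (RMSE ≈ 5829.68, R² ≈ 0.7811), and performance degrades steadily as alpha increases.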
From the results table, the best model is Linear Regression (MAE ≈ 4213.80, RMSE ≈ 5829.38, R² ≈ 0.7811), which has the lowest RMSE of all models compared.
