In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

In [2]:
# Read the insurance data set (path is relative to this notebook) and preview
# the first five rows.
insurance_csv = "../datasets/insurance.csv"
insurance = pd.read_csv(insurance_csv)
insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# 1

In [3]:
# Linear model for charges: dummy-code the categorical predictors (dropping the
# first level of each), then add a quadratic age term, an obesity indicator
# (BMI >= 30) and the obesity-by-smoking interaction.
y = insurance['charges']
X = (
    pd.get_dummies(insurance.drop(columns=['charges']), drop_first=True)
    .astype(float)
    .assign(
        age_squared=lambda d: d['age'] ** 2,
        bmi_obese=lambda d: (d['bmi'] >= 30).astype(float),
        # Interaction term: equals 1 only for rows that are both obese and smokers.
        obese_smoker=lambda d: d['bmi_obese'] * d['smoker_yes'],
    )
)
# statsmodels does not add an intercept on its own, so include one explicitly.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.866
Model:                            OLS   Adj. R-squared:                  0.865
Method:                 Least Squares   F-statistic:                     781.7
Date:                Thu, 26 Feb 2026   Prob (F-statistic):               0.00
Time:                        15:55:26   Log-Likelihood:                -13131.
No. Observations:                1338   AIC:                         2.629e+04
Df Residuals:                    1326   BIC:                         2.635e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const              134.2509   1362.751  

We again start with the Lantz model from HW4.

In [6]:
# Dichotomize charges at the median: 1 = strictly above the median ("high cost"),
# 0 otherwise.
threshold = y.median()
y_binary = y.gt(threshold).astype(int)

# Logistic regression fitted as a binomial GLM on the same design matrix X
# (intercept included) that the OLS model used.
binomial_family = sm.families.Binomial()
logit_model = sm.GLM(y_binary, X, family=binomial_family).fit()
print(logit_model.summary())

# Exponentiate the log-odds coefficients to read them as odds ratios.
odds_ratios = np.exp(logit_model.params)
print(odds_ratios)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                charges   No. Observations:                 1338
Model:                            GLM   Df Residuals:                     1326
Model Family:                Binomial   Df Model:                           11
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -287.73
Date:                Thu, 26 Feb 2026   Deviance:                       575.46
Time:                        16:03:48   Pearson chi2:                 1.46e+03
No. Iterations:                    25   Pseudo R-squ. (CS):             0.6157
Covariance Type:            nonrobust                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                7.8018      1.395  

For each coefficient below, the three numbers are reported in this order: log-odds, odds ratio, standard error  

**Intercept**: 7.8018, 2,445, 1.395  
The intercept of the model, the baseline when all predictors are zero which
isn't realistic or meaningful in this model. Anchors the regression line. 

**Age**: -0.7961, 0.451, 0.078  
**Age^2**: 0.0126, 1.013, 0.001  
The negative linear term and positive quadratic term imply a U-shaped
relationship in log-odds, with its minimum near age
0.7961 / (2 × 0.0126) ≈ 32. This means at younger ages, the probability
of high cost decreases quickly, but as age increases, the quadratic term
dominates, increasing the probability of a higher cost. Both terms are 
significant and precise (low SE), thus age is an important nonlinear predictor.

**BMI**: 0.0484, 1.05, 0.03  
A one-unit increase in BMI, holding all other predictors constant, increases
the odds of a high-cost medical charge by around 5%. The coefficient is precisely
estimated (low SE) but is not statistically significant, with a p-value
of 0.107.

**Children**: 0.5113, 1.668, 0.09  
Each additional child increases the odds of being high cost by around 67%. This
is a strong, precise, and statistically significant effect.

**Sex (male)**: -0.504, 0.6, 0.219  
Males have around 40% lower odds of being high cost compared to females,
holding all other variables constant. This is a strong, well-estimated,
and statistically significant effect.

**Region**:  
**NW**: -0.4304, 0.65, 0.296  
**SE**: -1.0420, 0.35, 0.312  
**SW**: -0.9675, 0.38, 0.307  
People outside the northeast are generally less likely to be in the high cost
group. These are reasonably precise estimates, however northwest seems to be
not statistically significant due to the high p value of 0.145.

**Smoker (yes)**: 29.0072, 3.959766e+12, 2.49e+04  
The enormous standard error tells us that the model is experiencing
quasi-complete separation. This means that almost every smoker is above
the median charge, thus smoking nearly perfectly predicts high cost. In a logistic
regression, this causes the coefficient to spike, as well as its standard error,
causing the model to become numerically unstable. We can also see this issue
with the interaction term for people who are both obese and smokers. Logically
this also makes sense, as having both of these risk factors generally pushes you
toward higher medical insurance charges.
 
**BMI_obese**: -0.2167, 0.8, 0.357  
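An indicator for whether a person is obese (BMI >= 30). Because the model also
contains the obese-smoker interaction, this coefficient describes obese
non-smokers: the negative sign (odds ratio of about 0.8) points to roughly 20%
lower odds of a high insurance charge, not higher odds. However, the standard error is
larger than the coefficient itself and the effect is not statistically significant
(p-value of 0.543), so there is no evidence of an obesity effect on its own.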

# 2

In [7]:
from sklearn.linear_model import LassoCV, Lasso
from sklearn.preprocessing import StandardScaler

# Post-selection inference:
#   1. choose predictors with a cross-validated lasso,
#   2. refit an unpenalized logistic GLM on the chosen predictors,
#   3. bootstrap the whole select-then-refit pipeline so the standard errors
#      also reflect the variability of the selection step.

# Rebuild the design matrix WITHOUT an intercept so every column can be scaled.
# NOTE: this rebinds `X`. The earlier logit cell expects the version that has the
# `const` column, so re-running that cell after this one would fit a model with
# no intercept.
X = insurance.drop(columns=['charges'])
y = insurance['charges']
X = pd.get_dummies(X, drop_first=True).astype(float)
X['age_squared'] = X['age'] ** 2
X['bmi_obese'] = (X['bmi'] >= 30).astype(float)
X['obese_smoker'] = X['bmi_obese'] * X['smoker_yes']

# The lasso penalty depends on the scale of each column, so standardize first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Selection uses a linear lasso on the 0/1 outcome (`y_binary` comes from the
# logit cell); every column with a non-zero coefficient is kept.
lasso_cv = LassoCV(cv=5, random_state=0).fit(X_scaled, y_binary)
selected = X.columns[lasso_cv.coef_ != 0]

# Refit on the unscaled selected columns so coefficients stay in original units.
X_reduced = sm.add_constant(X[selected])
reduced_model = sm.GLM(y_binary, X_reduced, family=sm.families.Binomial()).fit()

B = 1000
n = len(y_binary)
coef_names = X_reduced.columns

boot_coefs = []

np.random.seed(0)
for _ in range(B):
    # Resample rows with replacement; X and y share the same positions.
    sample_idx = np.random.choice(n, n, replace=True)
    X_boot = X.iloc[sample_idx]
    y_boot = y_binary.iloc[sample_idx]

    # Re-standardize inside the replicate so scaling is part of the pipeline.
    scaler_b = StandardScaler()
    Xb_scaled = scaler_b.fit_transform(X_boot)

    # Reuse the penalty chosen by cross-validation on the full sample.
    lasso_b = Lasso(alpha=lasso_cv.alpha_)
    lasso_b.fit(Xb_scaled, y_boot)
    selected_b = X.columns[lasso_b.coef_ != 0]

    # Nothing selected: there is no model to refit, so drop this replicate.
    if len(selected_b) == 0:
        continue

    Xb_reduced = sm.add_constant(X_boot[selected_b])
    model_b = sm.GLM(y_boot, Xb_reduced, family=sm.families.Binomial()).fit()

    # Align the replicate's coefficients to the reference model's columns:
    # terms not selected in this replicate count as 0, and terms that are not in
    # the reference model are dropped.
    aligned = model_b.params.reindex(coef_names, fill_value=0.0)
    boot_coefs.append(aligned.to_numpy())

boot_coefs = np.array(boot_coefs)
# The bootstrap standard error is the sample SD across replicates, hence ddof=1
# (NumPy's default ddof=0 would give the population SD instead).
boot_ses = pd.Series(boot_coefs.std(axis=0, ddof=1), index=coef_names)

print(reduced_model.summary())
print(boot_ses)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                charges   No. Observations:                 1338
Model:                            GLM   Df Residuals:                     1326
Model Family:                Binomial   Df Model:                           11
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -287.73
Date:                Thu, 26 Feb 2026   Deviance:                       575.46
Time:                        16:55:41   Pearson chi2:                 1.46e+03
No. Iterations:                    25   Pseudo R-squ. (CS):             0.6157
Covariance Type:            nonrobust                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                7.8018      1.395  