In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from scipy.stats import norm

In [2]:
# Read the insurance data set from its relative path and preview the
# first five rows.
insurance = pd.read_csv(filepath_or_buffer="../datasets/insurance.csv")
insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# 1

In [3]:
# Baseline fit: regress charges on the three numeric columns only
# (age, bmi, children), leaving out every categorical column.
y = insurance.loc[:, 'charges']
X = sm.add_constant(insurance.loc[:, ['age', 'bmi', 'children']])

model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.120
Model:                            OLS   Adj. R-squared:                  0.118
Method:                 Least Squares   F-statistic:                     60.69
Date:                Fri, 06 Feb 2026   Prob (F-statistic):           8.80e-37
Time:                        10:18:53   Log-Likelihood:                -14392.
No. Observations:                1338   AIC:                         2.879e+04
Df Residuals:                    1334   BIC:                         2.881e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6916.2433   1757.480     -3.935      0.0

We see that the Durbin-Watson (DW) statistic is about 2, which shows no sign of
autocorrelation or nonlinearity in the residuals. The omnibus, skew, and kurtosis
statistics are quite high, however, indicating non-Gaussian residuals. Overall, as
also noted in the previous project, this does not point to an ill-fitted
*non-linear* relationship, but rather to the omission of more important predictor
variables, such as the categorical ones that Lantz finds to be more important to the model.

# 2

In [6]:
# Full model: every column except the response, with the categorical columns
# dummy-encoded (drop_first=True leaves out one level per category as the
# reference) and everything cast to float for statsmodels.
y = insurance.loc[:, 'charges']
X = sm.add_constant(
    pd.get_dummies(insurance.drop(columns=['charges']), drop_first=True).astype(float)
)

model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     500.8
Date:                Fri, 06 Feb 2026   Prob (F-statistic):               0.00
Time:                        10:24:23   Log-Likelihood:                -13548.
No. Observations:                1338   AIC:                         2.711e+04
Df Residuals:                    1329   BIC:                         2.716e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const            -1.194e+04    987.819  

This is a better model, now that it includes all variables. We can see that only `age`, `bmi`,
`children`, and `smoker_yes` are significant predictors. Following Lantz, we can
convert `bmi` to a binary categorical predictor instead, coded `1` for
individuals who are obese (bmi >= 30) and `0` otherwise.

In [8]:
# Reduced model: keep age, children and the smoker dummy, and swap the raw
# bmi value for a 0/1 obesity flag (bmi of 30 or more).
# NOTE: `y` is not assigned in this cell; it reuses the response series
# defined by an earlier cell.
X = (
    pd.get_dummies(insurance.drop(columns=['charges']), drop_first=True)
    .astype(float)
    .loc[:, ['age', 'bmi', 'children', 'smoker_yes']]
    .assign(bmi_obese=lambda frame: (frame['bmi'] >= 30).astype(float))
    .drop(columns=['bmi'])
)
X = sm.add_constant(X)

model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.753
Model:                            OLS   Adj. R-squared:                  0.753
Method:                 Least Squares   F-statistic:                     1018.
Date:                Fri, 06 Feb 2026   Prob (F-statistic):               0.00
Time:                        10:28:54   Log-Likelihood:                -13541.
No. Observations:                1338   AIC:                         2.709e+04
Df Residuals:                    1333   BIC:                         2.712e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4549.7577    531.337     -8.563      0.0

This model barely improves the fit ($R^2$ only moves from 0.751 to 0.753). We can try again by adding an `age^2`
term and an interaction between obesity and smoking, as Lantz does.

In [13]:
# Extended model: same reduced predictor set as before (age, children, smoker
# dummy, 0/1 obesity flag), plus a squared age term and an obesity-by-smoking
# interaction.
# NOTE: `y` is not assigned in this cell; it reuses the response series
# defined by an earlier cell.
X = (
    pd.get_dummies(insurance.drop(columns=['charges']), drop_first=True)
    .astype(float)
    .loc[:, ['age', 'bmi', 'children', 'smoker_yes']]
    .assign(bmi_obese=lambda frame: (frame['bmi'] >= 30).astype(float))
    .drop(columns=['bmi'])
    .assign(
        age_squared=lambda frame: frame['age'] ** 2,
        # Equals 1 only for rows that are both obese and smokers.
        obese_smoker=lambda frame: frame['bmi_obese'] * frame['smoker_yes'],
    )
)
X = sm.add_constant(X)

model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.864
Model:                            OLS   Adj. R-squared:                  0.863
Method:                 Least Squares   F-statistic:                     1404.
Date:                Fri, 06 Feb 2026   Prob (F-statistic):               0.00
Time:                        10:33:28   Log-Likelihood:                -13145.
No. Observations:                1338   AIC:                         2.630e+04
Df Residuals:                    1331   BIC:                         2.634e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         2237.3818   1083.910      2.064   