In [51]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [52]:
from sklearn.preprocessing import StandardScaler

def fit_model(predictors, real_values):
    """Fit an ordinary-least-squares regression with an intercept.

    Args:
        predictors: design matrix without the intercept column; a constant
            column is prepended via ``sm.add_constant`` before fitting.
        real_values: observed values of the dependent variable.

    Returns:
        The fitted statsmodels results object (``sm.OLS(...).fit()``).
    """
    return sm.OLS(real_values, sm.add_constant(predictors)).fit()

# Load the raw insurance records (path is relative to the notebook's working directory).
df = pd.read_csv('insurance_data.csv')

# One-hot encode the categorical columns, dropping the first level of each one.
# dtype=float keeps every column numeric: pandas >= 2.0 emits bool dummies by
# default, and a design matrix mixing bool and float columns is cast to object
# dtype, which makes sm.OLS raise a ValueError.
df_encoded = pd.get_dummies(df, drop_first=True, dtype=float)
# NOTE(review): get_dummies encodes a missing categorical value as all-zero
# dummies, so this dropna() only removes rows whose numeric columns are missing -
# confirm that no categorical column of the CSV contains NaNs.
df_encoded = df_encoded.dropna()

# Dependent variable shared by all four model versions.
real_values = df_encoded['expenses']

# (1) all predictors from the csv file
predictors = df_encoded.drop('expenses', axis=1)
res1 = fit_model(predictors, real_values)

print(res1.summary())

# (2) multi-collinearity: weight is dropped (see the Question 1 answer for the reason)
predictors = predictors.drop('weight', axis=1)
res2 = fit_model(predictors, real_values)

# (3) keep only the predictors with a significant contribution
predictors = predictors.drop(['region_northwest', 'children', 'gender_male'], axis=1)
res3 = fit_model(predictors, real_values)

# (4) same predictors as (3), standardized; wrap the scaled array back into a
# DataFrame so the column names and row index stay aligned with real_values
scalar = StandardScaler()
scalar_scaled = pd.DataFrame(
    scalar.fit_transform(predictors),
    columns=predictors.columns,
    index=predictors.index,
)
res4 = fit_model(scalar_scaled, real_values)


                            OLS Regression Results                            
Dep. Variable:               expenses   R-squared:                       0.750
Model:                            OLS   Adj. R-squared:                  0.748
Method:                 Least Squares   F-statistic:                     412.5
Date:                Thu, 22 May 2025   Prob (F-statistic):               0.00
Time:                        19:25:50   Log-Likelihood:                -12613.
No. Observations:                1246   AIC:                         2.525e+04
Df Residuals:                    1236   BIC:                         2.530e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const            -1.436e+04   1111.643  

In [53]:
# comments:

# (1) there are a small number of rows with missing values - they can be dropped

# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# read through the function documentation carefully, and use dtype=float for the "dtype" parameter

# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner

# (4) in the questions part only print answers based on your solution

#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [54]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

# One-sentence answer to Question 1, printed as the assignment requires.
answer_q1 = "i did removed the weight predicator because of its correlation with the BMI predicator - there is a direct use of weight in bmi calculations"
print(answer_q1)

i did removed the weight predicator because of its correlation with the BMI predicator - there is a direct use of weight in bmi calculations


#### Question 2

In [55]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary (after taking care of multi-collinearity)
# display your answer as a dataframe (as in assignment 2)
# Value for one additional year of age, hardcoded on purpose as the question
# asks (taken from the regression summary after handling multi-collinearity).
age_effect_per_year = 259.2854
result = [("amount", age_effect_per_year)]
pd.DataFrame(result)

Unnamed: 0,0,1
0,amount,259.2854


#### Question 3

In [56]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only significant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value
# Significant predictors, listed from highest to lowest contribution; every
# one of them is reported with a positive effect on the medical expenses.
significant_predictors = ["smoker", "age", "BMI"]
result = [(name, "positive") for name in significant_predictors]
pd.DataFrame(result, columns=["predictor", "effect"])

Unnamed: 0,predictor,effect
0,smoker,positive
1,age,positive
2,BMI,positive


#### Question 4

In [57]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multi-collinearity issue
# (3) (2) above + including only predictors with significant contribution to the model
# (4) (3) above + after predictor scaling
# Pair each model version's label with its fitted results object, in the
# order (1)-(4) listed in the question, and report R-squared for each.
version_labels = ["all", "multi-collinearity", "significant", "scaling"]
fitted_models = [res1, res2, res3, res4]
result = [(label, fitted.rsquared) for label, fitted in zip(version_labels, fitted_models)]
pd.DataFrame(result, columns=["version", "r-squared"])

Unnamed: 0,version,r-squared
0,all,0.750237
1,multi-collinearity,0.749695
2,significant,0.749442
3,scaling,0.749442


#### Question 5

In [58]:
# what medical expenses may a person with the following data expect?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include your computation *in the answer below* using model (3) from Question 4

# !! you may face difficulty adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior

In [59]:
# Single-row design matrix for the person described in Question 5. Only the
# predictors kept in model (3) are supplied: weight, children, gender_male and
# region_northwest were dropped from that model; region=southeast sets the
# region_southeast dummy to 1 and smoker=no sets smoker_yes to 0.
predictors_test = pd.DataFrame({
    "age": [66],
    "BMI": [35.4],
    "smoker_yes": [0],
    "region_southeast": [1],
    "region_southwest": [0],
})
# has_constant='add' forces the intercept column: with a single row every
# column looks constant, so the default behaviour would skip adding it.
predictors_test = sm.add_constant(predictors_test, has_constant='add')
# predict() on a non-formula model matches columns by position, not by name,
# so order them exactly as the model was fitted (raises KeyError if one is missing).
predictors_test = predictors_test[res3.model.exog_names]
prediction = res3.predict(predictors_test)
# Show the inputs next to the predicted expenses.
result = predictors_test.assign(prediction=prediction)
result

Unnamed: 0,const,age,BMI,smoker_yes,region_southeast,region_southwest,prediction
0,1.0,66,35.4,0,1,0,13856.508189
