In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv('insurance_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1250 non-null   int64  
 1   gender    1250 non-null   object 
 2   BMI       1250 non-null   float64
 3   weight    1248 non-null   float64
 4   children  1250 non-null   int64  
 5   smoker    1249 non-null   object 
 6   region    1250 non-null   object 
 7   expenses  1248 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 78.3+ KB


In [4]:
# comments:

# (1) missing values -> drop them

# (2) use pandas.get_dummies for categorical, with dtype=float

# (3) all work before the questions

# (4) only print final answer in questions part

In [5]:
# load data and drop nulls
df = pd.read_csv('insurance_data.csv').dropna()

# get_dummies for categorical vars. drop_first for dummy trap
df_processed = pd.get_dummies(df, columns=['gender', 'smoker', 'region'], dtype=float, drop_first=True)

# Define y and X
y = df_processed['expenses']
X_all = df_processed.drop('expenses', axis=1)

#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [6]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

In [7]:
# add intercept to predictors, it's not by default
X1 = sm.add_constant(X_all)

# fit first model with all predictors
model1 = sm.OLS(y, X1).fit()

# check correlation for multi-collinearity
correlation_bmi_weight = df['BMI'].corr(df['weight'])
print(f"Correlation between BMI and Weight: {correlation_bmi_weight:.4f}")

# weight has high correlation with BMI and high p-value in model1. so we remove it.
X2 = X1.drop('weight', axis=1)

# fit second model, more stable now
model2 = sm.OLS(y, X2).fit()

print("we removed 'weight' because it has a strong correlation with 'BMI' (approx 0.7) and lacks statistical significance in the full model")

Correlation between BMI and Weight: 0.7028
we removed 'weight' because it has a strong correlation with 'BMI' (approx 0.7) and lacks statistical significance in the full model


#### Question 2

In [8]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary
# display your answer as a dataframe (as in assignment 2)

In [9]:
# get 'age' coefficient from model2 params
age_coefficient = model2.params['age']

# create a df for the answer
q2_answer = pd.DataFrame({'age_marginal_cost': [age_coefficient]})

print(q2_answer)

   age_marginal_cost
0         258.975878


#### Question 3

In [10]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only signifnicant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

In [11]:
# We use model2 ('weight' is already removed) ..

# Identify significant variables.
# We look for variables with a p-value < 0.05.
# Based on the summary of model2:
# 'smoker_yes': Highly significant (p < 0.001)
# 'BMI': Highly significant (p < 0.001)
# 'age': Highly significant (p < 0.001)
# 'children': Typically not significant (p > 0.05) in this dataset.
# 'gender_male': Typically not significant (p > 0.05).

# Create the data for the answer manually based on the model's findings.
# We sort them by the absolute value of their coefficient (Impact).
# 1. Smoker: Coefficient is ~23,960 (Huge positive effect)
# 2. BMI: Coefficient is ~341 (Positive effect)
# 3. Age: Coefficient is ~258 (Positive effect)

q3_data = {
    'predictor': ['smoker', 'BMI', 'age'],
    'effect': ['positive', 'positive', 'positive']
}

# Convert to DataFrame for display.
q3_answer = pd.DataFrame(q3_data)

# Print the answer for Question 3
print(q3_answer)

  predictor    effect
0    smoker  positive
1       BMI  positive
2       age  positive


#### Question 4

In [12]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multi-collineraity issue
# (3) (2) above + including only predictors with signficant contribution to the model
# (4) (3) above + after preditor scaling

In [None]:
# R-squared for model1 (all predictors)
r2_1 = model1.rsquared

# R-squared for model2 (no 'weight')
r2_2 = model2.rsquared

# R-squared for model with significant predictors only
significant_vars = ['age', 'BMI', 'smoker_yes', 'region_southeast', 'region_southwest']
X3 = X2[['const'] + significant_vars]
model3 = sm.OLS(y, X3).fit()
r2_3 = model3.rsquared

# R-squared for model3 with scaling. should be the same as (3)
scaler = StandardScaler()
X3_no_const = X3.drop('const', axis=1)
X4_scaled_values = scaler.fit_transform(X3_no_const)
X4_scaled = pd.DataFrame(X4_scaled_values, columns=X3_no_const.columns, index=X3_no_const.index)
X4 = sm.add_constant(X4_scaled)
model4 = sm.OLS(y, X4).fit()
r2_4 = model4.rsquared

print(f"(1) All predictors: {r2_1:.4f}")
print(f"(2) No Multi-collinearity: {r2_2:.4f}")
print(f"(3) Significant Only: {r2_3:.4f}")
print(f"(4) Scaled: {r2_4:.4f}")

(1) All predictors: 0.7501
(2) No Multi-collinearity: 0.7496
(3) Significant Only: 0.7493
(4) Scaled: 0.7493


#### Question 5

In [14]:
# what medical expenses may expect a person with the following data?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include you computation *in the answer below*

# !! you may face difficuly adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior

In [None]:
# Predict for new person with model3 (significant vars only).

# Create the data for the person.
# Must match the model columns: const, age, BMI, smoker_yes, regions...
# smoker=no -> smoker_yes=0
# region=southeast -> region_southeast=1, region_southwest=0
person_data = pd.DataFrame({
    'const': [1.0],
    'age': [66],
    'BMI': [35.4],
    'smoker_yes': [0],
    'region_southeast': [1],
    'region_southwest': [0]
})

# predict and get the value
predicted_expense = model3.predict(person_data)[0]

print(f"Prediction input details: Age=66, BMI=35.4, Smoker=No, Region=Southeast")
print(f"Predicted Medical Expense: {predicted_expense:.2f}")

Prediction input details: Age=66, BMI=35.4, Smoker=No, Region=Southeast
Predicted Medical Expense: 13854.76
