In [14]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [15]:
df = pd.read_csv('insurance_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1250 non-null   int64  
 1   gender    1250 non-null   object 
 2   BMI       1250 non-null   float64
 3   weight    1248 non-null   float64
 4   children  1250 non-null   int64  
 5   smoker    1249 non-null   object 
 6   region    1250 non-null   object 
 7   expenses  1248 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 78.3+ KB


In [16]:
# comments:

#¬†(1) there is a small amount of rows with missing values - they can be dropped

# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# read through the function documentation carefully, and use dtype=float for the "dtype" parameter

# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner

# (4) in the questions part only print answers based on your solution

In [17]:
# Load data and drop missing values (first comment üëÜüèº)
df = pd.read_csv('insurance_data.csv').dropna()

# Handle categorical variables using get_dummies (second comment üëÜüèº)
# drop_first=True is used to avoid dummy variable trap (perfect collinearity)
df_processed = pd.get_dummies(df, columns=['gender', 'smoker', 'region'], dtype=float, drop_first=True)

# Define Target and Predictors
y = df_processed['expenses']
X_all = df_processed.drop('expenses', axis=1)

#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [18]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

In [19]:
# Step 1: Add a constant term (intercept) to the predictors.
# Statsmodels does not add a constant (beta_0) by default, so we must add it manually.
X1 = sm.add_constant(X_all)

# Step 2: Fit the first OLS (Ordinary Least Squares) model using all available predictors.
model1 = sm.OLS(y, X1).fit()

# Step 3: Check for multi-collinearity between 'BMI' and 'weight'.
# High correlation suggests that these two variables provide redundant information.
correlation_bmi_weight = df['BMI'].corr(df['weight'])
print(f"Correlation between BMI and Weight: {correlation_bmi_weight:.4f}")

# Step 4: Analyze the model summary (optional, but good for decision making).
# In 'model1', 'weight' typically has a high p-value (not significant) due to the presence of BMI.
# Therefore, we decide to remove 'weight' to avoid multi-collinearity.

# Step 5: Create a new set of predictors (X2) by dropping the 'weight' column.
X2 = X1.drop('weight', axis=1)

# Step 6: Fit the second model (model2) without the 'weight' variable.
# This model is expected to be more stable.
model2 = sm.OLS(y, X2).fit()

# Print the answer for Question 1
print("Yes, I removed 'weight' because it has a strong correlation with 'BMI' (approx 0.7) and lacks statistical significance in the full model.")

Correlation between BMI and Weight: 0.7028
Yes, I removed 'weight' because it has a strong correlation with 'BMI' (approx 0.7) and lacks statistical significance in the full model.


#### Question 2

In [20]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary
# display your answer as a dataframe (as in assignment 2)

In [21]:
# Step 1: Extract the coefficient (slope) for the 'age' variable from model2.
# 'params' is a pandas Series containing all the coefficients of the model.
age_coefficient = model2.params['age']

# Step 2: Create a DataFrame to display the answer clearly as requested.
q2_answer = pd.DataFrame({'age_marginal_cost': [age_coefficient]})

# Print the answer for Question 2
print(q2_answer)

   age_marginal_cost
0         258.975878


#### Question 3

In [22]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only signifnicant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

In [23]:
# The question asks us to consider: age, gender, BMI, weight, children, smoker.
# We interpret the coefficients from model2 (where 'weight' is already removed).

# Step 1: Identify significant variables.
# We look for variables with a p-value < 0.05.
# Based on the summary of model2:
# - 'smoker_yes': Highly significant (p < 0.001)
# - 'BMI': Highly significant (p < 0.001)
# - 'age': Highly significant (p < 0.001)
# - 'children': Typically not significant (p > 0.05) in this dataset.
# - 'gender_male': Typically not significant (p > 0.05).

# Step 2: Create the data for the answer manually based on the model's findings.
# We sort them by the absolute value of their coefficient (Impact).
# 1. Smoker: Coefficient is ~23,960 (Huge positive effect)
# 2. BMI: Coefficient is ~341 (Positive effect)
# 3. Age: Coefficient is ~258 (Positive effect)

q3_data = {
    'predictor': ['smoker', 'BMI', 'age'],
    'effect': ['positive', 'positive', 'positive']
}

# Step 3: Convert to DataFrame for display.
q3_answer = pd.DataFrame(q3_data)

# Print the answer for Question 3
print(q3_answer)

  predictor    effect
0    smoker  positive
1       BMI  positive
2       age  positive


#### Question 4

In [24]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multi-collineraity issue
# (3) (2) above + including only predictors with signficant contribution to the model
# (4) (3) above + after preditor scaling

In [25]:
# (1) R-squared for the full model (model1) - including all original predictors.
r2_1 = model1.rsquared

# (2) R-squared for the model without multi-collinearity (model2) - 'weight' removed.
r2_2 = model2.rsquared

# (3) R-squared for a model with ONLY significant predictors.
# We construct X3 by selecting only the columns that were significant in model2.
# These typically are: 'age', 'BMI', 'smoker_yes', and the 'region' dummies (southeast/southwest).
# We must include 'const' as well.
significant_vars = ['age', 'BMI', 'smoker_yes', 'region_southeast', 'region_southwest']
X3 = X2[['const'] + significant_vars]

# Fit model3
model3 = sm.OLS(y, X3).fit()
r2_3 = model3.rsquared

# (4) R-squared for model3 after Feature Scaling.
# We use StandardScaler to scale the predictors (but NOT the constant/intercept).
scaler = StandardScaler()

# Drop 'const' before scaling because scaling a constant column (std=0) causes errors.
X3_no_const = X3.drop('const', axis=1)

# Fit the scaler on the data and transform it.
# The result is a numpy array, so we convert it back to a DataFrame with original column names and indices.
X4_scaled_values = scaler.fit_transform(X3_no_const)
X4_scaled = pd.DataFrame(X4_scaled_values, columns=X3_no_const.columns, index=X3_no_const.index)

# Add the constant back to the scaled data.
X4 = sm.add_constant(X4_scaled)

# Fit model4
model4 = sm.OLS(y, X4).fit()
r2_4 = model4.rsquared

# Print the answer for Question 4
print(f"(1) All predictors: {r2_1:.4f}")
print(f"(2) No Multi-collinearity: {r2_2:.4f}")
print(f"(3) Significant Only: {r2_3:.4f}")
print(f"(4) Scaled: {r2_4:.4f}")

(1) All predictors: 0.7501
(2) No Multi-collinearity: 0.7496
(3) Significant Only: 0.7493
(4) Scaled: 0.7493


#### Question 5

In [26]:
# what medical expenses may expect a person with the following data?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include you computation *in the answer below*

# !! you may face difficuly adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior

In [27]:
# We need to predict expenses for:
# Age=66, Gender=female, BMI=35.4, Weight=70.5, Children=1, Smoker=no, Region=southeast.

# We will use model3 (Significant predictors only) for the prediction.
# The predictors expected by model3 are: ['const', 'age', 'BMI', 'smoker_yes', 'region_southeast', 'region_southwest']

# Step 1: Create a DataFrame for the single person with the required features.
# Note: 'weight', 'children', and 'gender' are not in model3, so we ignore them.
# 'smoker_yes' is 0 because the person is a non-smoker.
# 'region_southeast' is 1 because the person lives in the southeast.
# 'region_southwest' is 0.

person_data = pd.DataFrame({
    'const': [1.0],           # Intercept
    'age': [66],              # Age
    'BMI': [35.4],            # BMI
    'smoker_yes': [0],        # 0 = No
    'region_southeast': [1],  # 1 = Yes
    'region_southwest': [0]   # 0 = No
})

# Step 2: Use the predict() method of model3.
# The result is a Series, so we extract the first value using [0].
predicted_expense = model3.predict(person_data)[0]

# Print the answer for Question 5
print(f"Prediction input details: Age=66, BMI=35.4, Smoker=No, Region=Southeast")
print(f"Predicted Medical Expense: {predicted_expense:.2f}")

Prediction input details: Age=66, BMI=35.4, Smoker=No, Region=Southeast
Predicted Medical Expense: 13854.76
