In [2]:
import pandas as pd
import numpy as np


In [3]:
data_path = "../data/raw/insurance.csv"
df = pd.read_csv(data_path)

df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df_fe = df.copy()


In [5]:
df_fe["log_charges"] = np.log(df_fe["charges"])


Medical cost data is highly right-skewed. Applying a log transformation reduces skewness and stabilizes variance, improving regression model performance.

In [6]:
df_fe["age_group"] = pd.cut(
    df_fe["age"],
    bins=[18, 30, 45, 60, 100],
    labels=["18-30", "31-45", "46-60", "60+"]
)


Age effects on healthcare cost are not strictly linear; grouping captures lifecycle-based risk differences.

In [7]:
def bmi_category(bmi):
    if bmi < 18.5:
        return "underweight"
    elif bmi < 25:
        return "normal"
    elif bmi < 30:
        return "overweight"
    else:
        return "obese"

df_fe["bmi_category"] = df_fe["bmi"].apply(bmi_category)


BMI categories align with clinical risk stratification used in healthcare analytics.

In [8]:
df_fe["smoker_flag"] = df_fe["smoker"].map({"yes": 1, "no": 0})
df_fe["smoker_bmi_interaction"] = df_fe["smoker_flag"] * df_fe["bmi"]


The combined effect of smoking and high BMI is expected to significantly increase healthcare costs beyond either factor alone.

In [9]:
categorical_cols = ["sex", "region", "age_group", "bmi_category"]

df_encoded = pd.get_dummies(
    df_fe,
    columns=categorical_cols,
    drop_first=True
)


In [10]:
X = df_encoded.drop(columns=["charges", "log_charges"])
y = df_encoded["log_charges"]

X.head(), y.head()


(   age     bmi  children smoker  smoker_flag  smoker_bmi_interaction  \
 0   19  27.900         0    yes            1                    27.9   
 1   18  33.770         1     no            0                     0.0   
 2   28  33.000         3     no            0                     0.0   
 3   33  22.705         0     no            0                     0.0   
 4   32  28.880         0     no            0                     0.0   
 
    sex_male  region_northwest  region_southeast  region_southwest  \
 0     False             False             False              True   
 1      True             False              True             False   
 2      True             False              True             False   
 3      True              True             False             False   
 4      True              True             False             False   
 
    age_group_31-45  age_group_46-60  age_group_60+  bmi_category_obese  \
 0            False            False          False            

In [11]:
df_encoded.to_csv("../data/processed/insurance_feature_engineered.csv", index=False)


Feature Engineering Summary

Feature engineering focused on incorporating domain-informed risk factors relevant to healthcare cost prediction. Age and BMI were transformed into categorical risk groups, and interaction effects between smoking status and BMI were introduced to capture compounding risk. The target variable was log-transformed to address right-skewness in medical costs. Categorical variables were one-hot encoded to prepare the dataset for regression and tree-based models.