In [1]:
# EDA libraries
import pandas as pd 

# Feature engineering
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# Resolve the interim data directory: "data/interim" under the parent of the
# current working directory (nothing is mounted; this is plain path arithmetic).
from pathlib import Path
data_dir_interim = Path('.').resolve().parent.joinpath("data", "interim")

In [2]:
# Load the cleaned dataset from the interim data directory
df = pd.read_csv(data_dir_interim / "cleaned_health_insurance_us_v1.csv")
print(f'Shape of the data: {df.shape}')
# No space after the newline: a leading space is applied only to the first
# (header) line of the rendered frame and pushes it out of alignment with the rows.
print(f'Preview of the data:\n{df.head()}')

Shape of the data: (1337, 7)
Preview of the data:
    age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


Little feature engineering is needed here: we only need to encode the categorical features so that they can be used in the model.

In [3]:
# Summarise every categorical column: distinct values, their frequencies and
# the number of levels (the level count drives the choice of encoding below).
categorical_cols = ['sex', 'smoker', 'region']

print("Categorical Variables Analysis:")
print("=" * 40)
for col in categorical_cols:
    column_values = df[col]  # pull the Series once instead of indexing df three times
    print(f"\n{col.upper()}:")
    print(f"  Unique values: {column_values.unique()}")
    print(f"  Value counts:\n{column_values.value_counts()}")
    print(f"  Number of unique values: {column_values.nunique()}")

Categorical Variables Analysis:

SEX:
  Unique values: ['female' 'male']
  Value counts:
sex
male      675
female    662
Name: count, dtype: int64
  Number of unique values: 2

SMOKER:
  Unique values: ['yes' 'no']
  Value counts:
smoker
no     1063
yes     274
Name: count, dtype: int64
  Number of unique values: 2

REGION:
  Unique values: ['southwest' 'southeast' 'northwest' 'northeast']
  Value counts:
region
southeast    364
southwest    325
northwest    324
northeast    324
Name: count, dtype: int64
  Number of unique values: 4


## Feature Engineering Approaches

Encoding strategy for the categorical variables:
- Binary categorical features (`sex`, `smoker`) - use label encoding, since two categories map directly onto 0/1.
- Multi-class categorical features (`region`) - use either one-hot encoding (avoids implying an order between categories) or label encoding (represents the categories as integers in a single column).

### One-Hot Encoding
- Encode the categorical `region` feature using one-hot encoding

In [4]:
# One-hot variant of the feature set, built from the untouched source frame `df`
print("Direct Transformation")
print("=" * 40)

# sex / smoker have two levels each, so one 0/1 column per feature is enough.
# A separate encoder is kept per feature so its classes_ can be inspected below.
label_encoder_sex = LabelEncoder()
label_encoder_smoker = LabelEncoder()
binary_codes = {
    'sex_encoded': label_encoder_sex.fit_transform(df['sex']),
    'smoker_encoded': label_encoder_smoker.fit_transform(df['smoker']),
}

# region has four levels: expand it into indicator columns so no order is implied.
# drop='first' omits the first level to avoid the dummy variable trap.
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
region_dummies_array = onehot_encoder.fit_transform(df[['region']])
region_dummy_columns = onehot_encoder.get_feature_names_out(['region'])
region_dummies_df = pd.DataFrame(
    region_dummies_array,
    columns=region_dummy_columns,
    index=df.index,  # same index as the source rows so the concat below aligns row by row
)

# Remaining original columns first, then the binary codes, then the region indicators
non_categorical = df.drop(columns=['sex', 'smoker', 'region']).assign(**binary_codes)
df_encoded_v1 = pd.concat([non_categorical, region_dummies_df], axis=1)

print("Encoded data shape:", df_encoded_v1.shape)
print("\nEncoded data preview:")
print(df_encoded_v1.head())

# Recover category -> code by pushing each encoder's own classes_ back through transform
sex_mapping = dict(zip(label_encoder_sex.classes_, label_encoder_sex.transform(label_encoder_sex.classes_)))
smoker_mapping = dict(zip(label_encoder_smoker.classes_, label_encoder_smoker.transform(label_encoder_smoker.classes_)))

print("\nEncoding mappings:")
print(f"Sex mapping: {sex_mapping}")
print(f"Smoker mapping: {smoker_mapping}")
print(f"Region columns created: {list(region_dummy_columns)}")

Direct Transformation
Encoded data shape: (1337, 9)

Encoded data preview:
   age     bmi  children      charges  sex_encoded  smoker_encoded  \
0   19  27.900         0  16884.92400            0               1   
1   18  33.770         1   1725.55230            1               0   
2   28  33.000         3   4449.46200            1               0   
3   33  22.705         0  21984.47061            1               0   
4   32  28.880         0   3866.85520            1               0   

   region_northwest  region_southeast  region_southwest  
0               0.0               0.0               1.0  
1               0.0               1.0               0.0  
2               0.0               1.0               0.0  
3               1.0               0.0               0.0  
4               1.0               0.0               0.0  

Encoding mappings:
Sex mapping: {'female': np.int64(0), 'male': np.int64(1)}
Smoker mapping: {'no': np.int64(0), 'yes': np.int64(1)}
Region columns created

### Label Encoding
- Encode the categorical `region` feature using label encoding

In [5]:
# Label-encoded variant: every categorical feature becomes a single integer column
print("Label Encoding Approach")
print("=" * 40)

# Fresh encoders; label_encoder_sex / label_encoder_smoker are rebound here
label_encoder_sex = LabelEncoder()
label_encoder_smoker = LabelEncoder()
label_encoder_region = LabelEncoder()

# Drop the raw categorical columns and append their integer codes in one step
df_encoded_v2 = df.drop(columns=['sex', 'smoker', 'region']).assign(
    sex_encoded=label_encoder_sex.fit_transform(df['sex']),
    smoker_encoded=label_encoder_smoker.fit_transform(df['smoker']),
    region_encoded=label_encoder_region.fit_transform(df['region']),
)

print("Encoded data shape:", df_encoded_v2.shape)
print("\nEncoded data preview:")
print(df_encoded_v2.head())

# Recover category -> code by pushing each encoder's own classes_ back through transform
sex_mapping = dict(zip(label_encoder_sex.classes_, label_encoder_sex.transform(label_encoder_sex.classes_)))
smoker_mapping = dict(zip(label_encoder_smoker.classes_, label_encoder_smoker.transform(label_encoder_smoker.classes_)))
region_mapping = dict(zip(label_encoder_region.classes_, label_encoder_region.transform(label_encoder_region.classes_)))

print("\nEncoding mappings:")
print(f"Sex mapping: {sex_mapping}")
print(f"Smoker mapping: {smoker_mapping}")
print(f"Region mapping: {region_mapping}")

# Show reverse mapping for region (code -> category), useful for interpretation
region_reverse_mapping = {code: category for category, code in region_mapping.items()}
print(f"Region reverse mapping: {region_reverse_mapping}")

# NOTE: region_dummy_columns is defined in the one-hot encoding cell above,
# so that cell must have been executed before this one.
print("\nComparison:")
print(f"One-Hot Encoding: Creates {len(region_dummy_columns)} columns for region")
print("Label Encoding: Creates 1 column for region (more compact)")
print("Label Encoding avoids multicollinearity but assumes ordinal relationship")

Label Encoding Approach
Encoded data shape: (1337, 7)

Encoded data preview:
   age     bmi  children      charges  sex_encoded  smoker_encoded  \
0   19  27.900         0  16884.92400            0               1   
1   18  33.770         1   1725.55230            1               0   
2   28  33.000         3   4449.46200            1               0   
3   33  22.705         0  21984.47061            1               0   
4   32  28.880         0   3866.85520            1               0   

   region_encoded  
0               3  
1               2  
2               2  
3               1  
4               1  

Encoding mappings:
Sex mapping: {'female': np.int64(0), 'male': np.int64(1)}
Smoker mapping: {'no': np.int64(0), 'yes': np.int64(1)}
Region mapping: {'northeast': np.int64(0), 'northwest': np.int64(1), 'southeast': np.int64(2), 'southwest': np.int64(3)}
Region reverse mapping: {np.int64(0): 'northeast', np.int64(1): 'northwest', np.int64(2): 'southeast', np.int64(3): 'southwest'