In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Read the raw health dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='healthdata.csv')

# Preview the first five rows to sanity-check the columns and dtypes.
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Feature Engineering (Handling Categorical and Numerical Data)


## One-Hot Encoding (Categorical)
#### Most machine learning models require numerical input, so categorical columns must be converted to numbers first.
#### One-hot encoding represents each categorical variable as a set of binary (0/1) columns, without implying any ordinal relationship between categories.

In [18]:


# Load the dataset (re-read here so this cell can be run on its own)
data = pd.read_csv('healthdata.csv')

# Select categorical columns for one-hot encoding
categorical_columns = ['sex', 'smoker', 'region']

# Initialize OneHotEncoder with dense output.
# drop='first' removes the first category of each feature so the resulting
# columns are not perfectly collinear.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4; try the new name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Apply OneHotEncoder
encoded_data = encoder.fit_transform(data[categorical_columns])

# Convert encoded data into a DataFrame.
# Reuse data's index so the column-wise concat below aligns row-for-row even
# if `data` does not carry a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Concatenate with original dataset (excluding original categorical columns)
final_data = pd.concat([data.drop(columns=categorical_columns), encoded_df], axis=1)

# Display the final dataset after one-hot encoding
print(final_data.head())

   age     bmi  children      charges  sex_male  smoker_yes  region_northwest  \
0   19  27.900         0  16884.92400       0.0         1.0               0.0   
1   18  33.770         1   1725.55230       1.0         0.0               0.0   
2   28  33.000         3   4449.46200       1.0         0.0               0.0   
3   33  22.705         0  21984.47061       1.0         0.0               1.0   
4   32  28.880         0   3866.85520       1.0         0.0               1.0   

   region_southeast  region_southwest  
0               0.0               1.0  
1               1.0               0.0  
2               1.0               0.0  
3               0.0               0.0  
4               0.0               0.0  




## Standardization (Numerical)
#### Without scaling, features with larger ranges may dominate smaller-range features in certain machine learning algorithms (for example, distance-based or gradient-based methods).


In [19]:
# Identify numerical features to scale (the target column `charges` is left as-is)
numerical_features = ['age', 'bmi', 'children']

# Start from a fresh copy of the one-hot encoded frame built in the previous
# cell. `data_encoded` was previously used without ever being defined, which
# raises NameError on a fresh kernel. Copying also keeps `final_data` unscaled
# and makes this cell safe to re-run.
data_encoded = final_data.copy()

# Scale numerical features to zero mean and unit variance
# NOTE(review): the scaler is fit on the full dataset; if a train/test split
# follows, fit on the training portion only to avoid leakage.
scaler = StandardScaler()
data_encoded[numerical_features] = scaler.fit_transform(data_encoded[numerical_features])
data_encoded[numerical_features]

Unnamed: 0,age,bmi,children
0,-1.438764,-0.453320,-0.908614
1,-1.509965,0.509621,-0.078767
2,-0.797954,0.383307,1.580926
3,-0.441948,-1.305531,-0.908614
4,-0.513149,-0.292556,-0.908614
...,...,...,...
1333,0.768473,0.050297,1.580926
1334,-1.509965,0.206139,-0.908614
1335,-1.509965,1.014878,-0.908614
1336,-1.296362,-0.797813,-0.908614
