FEATURE ENGINEERING 

| Benefit                        | Description                                                                                  |
| ------------------------------ | -------------------------------------------------------------------------------------------- |
| **Better Model Accuracy**      | Creating more meaningful features helps models make better predictions.                      |
| **Faster Training**            | Informative features give the model a clearer signal, so training often converges faster.    |
| **Handle Real-world Data**     | Most real-world data is messy; feature engineering makes it usable.                          |
| **More Explainable Models**    | Helps you understand *why* a model makes a prediction (useful in interviews and on the job). |
| **Outperform Defaults**        | Often, feature engineering improves performance *more* than just switching algorithms.       |


In [3]:
import pandas as pd

# Load the dataset.
# A raw string keeps the Windows backslashes literal. In a normal string,
# "\M" and "\i" are invalid escape sequences that Python warns about.
df = pd.read_csv(r"C:\ML Projects\ML datasets\insurancecharges.csv")

# View the first few rows
print(df.head())

# View data types and missing values.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()


   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


  df = pd.read_csv("C:\ML Projects\ML datasets\insurancecharges.csv")


In [4]:
def bmi_category(bmi):
    """Map a numeric BMI value to a weight-class label.

    Bands (lower bound inclusive, upper bound exclusive):
    below 18.5 -> 'underweight', 18.5 to 25 -> 'normal',
    25 to 30 -> 'overweight'. Any value that is not below 30 -- including
    NaN, which fails every comparison -- is labelled 'obese'.
    """
    # Upper limits are checked in ascending order, so the first limit that
    # exceeds ``bmi`` identifies its band.
    bands = ((18.5, 'underweight'), (25, 'normal'), (30, 'overweight'))
    for upper_limit, label in bands:
        if bmi < upper_limit:
            return label
    return 'obese'

# Label every row's BMI with its weight class (element-wise call of bmi_category).
df['bmi_category'] = df['bmi'].map(bmi_category)


In [5]:
# Bucket ages into labelled bands. Bins are right-inclusive (pd.cut's default,
# spelled out here): (17, 25], (25, 35], ... (65, 100].
df['age_group'] = pd.cut(
    x=df['age'],
    bins=[17, 25, 35, 45, 55, 65, 100],
    right=True,
    labels=['18–25', '26–35', '36–45', '46–55', '56–65', '65+'],
)


In [6]:
# Binary smoker flag: 1 where smoker == 'yes', 0 for every other value.
# A vectorized comparison replaces the row-by-row Python lambda; the explicit
# int64 cast keeps the column dtype the same on every platform.
df['is_smoker'] = (df['smoker'] == 'yes').astype('int64')


In [7]:
# Integer code for each region, taken from the column's categorical encoding.
df['region_code'] = pd.Categorical(df['region']).codes


In [8]:
# Interaction feature: BMI divided element-wise by age.
df['bmi_age_ratio'] = df['bmi'].div(df['age'])


In [9]:
import numpy as np
# Natural log of the charges column, stored as a new feature.
df['log_charges'] = df['charges'].pipe(np.log)


In [10]:
# Preview the engineered features next to their source columns (first rows only).
print(
    df.head()[[
        'bmi', 'bmi_category',
        'age', 'age_group',
        'is_smoker', 'region_code',
        'bmi_age_ratio', 'log_charges',
    ]]
)


      bmi bmi_category  age age_group  is_smoker  region_code  bmi_age_ratio  \
0  27.900   overweight   19     18–25          1            3       1.468421   
1  33.770        obese   18     18–25          0            2       1.876111   
2  33.000        obese   28     26–35          0            2       1.178571   
3  22.705       normal   33     26–35          0            1       0.688030   
4  28.880   overweight   32     26–35          0            1       0.902500   

   log_charges  
0     9.734176  
1     7.453302  
2     8.400538  
3     9.998092  
4     8.260197  


In [11]:
# One-hot encode the two derived categorical features into a new frame.
# drop_first=True is passed so one indicator per encoded column is omitted.
df_encoded = pd.get_dummies(
    data=df,
    columns=['bmi_category', 'age_group'],
    drop_first=True,
)
