## Import Libraries

In [52]:
import pandas as pd
import numpy as np

## Load the Data

In [53]:
# Read the COPD dataset from a path relative to the notebook's working directory.
path = r'../Data/synthetic_COPD_data.csv'
try:
    df = pd.read_csv(path)
except Exception as e:
    # Report the problem, then re-raise: continuing would either fail with a
    # confusing NameError on `df` below or silently reuse a stale `df` left
    # in the kernel by an earlier run.
    print(f"Error loading data: {e}")
    raise
else:
    print("Data loaded successfully.")

# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Data loaded successfully.


Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0


## Feature Engineering
### Age Categories

In [54]:
# Assign each row to a ten-year age band. With right=True every bin is
# (lower, upper], so the edges 29/39/.../79 give 30-39, 40-49, ..., 70-79 for
# whole-number ages. Ages outside (29, 79] fall in no bin and become NaN.
df['Age_Category'] = pd.cut(
    x=df['Age'],
    bins=[29, 39, 49, 59, 69, 79],
    right=True,
    labels=['30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79'],
)

### BMI Categories

In [55]:
# Bucket BMI into four weight classes using half-open intervals [lower, upper),
# following the conventional adult cut-offs:
#   Underweight   : BMI < 18.5
#   Normal Weight : 18.5 <= BMI < 25
#   Overweight    : 25   <= BMI < 30
#   Obesity       : BMI >= 30
# right=False keeps a BMI of exactly 18.5 out of "Underweight", and the edges
# 25 / 30 (instead of 24.9 / 29.9) stop values such as 24.95 or 29.95 from
# being pushed into the next class. The open-ended last bin (np.inf) ensures
# that very high BMI values are labelled "Obesity" rather than becoming NaN.
df['BMI_Categories'] = pd.cut(
    df['BMI'],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=['Underweight', 'Normal Weight', 'Overweight', 'Obesity'],
    right=False,
)

### Pollution Risk Score

In [56]:
# Binary flag: 1 when the air pollution level is strictly above 150, else 0.
df['Pollution_Risk_Score'] = df['Air_Pollution_Level'].gt(150).astype(int)

### Encode Smoking Status

In [57]:
# Map smoking status to a numeric score (Never < Former < Current).
# Series.map leaves any status not listed here as NaN.
df['Smoking_Status_encoded'] = df['Smoking_Status'].map(
    {
        'Never': 0,
        'Former': 0.5,
        'Current': 1,
    }
)


### Encode Gender

In [58]:
# Binary-encode gender (Female -> 0, Male -> 1).
# Series.map leaves any value not listed here as NaN.
df['Gender_encoded'] = df['Gender'].map(
    {
        'Female': 0,
        'Male': 1,
    }
)

### Interaction Features

In [59]:
# Element-wise products that pair two predictors into a single feature.
# Smoking score (0 / 0.5 / 1) multiplied by the air pollution level.
df['Smoking_Pollution_interaction'] = df['Smoking_Status_encoded'].mul(df['Air_Pollution_Level'])
# Age multiplied by BMI.
df['Age_BMI_Interaction'] = df['Age'].mul(df['BMI'])


### One-Hot Encoding for Location

In [60]:
# Replace `Location` with one indicator column per category.
# drop_first=True omits the first category, which then acts as the baseline.
df = pd.get_dummies(
    data=df,
    columns=['Location'],
    drop_first=True,
)


## Prepare Machine Learning Data

In [61]:
# Remove columns that should not be in the saved table: the raw text columns
# that now have numeric encodings (Smoking_Status, Gender) and several of the
# derived features created above.
# NOTE(review): presumably the derived features are excluded on purpose for
# this "_bin" variant of the dataset; confirm.
df = df.drop(
    columns=[
        'Smoking_Status',
        'Age_Category',
        'Pollution_Risk_Score',
        'BMI_Categories',
        'Gender',
        'Smoking_Pollution_interaction',
    ]
)


In [62]:
# Summarise the remaining columns, their non-null counts and dtypes as a sanity
# check before saving.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               1000 non-null   int64  
 1   Biomass_Fuel_Exposure             1000 non-null   int64  
 2   Occupational_Exposure             1000 non-null   int64  
 3   Family_History_COPD               1000 non-null   int64  
 4   BMI                               1000 non-null   float64
 5   Air_Pollution_Level               1000 non-null   int64  
 6   Respiratory_Infections_Childhood  1000 non-null   int64  
 7   COPD_Diagnosis                    1000 non-null   int64  
 8   Smoking_Status_encoded            1000 non-null   float64
 9   Gender_encoded                    1000 non-null   int64  
 10  Age_BMI_Interaction               1000 non-null   float64
 11  Location_Biratnagar               1000 non-null   bool   
 12  Locatio

### Save Engineered Data

In [63]:
# Write the engineered frame to a CSV file in the current working directory.
# index=False leaves the row index out of the file.
output_file = 'engineered_COPD_data_bin.csv'
df.to_csv(path_or_buf=output_file, index=False)