# Featured Engineering Based on Scaling and Encoding
# Step 1: Import necessary libraries and data

In [45]:
# Necessary Libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [46]:
#Importing the data 
data = pd.read_csv('../Data/synthetic_COPD_data.csv') 

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               1000 non-null   int64  
 1   Gender                            1000 non-null   object 
 2   Smoking_Status                    1000 non-null   object 
 3   Biomass_Fuel_Exposure             1000 non-null   int64  
 4   Occupational_Exposure             1000 non-null   int64  
 5   Family_History_COPD               1000 non-null   int64  
 6   BMI                               1000 non-null   float64
 7   Location                          1000 non-null   object 
 8   Air_Pollution_Level               1000 non-null   int64  
 9   Respiratory_Infections_Childhood  1000 non-null   int64  
 10  COPD_Diagnosis                    1000 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB


# Step 2: Encode categorical variables


In [47]:
label_encoder = LabelEncoder()
data['Gender'] = label_encoder.fit_transform(data['Gender'])
data['Smoking_Status'] = label_encoder.fit_transform(data['Smoking_Status'])
data['Location'] = label_encoder.fit_transform(data['Location'])


# One-hot encode Location
data = pd.get_dummies(data, columns=['Location'], drop_first=True)

# Step 6: Create interaction features

In [48]:
data['Age_BMI_Interaction'] = data['Age'] * data['BMI']
data['Biomass_Fuel_Smoking_Interaction'] = data['Biomass_Fuel_Exposure'] * data['Smoking_Status']

# Step 7: Normalize numerical features

In [49]:
scaler = StandardScaler()
numerical_features = ['Age', 'Biomass_Fuel_Exposure', 'Occupational_Exposure',
                     'Family_History_COPD', 'BMI', 'Air_Pollution_Level',
                     'Respiratory_Infections_Childhood']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

In [50]:
# Ensure target variable is binary
data['COPD_Diagnosis'] = data['COPD_Diagnosis'].astype(int)

# Verify shape before fitting RFE
X = data.drop('COPD_Diagnosis', axis=1)
y = data['COPD_Diagnosis']

# Step 9: Feature selection

In [51]:
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
if len(data.columns) - 1 == len(fit.support_):
    selected_features = X.columns[fit.support_].tolist()
    print("Selected Features: ", selected_features)
else:
    print("Mismatch in feature selection; check the number of features and RFE output.")


Selected Features:  ['Smoking_Status', 'Biomass_Fuel_Exposure', 'Family_History_COPD', 'Location_1', 'Location_6']


# Step 11: Save the transformed DataFrame to a CSV file

In [53]:
output_file = 'featured_data.csv'  
data.to_csv(output_file, index=False)