# Feature Engineering and Modelling

---

1. Import packages
2. Load data
3. Feature engineering
4. Feature interaction
5. Drop irrelevant features
6. Save the processed data

---

## 1.) Import packages

In [2]:
# Silence FutureWarning messages for the rest of the session so they do not
# clutter cell outputs; other warning categories are still shown.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import os

# Shows plots in jupyter notebook
%matplotlib inline

---
## 2.) Loading data

In [4]:
# Read the processed health dataset into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='../data/processed_data/indian_youngsters_health_data.csv'
)

In [5]:
# Preview the first three rows to sanity-check the loaded columns.
df.iloc[:3]

Unnamed: 0,ID,Age,Gender,Region,Family_Income,Family_History_Diabetes,Parent_Diabetes_Type,Genetic_Risk_Score,BMI,Physical_Activity_Level,...,Smoking,Alcohol_Consumption,Fasting_Blood_Sugar,HbA1c,Cholesterol_Level,Prediabetes,Diabetes_Type,Sleep_Hours,Stress_Level,Screen_Time
0,1,21,Male,North,2209393,No,No Diabetes,6,31.4,Sedentary,...,Yes,No,95.6,9.5,163.3,Yes,No Diabetes,7.7,7,6.8
1,2,18,Female,Central,387650,No,No Diabetes,5,24.4,Active,...,No,No,164.9,5.0,169.1,Yes,No Diabetes,7.9,8,6.0
2,3,25,Male,North,383333,No,No Diabetes,6,20.0,Moderate,...,No,No,110.5,8.3,296.3,Yes,Type 1,7.6,8,4.6


---

## 3.) Feature engineering

### 3.1 Combine BMI and Physical Activity Level

Categorize individuals into a risk category (High, Moderate, or Low Risk) based on their BMI and physical activity level.

In [6]:
def bmi_activity_risk(bmi, activity):
    """Classify one person into a risk category from BMI and activity level.

    Parameters
    ----------
    bmi : number
        Body-mass index of the person.
    activity : str
        Physical activity label; only 'Sedentary' and 'Active' are
        treated specially.

    Returns
    -------
    str
        'High Risk' when BMI is at least 30 and activity is 'Sedentary',
        'Low Risk' when BMI is below 25 and activity is 'Active',
        'Moderate Risk' for every other combination (including a NaN BMI,
        because comparisons against NaN are false).
    """
    high_bmi_sedentary = bmi >= 30 and activity == 'Sedentary'
    if high_bmi_sedentary:
        return 'High Risk'

    low_bmi_active = bmi < 25 and activity == 'Active'
    return 'Low Risk' if low_bmi_active else 'Moderate Risk'

# Label every row by passing its BMI and activity level to bmi_activity_risk.
df['BMI_Activity_Risk'] = df.apply(
    lambda person: bmi_activity_risk(
        person['BMI'], person['Physical_Activity_Level']
    ),
    axis='columns',  # call the function once per row
)

### 3.2 Calculate Age Groups
Group individuals into age categories.

In [7]:
def age_group(age):
    """Map a numeric age to a coarse age-band label.

    Bands: below 20 -> 'Teen'; 20 to 40 inclusive -> 'Young Adult';
    above 40 up to 60 inclusive -> 'Middle Aged'; anything else -> 'Senior'.

    Note that a value for which every comparison is false (e.g. NaN)
    falls through to 'Senior'.
    """
    if age < 20:
        return 'Teen'

    # Inclusive upper bounds, checked in ascending order.
    inclusive_upper_bounds = (
        (40, 'Young Adult'),
        (60, 'Middle Aged'),
    )
    for upper, label in inclusive_upper_bounds:
        if age <= upper:
            return label

    return 'Senior'

# Derive the age-band label for each row from its Age value.
df['Age_Group'] = df['Age'].map(age_group)

### 3.3 Combine Stress and Sleep Hours
Create a stress-to-sleep ratio as a new feature.

In [8]:
# Stress level relative to sleep duration. The divisor is Sleep_Hours + 1 so
# that a Sleep_Hours value of 0 does not cause a division by zero.
df['Stress_Sleep_Ratio'] = df['Stress_Level'].div(df['Sleep_Hours'].add(1))

---
## 4.) Feature Interaction

Combine existing features to create interaction terms.

### 4.1 Interaction: Family History and Genetic Risk Score

In [9]:
# Family_History_Diabetes is loaded as 'Yes'/'No' text, so multiplying it
# directly by the integer score would repeat the string ('No' * 6 ->
# 'NoNoNoNoNoNo') instead of producing a number. Encode it as 1/0 first so the
# interaction equals the genetic risk score for people with a family history
# and 0 for everyone else. Any value other than 'Yes'/'No' becomes NaN.
df['Genetic_Family_Risk'] = (
    df['Family_History_Diabetes'].map({'Yes': 1, 'No': 0})
    * df['Genetic_Risk_Score']
)

### 4.2 Interaction: Smoking and Alcohol Consumption

In [10]:
# Smoking and Alcohol_Consumption are loaded as 'Yes'/'No' text, so adding them
# directly would concatenate the strings ('Yes' + 'No' -> 'YesNo'). Encode each
# as 1/0 and sum them, giving a score of 0, 1 or 2 unhealthy behaviours.
# NOTE(review): assumes both columns only contain 'Yes'/'No'; any other value
# becomes NaN in the score - confirm against the full dataset.
df['Unhealthy_Behavior_Score'] = (
    df['Smoking'].map({'Yes': 1, 'No': 0})
    + df['Alcohol_Consumption'].map({'Yes': 1, 'No': 0})
)

---
## 5.) Drop Irrelevant Features
Remove columns not useful for modeling.

In [11]:
# Remove the ID, Region and Diabetes_Type columns before saving.
# Re-running this cell on the same frame raises KeyError because the columns
# are already gone.
df = df.drop(columns=['ID', 'Region', 'Diabetes_Type'])

---
## 6.) Save the Processed Data
Save the dataset with engineered features.

In [12]:
# Destination for the feature-engineered dataset (relative to the notebook).
save_location = '../data/feature_engineered_data/indian_youngsters_health_data.csv'

# Create the target folder if it is missing; do nothing if it already exists.
os.makedirs(os.path.split(save_location)[0], exist_ok=True)

# Write the frame without its index so no extra index column is stored.
df.to_csv(path_or_buf=save_location, index=False)