In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Data directories, expressed relative to the notebook's working directory.
# NOTE(review): PROCESSED_PATH is not used by any cell visible here —
# presumably it is where the cleaned inputs are read from; confirm.
PROCESSED_PATH = '../data/processed/'
FEATURES_PATH = '../data/feature_engineered/'

# Make sure the output directory exists. exist_ok=True keeps the cell
# re-runnable: an already-existing directory is not an error.
os.makedirs(name=FEATURES_PATH, exist_ok=True)

## DIABETES DATASET ENGINEERING

In [None]:
# Bucket age into four labelled bands.
#
# Bins are left-closed (right=False) so that every whole-year age lands in the
# band its label names:
#   [0, 30)   -> '<30'
#   [30, 46)  -> '30-45'
#   [46, 61)  -> '46-60'
#   [61, inf) -> '60+'
# With pd.cut's default right-closed bins and edges 0/30/45/60/100, age 30 was
# labelled '<30', and age 0 or any age above 100 silently became NaN.
# NOTE(review): 60 stays in '46-60' and '60+' starts at 61; fractional ages
# between 45-46 and 60-61 fall in the lower band — confirm that is intended.
diabetes['age_group'] = pd.cut(diabetes['age'],
                              bins=[0, 30, 46, 61, np.inf],
                              right=False,
                              labels=['<30', '30-45', '46-60', '60+'])

In [None]:
# Bucket BMI into four labelled categories.
#
# Bins are left-closed (right=False) so a threshold value opens the next
# category, which is how the 18.5 / 25 / 30 cut-offs are conventionally read:
#   [0, 18.5)  -> 'Under'
#   [18.5, 25) -> 'Normal'
#   [25, 30)   -> 'Over'
#   [30, inf)  -> 'Obese'
# pd.cut's default right-closed bins would label BMI 18.5 as 'Under', 25 as
# 'Normal' and 30 as 'Over'. The top bin is open-ended so an extreme BMI is
# still 'Obese' instead of NaN.
diabetes['bmi_category'] = pd.cut(diabetes['bmi'],
                                 bins=[0, 18.5, 25, 30, np.inf],
                                 right=False,
                                 labels=['Under', 'Normal', 'Over', 'Obese'])

In [None]:
# Bucket blood glucose into three labelled bands.
#
# Bins are left-closed (right=False) so a reading that sits exactly on a
# threshold belongs to the higher band:
#   [0, 140)   -> 'Normal'
#   [140, 200) -> 'Prediabetes'
#   [200, inf) -> 'Diabetes'
# pd.cut's default right-closed bins would label 140 as 'Normal' and 200 as
# 'Prediabetes'. The top bin is open-ended so readings above 300 are labelled
# 'Diabetes' instead of becoming NaN.
# NOTE(review): assumes blood_glucose_level is in mg/dL — confirm.
diabetes['glucose_tolerance'] = pd.cut(diabetes['blood_glucose_level'],
                                      bins=[0, 140, 200, np.inf],
                                      right=False,
                                      labels=['Normal', 'Prediabetes', 'Diabetes'])

## STROKE DATASET ENGINEERING

In [None]:
# Hold out 20% of the rows for evaluation, then oversample the minority class
# on the training portion only, so no synthetic rows end up in the test set.
# NOTE(review): X and y are not defined in any cell visible above this one —
# confirm they are built before this cell runs on a fresh kernel.

# Both the split and SMOTE are stochastic; one fixed seed makes a
# Restart-and-Run-All reproduce the same rows.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,  # keep the class ratio the same in train and test
    random_state=RANDOM_STATE,
)

# A float sampling_strategy is the desired minority/majority ratio after
# resampling (binary targets only).
smote = SMOTE(sampling_strategy=0.3, random_state=RANDOM_STATE)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [None]:
# Discretise average glucose into three labelled bands. pd.cut's default
# right-closed intervals apply:
#   (0, 90]    -> 'Low'
#   (90, 160]  -> 'Normal'
#   (160, 300] -> 'High'
# Values outside (0, 300] become NaN.
glucose_edges = [0, 90, 160, 300]
glucose_labels = ['Low', 'Normal', 'High']
stroke_eng['glucose_risk'] = pd.cut(
    stroke_eng['avg_glucose_level'],
    bins=glucose_edges,
    labels=glucose_labels,
)

In [None]:
# Binary flag: 1 when the row has hypertension == 1 or heart_disease == 1,
# otherwise 0.
# Computed column-wise instead of with a row-wise apply(axis=1): the values are
# the same, there is no per-row Python call, and the result is a well-defined
# empty int64 column when the frame has no rows.
has_hypertension = stroke_eng['hypertension'] == 1
has_heart_disease = stroke_eng['heart_disease'] == 1
stroke_eng['bp_risk'] = (has_hypertension | has_heart_disease).astype('int64')

## HEART DISEASE DATASET ENGINEERING

In [None]:
# Bucket resting blood pressure (trestbps) into four labelled stages.
#
# Bins are left-closed (right=False) so a reading that sits exactly on a
# threshold opens the next stage:
#   [0, 120)   -> 'Normal'
#   [120, 130) -> 'Elevated'
#   [130, 140) -> 'Stage1'
#   [140, inf) -> 'Stage2'
# pd.cut's default right-closed bins would label 120 as 'Normal', 130 as
# 'Elevated' and 140 as 'Stage1'. The top bin is open-ended so readings above
# 200 are labelled 'Stage2' instead of becoming NaN.
# NOTE(review): a later cell standardises heart[num_cols] in place; if trestbps
# is in num_cols, this cell must run before it (raw values are expected here).
heart['blood_pressure'] = pd.cut(heart['trestbps'],
                                bins=[0, 120, 130, 140, np.inf],
                                right=False,
                                labels=['Normal', 'Elevated', 'Stage1', 'Stage2'])

In [None]:
# Bucket serum cholesterol (chol) into three labelled bands.
#
# Bins are left-closed (right=False) so a value that sits exactly on a
# threshold belongs to the higher band:
#   [0, 200)   -> 'Normal'
#   [200, 240) -> 'Borderline'
#   [240, inf) -> 'High'
# pd.cut's default right-closed bins would label 200 as 'Normal' and 240 as
# 'Borderline'. The top bin is open-ended so values above 600 are labelled
# 'High' instead of becoming NaN.
# NOTE(review): a later cell standardises heart[num_cols] in place; if chol is
# in num_cols, this cell must run before it (raw values are expected here).
heart['chol_risk'] = pd.cut(heart['chol'],
                           bins=[0, 200, 240, np.inf],
                           right=False,
                           labels=['Normal', 'Borderline', 'High'])

In [None]:
# Standardise the columns listed in num_cols, overwriting them in `heart`.
# The fitted StandardScaler stays bound to `scaler` for any later cell.
# NOTE(review): num_cols is not defined in any cell visible here — confirm it
# is set before this cell runs.
# NOTE(review): the scaler is fitted on the whole frame; if `heart` is split
# into train/test afterwards, fit on the training rows only.
heart_numeric = heart[num_cols]
scaler = StandardScaler()
scaler.fit(heart_numeric)
heart[num_cols] = scaler.transform(heart_numeric)

## CROSS-DATASET ANALYSIS

In [None]:
def _top_correlated(frame, target, n=5):
    """Return the names of the ``n`` columns most correlated with ``target``.

    Correlation is ``DataFrame.corr``'s default (Pearson), ranked by absolute
    value, strongest first.

    Parameters
    ----------
    frame : pd.DataFrame
        Data containing ``target`` and the candidate feature columns.
    target : str
        Name of the numeric (or bool) outcome column.
    n : int, default 5
        Maximum number of column names to return.

    Returns
    -------
    list of str
        Up to ``n`` column names; fewer if the frame has fewer usable columns.

    Raises
    ------
    KeyError
        If ``target`` is missing or is not a numeric/bool column.
    """
    # Restrict to numeric/bool columns: the category columns added by the
    # pd.cut cells make a bare .corr() raise on pandas >= 2.0.
    numeric = frame.select_dtypes(include=['number', 'bool'])
    strength = numeric.corr()[target].abs()
    # Remove the target by name rather than assuming it sorts first, and drop
    # NaN correlations (e.g. constant columns) so they are never listed.
    strength = strength.drop(labels=target).dropna()
    return strength.sort_values(ascending=False).head(n).index.tolist()


# One column per dataset, listing its strongest correlates of the outcome.
# Wrapping each list in a Series lets columns of unequal length be padded with
# NaN instead of making the DataFrame constructor raise.
risk_factors = pd.DataFrame({
    'Diabetes': pd.Series(_top_correlated(diabetes, 'diabetes'), dtype=object),
    'Stroke': pd.Series(_top_correlated(stroke_eng, 'stroke'), dtype=object),
    'Heart': pd.Series(_top_correlated(heart, 'target'), dtype=object),
})