In [83]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import os

## DIABETES DATASET ENGINEERING

In [84]:
# Read the pre-processed diabetes table into a DataFrame.
diabetes_csv_path = '/workspace/COMP-3608---PROJECT/data/processed/processed_diabetes.csv'
diabetes = pd.read_csv(diabetes_csv_path)

In [85]:
# Bucket `age` into four labelled bands.
# pd.cut builds right-closed intervals by default, so the bands are
# (0, 30], (30, 45], (45, 60] and (60, 100]; ages outside (0, 100] become NaN.
age_bin_edges = [0, 30, 45, 60, 100]
age_bin_labels = ['<30', '30-45', '46-60', '60+']
diabetes['age_group'] = pd.cut(diabetes['age'], bins=age_bin_edges, labels=age_bin_labels)

In [86]:
# Derive a BMI category from the numeric `bmi` column.
# right=False makes every bin left-closed, [lo, hi), so a boundary value lands
# in the higher category: 18.5 -> 'Normal', 25 -> 'Over', 30 -> 'Obese'.
# With pd.cut's default right-closed bins, 25.0 was labelled 'Normal' and
# 30.0 was labelled 'Over'.
# The top bin is open-ended (np.inf) so a BMI of 100 or more is still 'Obese'
# instead of silently becoming NaN.
diabetes['bmi_category'] = pd.cut(diabetes['bmi'],
                                  bins=[0, 18.5, 25, 30, np.inf],
                                  right=False,
                                  labels=['Under', 'Normal', 'Over', 'Obese'])

In [87]:
# Band `blood_glucose_level` using thresholds at 140 and 200.
# right=False makes the bins left-closed, [lo, hi), so a reading of exactly 140
# is 'Prediabetes' and exactly 200 is 'Diabetes' (the usual <140 / 140-199 /
# >=200 reading of these labels). With the default right-closed bins those two
# boundary readings were assigned to the lower band.
# The top bin is open-ended (np.inf) so readings of 300 and above stay in
# 'Diabetes' rather than becoming NaN.
diabetes['glucose_tolerance'] = pd.cut(diabetes['blood_glucose_level'],
                                       bins=[0, 140, 200, np.inf],
                                       right=False,
                                       labels=['Normal', 'Prediabetes', 'Diabetes'])

## STROKE DATASET ENGINEERING

In [88]:
# Read the pre-processed stroke table into a DataFrame.
stroke_csv_path = '/workspace/COMP-3608---PROJECT/data/processed/processed_stroke.csv'
Stroke = pd.read_csv(stroke_csv_path)

In [89]:
# Expand each listed categorical column into indicator columns.
# drop_first=True omits the first level of every variable, so k levels yield
# k-1 indicator columns.
stroke_categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
Stroke = pd.get_dummies(Stroke, columns=stroke_categorical_cols, drop_first=True)

In [90]:
# Predictor columns for the stroke model: the numeric measurements first, then
# the indicator columns created by the one-hot encoding step.
stroke_feature_names = [
    'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
    'gender_male', 'gender_other',
    'ever_married_yes',
    'work_type_govt_job', 'work_type_never_worked', 'work_type_private',
    'work_type_self-employed',
    'Residence_type_urban',
    'smoking_status_former', 'smoking_status_never', 'smoking_status_unknown',
]
X = Stroke[stroke_feature_names]

# Target column
y = Stroke['stroke']

In [91]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio of
# `stroke` the same in the train and test partitions, so the test set is
# guaranteed to contain minority-class rows in the original proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class on the training partition only (the test set is
# left untouched) until it reaches 30% of the majority-class count.
# random_state is fixed so the synthetic samples are identical on every run;
# without it each execution produced a different resampled training set.
smote = SMOTE(sampling_strategy=0.3, random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [92]:
# Band `avg_glucose_level` into three risk levels.
# pd.cut's default right-closed intervals apply: (0, 90], (90, 160], (160, 300];
# values outside (0, 300] become NaN.
glucose_bin_edges = [0, 90, 160, 300]
glucose_bin_labels = ['Low', 'Normal', 'High']
Stroke['glucose_risk'] = pd.cut(Stroke['avg_glucose_level'],
                                bins=glucose_bin_edges,
                                labels=glucose_bin_labels)

In [93]:
# Flag rows where `hypertension` or `heart_disease` equals 1 (1 = flagged,
# 0 = neither). The comparison is done column-wise in one vectorised pass
# instead of DataFrame.apply(axis=1), which invoked a Python lambda per row.
has_hypertension = Stroke['hypertension'] == 1
has_heart_disease = Stroke['heart_disease'] == 1
Stroke['bp_risk'] = (has_hypertension | has_heart_disease).astype('int64')

## HEART DISEASE DATASET ENGINEERING

In [94]:
# Read the pre-processed heart-disease table into a DataFrame.
heart_csv_path = '/workspace/COMP-3608---PROJECT/data/processed/processed_heart_disease.csv'
heart = pd.read_csv(heart_csv_path)

In [95]:
# Band resting blood pressure (`trestbps`) using thresholds at 120, 130 and 140.
# right=False makes the bins left-closed, [lo, hi), so readings of exactly
# 120 / 130 / 140 fall into 'Elevated' / 'Stage1' / 'Stage2'. With the default
# right-closed bins each of those boundary readings was placed one band lower.
# The top bin is open-ended (np.inf) so readings of 200 and above stay in
# 'Stage2' rather than becoming NaN.
# NOTE(review): a reading of exactly 0 now falls in 'Normal' (it was NaN
# before); if 0 encodes a missing measurement, clean it upstream - TODO confirm.
heart['blood_pressure'] = pd.cut(heart['trestbps'],
                                 bins=[0, 120, 130, 140, np.inf],
                                 right=False,
                                 labels=['Normal', 'Elevated', 'Stage1', 'Stage2'])

In [96]:
# Band serum cholesterol (`chol`) using thresholds at 200 and 240.
# right=False makes the bins left-closed, [lo, hi), so a value of exactly 200 is
# 'Borderline' and exactly 240 is 'High'. With the default right-closed bins
# those boundary values were placed one band lower.
# The top bin is open-ended (np.inf) so values above 600 stay in 'High' rather
# than becoming NaN.
# NOTE(review): a value of exactly 0 now falls in 'Normal' (it was NaN before);
# if 0 encodes a missing measurement, clean it upstream - TODO confirm.
heart['chol_risk'] = pd.cut(heart['chol'],
                            bins=[0, 200, 240, np.inf],
                            right=False,
                            labels=['Normal', 'Borderline', 'High'])

In [97]:
# Standardise the numeric predictor columns to zero mean and unit variance.
# select_dtypes also returns the `id` column and the `num` target (both appear
# as numeric columns in the encoded frame printed below); scaling an identifier
# or the label is meaningless and corrupts the saved dataset, so both are
# removed from the selection. Columns that are absent are simply not matched.
numeric_cols = heart.select_dtypes(include=['float64', 'int64']).columns
num_cols = numeric_cols[~numeric_cols.isin(['id', 'num'])]

scaler = StandardScaler()
heart[num_cols] = scaler.fit_transform(heart[num_cols])

In [98]:
# One-hot encode every remaining categorical column of `heart` (including the
# engineered 'blood_pressure' and 'chol_risk' bands) so the frame is fully
# numeric. drop_first=True matches the encoding used for the stroke table.
# `heart_encoded` was not assigned anywhere in this notebook, so this cell
# raised NameError on a fresh kernel.
heart_encoded = pd.get_dummies(heart, drop_first=True)

# Check the columns after encoding
print(heart_encoded.columns)


Index(['id', 'age', 'trestbps', 'chol', 'fbs', 'thalch', 'exang', 'oldpeak',
       'num', 'sex_Male', 'dataset_Hungary', 'dataset_Switzerland',
       'dataset_VA Long Beach', 'cp_atypical angina', 'cp_non anginal',
       'cp_typical angina', 'restecg_normal', 'restecg_st-t abnormality',
       'age_group_40-49', 'age_group_50-59', 'age_group_60-69',
       'age_group_70+', 'age_group_<30', 'blood_pressure_Elevated',
       'blood_pressure_Stage1', 'blood_pressure_Stage2',
       'chol_risk_Borderline', 'chol_risk_High'],
      dtype='object')


## CROSS-DATASET ANALYSIS

In [99]:
def _top_correlated_features(frame, target, k=5):
    """Return the names of the `k` columns most strongly correlated with `target`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing `target` and the candidate feature columns.
    target : str
        Name of the column to correlate every other column against.
    k : int, default 5
        Number of feature names to return.

    Returns
    -------
    list of str
        Column names ordered by absolute correlation, strongest first.

    Notes
    -----
    Categorical columns are one-hot encoded (drop_first=True) and everything is
    cast to float so that all columns can enter the correlation matrix. The
    target is removed by name rather than by assuming it sorts into position 0.
    """
    encoded = pd.get_dummies(frame, drop_first=True).astype(float)
    strength = encoded.corr()[target].abs().drop(labels=target)
    return strength.sort_values(ascending=False).head(k).index.tolist()


# Calculate correlation on the encoded data.
# Each dataset is encoded inside the helper, so this cell no longer depends on
# `*_encoded` variables that are not created anywhere in the notebook.
risk_factors = pd.DataFrame({
    'Diabetes': _top_correlated_features(diabetes, 'diabetes'),
    'Stroke': _top_correlated_features(Stroke, 'stroke'),
    'Heart': _top_correlated_features(heart, 'num'),
})

print(risk_factors)


                     Diabetes             Stroke               Heart
0  glucose_tolerance_Diabetes                age             oldpeak
1         blood_glucose_level      age_group_60+     dataset_Hungary
2                 hba1c_level            bp_risk               exang
3                         age  glucose_risk_High              thalch
4               age_group_60+      heart_disease  cp_atypical angina


In [100]:
# Save the newly constructed feature datasets as CSV files

# All three tables are written to the same folder. Create it first so to_csv
# does not fail when the directory does not exist yet.
output_dir = '/workspace/COMP-3608---PROJECT/data/feature_engineering'
os.makedirs(output_dir, exist_ok=True)

# File names are kept exactly as before (written without a '.csv' extension) so
# that anything reading these paths keeps working.
engineered_tables = {
    'diabetes_feature_engineering': diabetes,   # diabetes dataset
    'Stroke_feature_engineering': Stroke,       # stroke dataset
    'heart_feature_engineering': heart,         # heart dataset
}
for file_name, table in engineered_tables.items():
    table.to_csv(output_dir + '/' + file_name, index=False)

# Output message to confirm saving
print("Datasets saved successfully!")


Datasets saved successfully!
