In [114]:
import pandas as pd
import numpy as np

In [115]:
# Load the source dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="final_dataset.csv")

In [116]:
# Print column names, non-null counts and dtypes as a sanity check on the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          10500 non-null  int64  
 1   Age                             10500 non-null  int64  
 2   Menopause Age                   10500 non-null  int64  
 3   has_menopauseAge                10500 non-null  int64  
 4   Height                          10500 non-null  float64
 5   Weight                          10500 non-null  float64
 6   BMI                             10500 non-null  float64
 7   Smoker                          10500 non-null  int64  
 8   Alcoholic                       10500 non-null  int64  
 9   Diabetic                        10500 non-null  int64  
 10  Hypothyroidism                  10500 non-null  int64  
 11  Estrogen Use                    10500 non-null  int64  
 12  History of Fracture             

In [117]:
# Preview the first rows (head() returns five rows by default).
df.head()

Unnamed: 0,Gender,Age,Menopause Age,has_menopauseAge,Height,Weight,BMI,Smoker,Alcoholic,Diabetic,Hypothyroidism,Estrogen Use,History of Fracture,Dialysis,Family History of Osteoporosis,Diagnosis
0,0,48,0,0,183.3,113.9,33.9,0,1,1,0,0,0,1,0,0
1,1,49,48,1,164.2,52.3,19.4,1,1,0,1,0,0,0,0,0
2,0,77,0,0,188.0,130.6,37.0,0,0,1,0,0,1,0,1,0
3,0,20,0,0,180.1,77.3,23.8,1,0,1,1,0,0,0,0,0
4,0,40,0,0,173.7,78.0,25.9,0,1,0,0,0,1,0,0,0


## FEATURE ENGINEERING

In [118]:
# Interaction feature: element-wise product of Age and BMI.
df["Age_x_BMI"] = df["Age"].mul(df["BMI"])

In [119]:
def func(age, menopause_age):
  """Return `age - menopause_age`, or None when `menopause_age` is 0.

  A menopause age of 0 is presumably the dataset's placeholder for "no
  menopause age recorded" (verify against the data dictionary); the caller
  is expected to fill the resulting None. The difference is not clamped, so
  it can be negative when `menopause_age` exceeds `age`.
  """
  return age - menopause_age if menopause_age != 0 else None

# Years elapsed since menopause, computed column-wise rather than with a
# row-by-row DataFrame.apply (same values, no Python-level loop, and it also
# works on an empty frame).
#  - rows whose "Menopause Age" is 0 are masked to NaN and then filled with 0
#  - negative differences (menopause age above current age) are clipped to 0
df["years_since_menopause"] = (
    (df["Age"] - df["Menopause Age"])
    .where(df["Menopause Age"] != 0)
    .fillna(0)
    .clip(lower=0)
    .astype(int)
)

In [120]:
def bmi_category(bmi):
  """Map a BMI value to a weight-status label.

  Bands (lower bound inclusive, upper bound exclusive):
    bmi < 18.5        -> "Underweight"
    18.5 <= bmi < 25  -> "Normal"
    25 <= bmi < 30    -> "Overweight"
    bmi >= 30         -> "Obese"

  Missing input (None or NaN) returns NaN instead of a label. Every
  comparison against NaN is False, so without this guard a missing BMI would
  fall through all branches and be labelled "Obese".
  """
  if pd.isna(bmi):
    return np.nan
  if bmi < 18.5:
    return "Underweight"
  elif bmi < 25:
    return "Normal"
  elif bmi < 30:
    return "Overweight"
  else:
    return "Obese"

# Label every row with its BMI band by applying bmi_category element-wise.
df["BMI_category"] = df["BMI"].map(bmi_category)

In [121]:
# One-hot encode the BMI band. The category set is pinned explicitly so that
# all four indicator columns are always produced, in this fixed order, even
# when a band does not occur in the data; otherwise the output schema (and
# any later positional column handling) would depend on the rows present.
bmi_dummies = pd.get_dummies(
    df["BMI_category"].astype(
        pd.CategoricalDtype(["Normal", "Obese", "Overweight", "Underweight"])
    ),
    prefix="BMI_cat",
).astype(int)
df = pd.concat([df, bmi_dummies], axis=1)

In [123]:
# The text label is redundant once the indicator columns exist.
df = df.drop("BMI_category", axis="columns")

In [124]:
# Columns that are summed into the per-row risk_count feature.
risk_features = [
    "Smoker",
    "Alcoholic",
    "Diabetic",
    "Hypothyroidism",
    "History of Fracture",
    "Dialysis",
    "Family History of Osteoporosis",
]

In [125]:
# Row-wise total across the risk columns (presumably 0/1 flags, so this
# counts how many risk factors are present; confirm the encoding).
df["risk_count"] = df.loc[:, risk_features].sum(axis="columns")

Introduce Non-Linearity

In [126]:
# Quadratic terms for Age and BMI.
df["Age_squared"] = df["Age"].pow(2)
df["BMI_squared"] = df["BMI"].pow(2)


In [127]:
# Interaction feature: years since menopause multiplied by the Estrogen Use
# column (zero wherever either factor is zero).
df["menopause_estrogen_interaction"] = df["years_since_menopause"].mul(df["Estrogen Use"])

In [128]:
# Weight divided by height; a height of 0 is swapped for NaN first so the
# ratio becomes NaN rather than inf for such rows.
df["Weight_Height_ratio"] = df["Weight"].div(df["Height"].replace(0, np.nan))

In [129]:
# Body surface area as 0.007184 * Weight^0.425 * Height^0.725.
# NOTE(review): these coefficients match the Du Bois formula, which expects
# weight in kg and height in cm -- confirm the dataset uses those units.
df["Body_Surface_Area"] = 0.007184 * df["Weight"].pow(0.425) * df["Height"].pow(0.725)

In [134]:
# Start from the frame's actual column labels, in their actual order, instead
# of a hand-copied list. The names are later assigned back by position, so a
# copied list that drifts out of sync with the frame would silently mislabel
# columns.
modify_cols = list(df.columns)

In [135]:
# Normalise the names: lowercase everything and turn each space into "_".
modify_cols = ["_".join(name.lower().split(" ")) for name in modify_cols]
print(modify_cols)

['gender', 'age', 'menopause_age', 'has_menopauseage', 'height', 'weight', 'bmi', 'smoker', 'alcoholic', 'diabetic', 'hypothyroidism', 'estrogen_use', 'history_of_fracture', 'dialysis', 'family_history_of_osteoporosis', 'diagnosis', 'age_x_bmi', 'years_since_menopause', 'bmi_cat_normal', 'bmi_cat_obese', 'bmi_cat_overweight', 'bmi_cat_underweight', 'risk_count', 'age_squared', 'bmi_squared', 'menopause_estrogen_interaction', 'weight_height_ratio', 'body_surface_area']


In [136]:
# The generic lowercase/space pass cannot insert the underscore missing from
# "has_menopauseAge". Patch that entry by name rather than by a hard-coded
# position, so the fix still hits the right column if the order changes.
modify_cols = [
    "has_menopause_age" if name == "has_menopauseage" else name
    for name in modify_cols
]

In [137]:
# Assigning df.columns is purely positional: pandas only checks the length,
# so a reordered or stale modify_cols would silently mislabel the data.
# Verify that each new name is the same label as the column it replaces,
# ignoring case, spaces and underscores, before renaming.
_mismatched = [
    (old, new)
    for old, new in zip(df.columns, modify_cols)
    if old.lower().replace(" ", "").replace("_", "")
    != new.lower().replace(" ", "").replace("_", "")
]
if len(modify_cols) != len(df.columns) or _mismatched:
  raise ValueError(
      f"Refusing to rename columns: expected {len(df.columns)} names, "
      f"got {len(modify_cols)}; mismatched (old, new) pairs: {_mismatched}"
  )
df.columns = modify_cols

In [138]:
# Display the column labels to confirm the rename took effect.
df.columns

Index(['gender', 'age', 'menopause_age', 'has_menopause_age', 'height',
       'weight', 'bmi', 'smoker', 'alcoholic', 'diabetic', 'hypothyroidism',
       'estrogen_use', 'history_of_fracture', 'dialysis',
       'family_history_of_osteoporosis', 'diagnosis', 'age_x_bmi',
       'years_since_menopause', 'bmi_cat_normal', 'bmi_cat_obese',
       'bmi_cat_overweight', 'bmi_cat_underweight', 'risk_count',
       'age_squared', 'bmi_squared', 'menopause_estrogen_interaction',
       'weight_height_ratio', 'body_surface_area'],
      dtype='object')

In [139]:
# Write the engineered frame to disk; index=False leaves the row index out of
# the file.
df.to_csv(path_or_buf="final_dataset_engineered.csv", index=False)