In [75]:
from pathlib import Path

import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split

# Download latest version
path = kagglehub.dataset_download("kukuroo3/body-signal-of-smoking")

# Join the download directory and file name with pathlib rather than string
# concatenation, so the separator is handled for us.
df = pd.read_csv(Path(path) / "smoking.csv")

# Drop the `oral` and `ID` columns.
df = df.drop(columns=["oral", "ID"])

# Encode the two categorical columns as 0/1.
# NOTE(review): tartar is encoded Y -> 0 / N -> 1, i.e. 1 means "no tartar",
# the opposite orientation to gender (M -> 1). Kept as-is; confirm it is intended.
tartar_codes = {"Y": 0, "N": 1}
gender_codes = {"F": 0, "M": 1}
df["tartar"] = df["tartar"].map(tartar_codes)
df["gender"] = df["gender"].map(gender_codes)

# Series.map() yields NaN for any value missing from the mapping; fail loudly
# here instead of letting NaNs flow silently into the resampling and the CSVs.
unmapped = df[["tartar", "gender"]].isna().sum()
if unmapped.any():
    raise ValueError(f"unmapped category values after encoding:\n{unmapped}")

In [79]:
# Hold out a check set of 5000 rows, stratified on the `smoking` label;
# everything else becomes the working frame that the following cells use.
smoking_label = df["smoking"]
df_work, df_check = train_test_split(
    df, test_size=5000, stratify=smoking_label, random_state=5510
)
df = df_work

In [80]:
#equalise smokers per gender

def _resample_groups_to_largest(frame, by, seed):
    """Resample every `by` group of `frame` to the size of its largest group.

    Each group (the largest one included) is drawn with replacement, so the
    result holds the same number of rows for every value of `by`.

    Parameters:
        frame: DataFrame to balance.
        by: name of the column whose groups are equalised.
        seed: random_state passed to the sampler.

    Returns:
        (balanced, target): the resampled frame with a fresh 0..n-1 index, and
        the per-group row count that was used.
    """
    target = frame[by].value_counts().max()
    balanced = (
        frame.groupby(by)
             .sample(n=target, replace=True, random_state=seed)
             .reset_index(drop=True)
    )
    return balanced, target


# Work on each gender separately (0 = F, 1 = M after the encoding step).
df_f = df[df["gender"] == 0]
df_m = df[df["gender"] == 1]

# Same procedure and seed for both genders; previously two copy-pasted blocks.
df_f_equal, f_smoke = _resample_groups_to_largest(df_f, "smoking", 5510)
df_m_equal, m_smoke = _resample_groups_to_largest(df_m, "smoking", 5510)

# NOTE(review): sampling with replacement duplicates rows, so any later
# train/validation split of this frame can put copies of one row on both sides.
df = pd.concat([df_f_equal, df_m_equal], axis=0).reset_index(drop=True)

In [81]:
# Balance the genders: draw each gender group, with replacement, up to the
# row count of the larger group, then continue with the balanced frame.
by_gender = df.groupby("gender")
gender_count = by_gender.size().max()

df_gender_equal = by_gender.sample(
    n=gender_count, replace=True, random_state=1
).reset_index(drop=True)

df = df_gender_equal

In [82]:
# Write the working set and the held-out check set to CSV.
# NOTE(review): the cells that add pulse_pressure / map / BMI / Deurenberg sit
# below this one in the notebook although their execution counts (76-78)
# precede the split; on a top-to-bottom run the files written here will not
# contain those columns. Confirm the intended cell order.

# to_csv does not create missing directories, so make sure the target exists.
out_dir = Path("./dataset")
out_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(out_dir / "dataset.csv", index=False)

df_check.to_csv(out_dir / "check.csv", index=False)

In [35]:
# Per-column standard deviation of the current frame.
column_std = df.std()
print(column_std)

gender                  0.481250
age                    12.071418
height(cm)              9.194597
weight(kg)             12.820306
waist(cm)               9.274223
eyesight(left)          0.486873
eyesight(right)         0.485964
hearing(left)           0.157902
hearing(right)          0.159564
systolic               13.675989
relaxation              9.679278
fasting blood sugar    20.795591
Cholesterol            36.297940
triglyceride           71.639817
HDL                    14.738963
LDL                    40.926476
hemoglobin              1.564498
Urine protein           0.404882
serum creatinine        0.221524
AST                    19.355460
ALT                    30.947853
Gtp                    50.290539
dental caries           0.409665
tartar                  0.496908
smoking                 0.482070
dtype: float64


In [12]:
# Per-column mean of the current frame.
column_mean = df.mean()
print(column_mean)

gender                   0.635657
age                     44.182917
height(cm)             164.649321
weight(kg)              65.864936
waist(cm)               82.046418
eyesight(left)           1.012623
eyesight(right)          1.007443
hearing(left)            1.025587
hearing(right)           1.026144
systolic               121.494218
relaxation              76.004830
fasting blood sugar     99.312325
Cholesterol            196.901422
triglyceride           126.665697
HDL                     57.290347
LDL                    114.964501
hemoglobin              14.622592
Urine protein            1.087212
serum creatinine         0.885738
AST                     26.182935
ALT                     27.036037
Gtp                     39.952201
dental caries            0.213334
tartar                   0.444444
smoking                  0.367288
pule_pressure           45.489388
map                     91.167959
BMI                     24.165677
Deurenberg              26.895789
dtype: float64

In [3]:
# Keep only the smokers (smoking == 1) from the current set.
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on a filtered slice.
# NOTE(review): this rebinds `df`; running the non-smoker cell right after this
# one filters the already-filtered frame and leaves it empty. Run one or the other.
df = df[df["smoking"] == 1].copy()

In [4]:
# Keep only the non-smokers (smoking == 0) from the current set.
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on a filtered slice.
# NOTE(review): this rebinds `df`; if the smoker cell was run just before,
# nothing matches here and the frame ends up empty. Run one or the other.
df = df[df["smoking"] == 0].copy()

In [76]:
# Pulse pressure feature: systolic reading minus the `relaxation` reading, row by row.
df["pulse_pressure"] = df["systolic"].sub(df["relaxation"])

In [77]:
# MAP (mean arterial pressure): the `relaxation` reading plus one third of the
# systolic-minus-relaxation difference.
pressure_gap = df["systolic"] - df["relaxation"]
df["map"] = df["relaxation"] + pressure_gap / 3

In [78]:
# BMI: weight in kg over height in metres squared (cm^2 / 10000 converts to m^2).
height_m_squared = df["height(cm)"] * df["height(cm)"] / 10000
df["BMI"] = df["weight(kg)"] / height_m_squared

# Deurenberg formula: 1.2*BMI + 0.23*age - 10.8*gender - 5.4,
# with gender being the 0/1 column of this frame.
bmi_term = 1.2 * df["BMI"]
age_term = 0.23 * df["age"]
gender_term = 10.8 * df["gender"]
df["Deurenberg"] = bmi_term + age_term - gender_term - 5.4

In [62]:
import numpy as np

# Triglyceride feature engineering: store log(1 + triglyceride) as a new column.
df["log_triglyceride"] = df["triglyceride"].pipe(np.log1p)

# Candidate lipid-ratio features, currently disabled:
#df["tri_hdl_ratio"] = df["triglyceride"] / df["HDL"]
#df["cho_hdl_ratio"] = df["Cholesterol"] / df["HDL"]
#df["ldl_hdl_ratio"] = df["LDL"] / df["HDL"]

In [63]:
# "non-hdl" feature
# Element-wise Cholesterol minus HDL, stored as a new column.
df["non_HDL"] = df["Cholesterol"].sub(df["HDL"])