In [60]:
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split

# Download latest version
path = kagglehub.dataset_download("kukuroo3/body-signal-of-smoking")

df = pd.read_csv(path+"/smoking.csv")
df = df.drop(columns = ["oral", "ID"])
df["tartar"] = df["tartar"].map({"Y": 0, "N": 1})
df["gender"] = df["gender"].map({"F": 0, "M": 1})

In [61]:
# split off check set
df_work, df_check = train_test_split(
    df, 
    test_size=5000,
    stratify=df["smoking"],
    random_state=5510
)
df = df_work

In [62]:
def equalise_set(dataframe):

    #equalise smokers per gender
    
    df_f = dataframe[dataframe["gender"] == 0]
    df_m = dataframe[dataframe["gender"] == 1]
    
    
    f_smoke = df_f["smoking"].value_counts().max()
    
    df_f_equal = (
        df_f.groupby("smoking")
            .sample(n=f_smoke, replace=True, random_state=5510)
            .reset_index(drop=True)
    )
    
    
    m_smoke = df_m["smoking"].value_counts().max()
    
    df_m_equal = (
        df_m.groupby("smoking")
            .sample(n=m_smoke, replace=True, random_state=5510)
            .reset_index(drop=True)
    )
    
    dataframe = pd.concat([df_f_equal, df_m_equal], axis=0).reset_index(drop=True)
    
    #equalise gender
    
    gender_count = dataframe["gender"].value_counts().max()
    
    df_gender_equal = (
        df.groupby("gender")
            .sample(n=gender_count, replace=True, random_state=1)
            .reset_index(drop=True)
    )
    
    return dataframe
df = equalise_set(df)
#df_check = equalise_set(df_check)

In [58]:
# print to csv
# call this to print a final dataset 

df.to_csv("./dataset/dataset.csv", index=False)

df_check.to_csv("./dataset/check.csv", index=False)

In [5]:
print(df.std())

gender                  0.500000
age                    11.389555
height(cm)              9.138357
weight(kg)             12.870101
waist(cm)               9.489115
eyesight(left)          0.470108
eyesight(right)         0.473516
hearing(left)           0.158318
hearing(right)          0.163005
systolic               13.902533
relaxation              9.814148
fasting blood sugar    20.903902
Cholesterol            36.206373
triglyceride           71.583535
HDL                    15.020956
LDL                    38.719170
hemoglobin              1.540204
Urine protein           0.405325
serum creatinine        0.211344
AST                    16.109945
ALT                    29.588852
Gtp                    48.612624
dental caries           0.412571
tartar                  0.494615
smoking                 0.500004
dtype: float64


In [59]:
print(df.mean())

gender                   0.000000
age                     47.685614
height(cm)             156.080348
weight(kg)              56.226127
waist(cm)               77.017903
eyesight(left)           0.910744
eyesight(right)          0.916508
hearing(left)            1.025822
hearing(right)           1.029410
systolic               117.204147
relaxation              73.456718
fasting blood sugar     97.934512
Cholesterol            200.320940
triglyceride           113.822522
HDL                     63.739801
LDL                    113.899508
hemoglobin              13.313617
Urine protein            1.082721
serum creatinine         0.735120
AST                     23.531981
ALT                     20.029693
Gtp                     29.545824
dental caries            0.202650
tartar                   0.417477
smoking                  0.500000
dtype: float64


In [42]:
#introduce blood pressure difference
df["pulse_pressure"] = df["systolic"] - df["relaxation"]

In [43]:
#introduce MAP (mean arterial pressure)
df["map"] = df["relaxation"] + (df["systolic"] - df["relaxation"]) / 3

In [44]:
#introduce BMI and deurenberg formula
df["BMI"] = df["weight(kg)"] / (df["height(cm)"] * df["height(cm)"] / 10000)

df["Deurenberg"] = (1.2*df["BMI"]) + (0.23 * df["age"]) - (10.8 * df["gender"]) - 5.4

In [19]:
import numpy as np

#triglycerides feature engineering
df["log_triglyceride"] = np.log1p(df["triglyceride"])

df["tri_hdl_ratio"] = df["triglyceride"] / df["HDL"]
df["cho_hdl_ratio"] = df["Cholesterol"] / df["HDL"]

df["ldl_hdl_ratio"] = df["LDL"] / df["HDL"]

In [63]:
# "non-hdl" feature
df["non_HDL"] = df["Cholesterol"] - df["HDL"]

In [12]:
df = df[df["smoking"]==1]

In [23]:
df = df[df["smoking"]==0]

In [57]:
df = df[df["gender"]==0]
df_check = df_check[df_check["gender"]==0]