In [7]:
import os

import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split

# Download the latest version of the dataset; the returned path is used as
# the directory that contains smoking.csv.
path = kagglehub.dataset_download("kukuroo3/body-signal-of-smoking")

# Join the path components with os.path.join instead of hard-coding "/".
df = pd.read_csv(os.path.join(path, "smoking.csv"))
df = df.drop(columns=["oral", "ID"])

# Encode the two-valued text columns as integers.
# NOTE(review): tartar maps "Y" -> 0 and "N" -> 1, so tartar == 1 means
# "no tartar" — confirm this inversion is intended before relying on it.
df["tartar"] = df["tartar"].map({"Y": 0, "N": 1})
df["gender"] = df["gender"].map({"F": 0, "M": 1})

# Series.map yields NaN for any value that is not a key of the mapping.
# Fail loudly here rather than carrying NaNs into the exported CSV files.
assert df[["tartar", "gender"]].notna().all().all(), (
    "missing or unmapped value in 'tartar'/'gender'"
)

In [8]:
# Hold out a 5000-row check set, stratified on the smoking label, and keep
# working with the remainder.
# NOTE: `df` is overwritten with the remainder, so re-running this cell
# without re-running the load cell splits off a further 5000 rows.
smoking_labels = df["smoking"]
df_work, df_check = train_test_split(
    df,
    test_size=5000,
    random_state=5510,
    stratify=smoking_labels,
)
df = df_work

In [9]:
# Equalise smokers per gender: within each gender, oversample (with
# replacement) so both smoking classes have the size of the larger class.


def _resample_per_class(
    frame: pd.DataFrame, label_col: str, n: int, seed: int
) -> pd.DataFrame:
    """Draw `n` rows with replacement from every `label_col` class of `frame`.

    Returns a new frame with a fresh 0..N-1 index; `frame` is not modified.
    """
    return (
        frame.groupby(label_col)
        .sample(n=n, replace=True, random_state=seed)
        .reset_index(drop=True)
    )


df_f = df[df["gender"] == 0]
df_m = df[df["gender"] == 1]

# Size of the largest smoking class within each gender. Despite the names,
# this is the majority-class size, not specifically the smoker count.
f_smoke = df_f["smoking"].value_counts().max()
m_smoke = df_m["smoking"].value_counts().max()

# Same seed for both genders, as in the original duplicated code.
df_f_equal = _resample_per_class(df_f, "smoking", f_smoke, seed=5510)
df_m_equal = _resample_per_class(df_m, "smoking", m_smoke, seed=5510)

df = pd.concat([df_f_equal, df_m_equal], axis=0).reset_index(drop=True)

In [10]:
# Equalise gender while keeping the smoking classes balanced inside each
# gender.
#
# Sampling by "gender" alone draws rows without regard to "smoking", so the
# smoker / non-smoker ratio within each gender would only be approximately
# preserved. Sampling per (gender, smoking) cell keeps both balances exact
# and yields the same total number of rows when every gender has the same
# number of smoking classes.
gender_count = df["gender"].value_counts().max()

# Rows to draw for each (gender, smoking) cell so that every gender ends up
# with (about) `gender_count` rows.
per_cell_count = gender_count // df["smoking"].nunique()

df_gender_equal = (
    df.groupby(["gender", "smoking"])
        .sample(n=per_cell_count, replace=True, random_state=1)
        .reset_index(drop=True)
)

# Guard the invariant this cell exists for: every gender has the same size.
# This fails if some gender lacks one of the smoking classes.
assert df_gender_equal["gender"].value_counts().nunique() == 1, (
    "gender groups are not the same size after resampling"
)

df = df_gender_equal

In [11]:
# Write the working frame and the held-out check set to CSV (no index column).

# to_csv does not create missing parent folders, so make sure the output
# directory exists; this keeps a fresh checkout / Run All from failing here.
os.makedirs("./dataset", exist_ok=True)

df.to_csv("./dataset/dataset.csv", index=False)

df_check.to_csv("./dataset/check.csv", index=False)

In [35]:
# Per-column standard deviation of whatever `df` currently holds.
# NOTE(review): the recorded output shows std of about 0.48 for both gender
# and smoking, whereas an evenly split 0/1 column has std of about 0.5. The
# output therefore seems to come from a different `df` than the one the
# balancing cells produce — re-run the notebook in order to confirm.
column_std = df.std()
print(column_std)

gender                  0.481250
age                    12.071418
height(cm)              9.194597
weight(kg)             12.820306
waist(cm)               9.274223
eyesight(left)          0.486873
eyesight(right)         0.485964
hearing(left)           0.157902
hearing(right)          0.159564
systolic               13.675989
relaxation              9.679278
fasting blood sugar    20.795591
Cholesterol            36.297940
triglyceride           71.639817
HDL                    14.738963
LDL                    40.926476
hemoglobin              1.564498
Urine protein           0.404882
serum creatinine        0.221524
AST                    19.355460
ALT                    30.947853
Gtp                    50.290539
dental caries           0.409665
tartar                  0.496908
smoking                 0.482070
dtype: float64


In [32]:
# Per-column mean of whatever `df` currently holds.
# NOTE(review): the recorded output has a smoking mean of exactly 1.0, so it
# was taken after `df` had been narrowed to smokers only (the
# `df[df["smoking"]==1]` cell), not on the full working set.
column_mean = df.mean()
print(column_mean)

gender                   0.958005
age                     41.607431
height(cm)             169.436324
weight(kg)              70.959668
waist(cm)               84.800518
eyesight(left)           1.051733
eyesight(right)          1.047636
hearing(left)            1.020777
hearing(right)           1.022195
systolic               122.806502
relaxation              77.380787
fasting blood sugar    102.049328
Cholesterol            195.541384
triglyceride           150.341481
HDL                     53.837888
LDL                    112.535468
hemoglobin              15.445339
Urine protein            1.094793
serum creatinine         0.948775
AST                     27.688194
ALT                     30.989782
Gtp                     55.570423
dental caries            0.269176
tartar                   0.380103
smoking                  1.000000
dtype: float64


In [3]:
# Keep only the smoker rows (smoking == 1) of the current frame.
# NOTE(review): this overwrites `df`. Running the non-smoker filter cell
# afterwards on the same `df` leaves an empty frame, so only one of the two
# filter cells can be run per pass.
is_smoker = df["smoking"] == 1
df = df.loc[is_smoker]

In [4]:
# Keep only the non-smoker rows (smoking == 0) of the current frame.
# NOTE(review): this overwrites `df`. If `df` was already narrowed to smokers
# by the smokers filter cell, the result here is an empty frame.
is_non_smoker = df["smoking"] == 0
df = df.loc[is_non_smoker]

In [5]:
#introduce blood pressure difference