In [15]:
from pathlib import Path

import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split

# Download latest version
path = kagglehub.dataset_download("kukuroo3/body-signal-of-smoking")

df = pd.read_csv(path+"/smoking.csv")
df = df.drop(columns = ["oral", "ID"])
df["tartar"] = df["tartar"].map({"Y": 0, "N": 1})
df["gender"] = df["gender"].map({"F": 0, "M": 1})

In [2]:
# grab non-smokers of current set
df = df[df["smoking"]==0]

In [3]:
#grab smokers from current set
df = df[df["smoking"]==1]

In [16]:
# split off check set
df_work, df_check = train_test_split(
    df, 
    test_size=5000,
    stratify=df["smoking"],
    random_state=5510
)
df = df_work

In [17]:
#equalise smokers per gender

df_f = df[df["gender"] == 0]
df_m = df[df["gender"] == 1]


f_smoke = df_f["smoking"].value_counts().max()

df_f_equal = (
    df_f.groupby("smoking")
        .sample(n=f_smoke, replace=True, random_state=5510)
        .reset_index(drop=True)
)


m_smoke = df_m["smoking"].value_counts().max()

df_m_equal = (
    df_m.groupby("smoking")
        .sample(n=m_smoke, replace=True, random_state=5510)
        .reset_index(drop=True)
)

df = pd.concat([df_f_equal, df_m_equal], axis=0).reset_index(drop=True)

In [18]:
#equalise gender

gender_count = df["gender"].value_counts().max()

df_gender_equal = (
    df.groupby("gender")
        .sample(n=gender_count, replace=True, random_state=1)
        .reset_index(drop=True)
)

df = df_gender_equal

In [19]:
# print to csv


df.to_csv("./dataset/dataset.csv", index=False)

df_check.to_csv("./dataset/check.csv", index=False)

In [20]:
print(df.std())

gender                  0.500004
age                    11.363810
height(cm)              9.130579
weight(kg)             12.851763
waist(cm)               9.537448
eyesight(left)          0.456500
eyesight(right)         0.465899
hearing(left)           0.158243
hearing(right)          0.164252
systolic               13.935452
relaxation              9.816767
fasting blood sugar    20.952065
Cholesterol            36.279662
triglyceride           71.368378
HDL                    15.108167
LDL                    40.448983
hemoglobin              1.542676
Urine protein           0.405945
serum creatinine        0.212197
AST                    15.310533
ALT                    28.591552
Gtp                    48.527413
dental caries           0.411883
tartar                  0.494702
smoking                 0.499996
dtype: float64
