In [44]:
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split

# Download latest version
path = kagglehub.dataset_download("kukuroo3/body-signal-of-smoking")

df = pd.read_csv(path+"/smoking.csv")
df = df.drop(columns = ["oral", "ID"])
df["tartar"] = df["tartar"].map({"Y": 1, "N": 0})
df["gender"] = df["gender"].map({"F": -1, "M": 1})

In [2]:
# grab non-smokers of current set
df = df[df["smoking"]==0]

In [30]:
#grab smokers from current set
df = df[df["smoking"]==1]

In [45]:
# split off check set
df_work, df_check = train_test_split(
    df, 
    test_size=5000,
    stratify=df["smoking"],
    random_state=5510
)
df = df_work

In [46]:
#equalise smokers per gender

df_f = df[df["gender"] == -1]
df_m = df[df["gender"] == 1]


f_smoke = df_f["smoking"].value_counts().max()

df_f_equal = (
    df_f.groupby("smoking")
        .sample(n=f_smoke, replace=True, random_state=5510)
        .reset_index(drop=True)
)


m_smoke = df_m["smoking"].value_counts().max()

df_m_equal = (
    df_m.groupby("smoking")
        .sample(n=m_smoke, replace=True, random_state=5510)
        .reset_index(drop=True)
)

df = pd.concat([df_f_equal, df_m_equal], axis=0).reset_index(drop=True)

In [47]:
#equalise gender

gender_count = df["gender"].value_counts().max()

df_gender_equal = (
    df.groupby("gender")
        .sample(n=gender_count, replace=True, random_state=1)
        .reset_index(drop=True)
)

df = df_gender_equal

In [48]:
# print to csv


df.to_csv("./dataset/dataset.csv", index=False)

df_check.to_csv("./dataset/check.csv", index=False)

In [41]:
print(df.mean())

gender                   0.271314
age                     44.182917
height(cm)             164.649321
weight(kg)              65.864936
waist(cm)               82.046418
eyesight(left)           1.012623
eyesight(right)          1.007443
hearing(left)            1.025587
hearing(right)           1.026144
systolic               121.494218
relaxation              76.004830
fasting blood sugar     99.312325
Cholesterol            196.901422
triglyceride           126.665697
HDL                     57.290347
LDL                    114.964501
hemoglobin              14.622592
Urine protein            1.087212
serum creatinine         0.885738
AST                     26.182935
ALT                     27.036037
Gtp                     39.952201
dental caries            0.213334
tartar                   0.555556
smoking                  0.367288
dtype: float64
