In [10]:
from pathlib import Path

import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split

# Download latest version
path = kagglehub.dataset_download("kukuroo3/body-signal-of-smoking")

df = pd.read_csv(path+"/smoking.csv")
df = df.drop(columns = ["oral", "ID"])
df["tartar"] = df["tartar"].map({"Y": 0, "N": 1})
df["gender"] = df["gender"].map({"F": 0, "M": 1})



In [11]:
# split off check set
df_work, df_check = train_test_split(
    df, 
    test_size=5000,
    stratify=df["smoking"],
    random_state=5510
)
df = df_work

In [7]:
def equalise_set(dataframe):

    #equalise smokers per gender
    
    df_f = dataframe[dataframe["gender"] == 0]
    df_m = dataframe[dataframe["gender"] == 1]
    
    
    f_smoke = df_f["smoking"].value_counts().max()
    
    df_f_equal = (
        df_f.groupby("smoking")
            .sample(n=f_smoke, replace=True, random_state=5510)
            .reset_index(drop=True)
    )
    
    
    m_smoke = df_m["smoking"].value_counts().max()
    
    df_m_equal = (
        df_m.groupby("smoking")
            .sample(n=m_smoke, replace=True, random_state=5510)
            .reset_index(drop=True)
    )
    
    dataframe = pd.concat([df_f_equal, df_m_equal], axis=0).reset_index(drop=True)
    
    #equalise gender
    
    gender_count = dataframe["gender"].value_counts().max()
    
    df_gender_equal = (
        df.groupby("gender")
            .sample(n=gender_count, replace=True, random_state=1)
            .reset_index(drop=True)
    )
    
    return dataframe
df = equalise_set(df)
#df_check = equalise_set(df_check)

In [14]:
# print to csv
# call this to print a final dataset 

df.to_csv("./dataset/dataset.csv", index=False)

df_check.to_csv("./dataset/check.csv", index=False)

In [3]:
print(df.std())

gender                  0.481349
age                    12.085964
height(cm)              9.210748
weight(kg)             12.832531
waist(cm)               9.274001
eyesight(left)          0.484467
eyesight(right)         0.483538
hearing(left)           0.158076
hearing(right)          0.159841
systolic               13.678203
relaxation              9.680114
fasting blood sugar    20.691820
Cholesterol            36.353188
triglyceride           71.423623
HDL                    14.794106
LDL                    40.444997
hemoglobin              1.567223
Urine protein           0.404194
serum creatinine        0.222427
AST                    19.165102
ALT                    31.396169
Gtp                    50.583119
dental caries           0.409038
tartar                  0.496894
smoking                 0.482073
dtype: float64


In [13]:
print(df.mean())

gender                   0.000000
age                     48.846216
height(cm)             155.607183
weight(kg)              56.083464
waist(cm)               76.917299
eyesight(left)           0.928160
eyesight(right)          0.926586
hearing(left)            1.027587
hearing(right)           1.028885
systolic               118.463894
relaxation              73.733867
fasting blood sugar     96.696814
Cholesterol            200.978904
triglyceride           103.847785
HDL                     63.285011
LDL                    117.165684
hemoglobin              13.168026
Urine protein            1.078974
serum creatinine         0.736977
AST                     23.778277
ALT                     20.238221
Gtp                     24.252502
dental caries            0.166333
tartar                   0.480067
smoking                  0.042679
dtype: float64


In [42]:
#introduce blood pressure difference
df["pulse_pressure"] = df["systolic"] - df["relaxation"]

In [43]:
#introduce MAP (mean arterial pressure)
df["map"] = df["relaxation"] + (df["systolic"] - df["relaxation"]) / 3

In [5]:
#introduce BMI and deurenberg formula
df["BMI"] = df["weight(kg)"] / (df["height(cm)"] * df["height(cm)"] / 10000)

df["Deurenberg"] = (1.2*df["BMI"]) + (0.23 * df["age"]) - (10.8 * df["gender"]) - 5.4

In [19]:
import numpy as np

#triglycerides feature engineering
df["log_triglyceride"] = np.log1p(df["triglyceride"])

df["tri_hdl_ratio"] = df["triglyceride"] / df["HDL"]
df["cho_hdl_ratio"] = df["Cholesterol"] / df["HDL"]

df["ldl_hdl_ratio"] = df["LDL"] / df["HDL"]

In [63]:
# "non-hdl" feature
df["non_HDL"] = df["Cholesterol"] - df["HDL"]

In [12]:
df = df[df["smoking"]==1]

In [23]:
df = df[df["smoking"]==0]

In [12]:
df = df[df["gender"]==0]
df_check = df_check[df_check["gender"]==0]

In [8]:
pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.2-py2.py3-none-any.whl.metadata (3.6 kB)
Downloading statsmodels-0.14.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (10.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m279.0 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading patsy-1.0.2-py2.py3-none-any.whl (233 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.3/233.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m[31m1.3 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: patsy, statsmodels
Successfully installed patsy-1.0.2 statsmodels-0.14.6
Note: you may need to restart the kernel to use updated packages.
