In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw CSV export into the working DataFrame.
raw_csv_path = "human_vital_signs_dataset_2024.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Remove leading/trailing whitespace from every column header so later
# lookups by name (e.g. 'Risk Category') match exactly.
df.columns = [label.strip() for label in df.columns]


In [4]:
# Columns removed before any further processing.
DROP_COLS = ['Patient ID', 'Timestamp']

# Drop them from the frame in place (raises KeyError if either is absent).
df.drop(labels=DROP_COLS, axis="columns", inplace=True)


In [5]:
# Encode the text label as 0/1.
risk_codes = {'low risk': 0, 'high risk': 1}

# Normalise first: cast to str, lower-case, then trim surrounding whitespace.
risk_text = df['Risk Category'].astype(str).str.lower().str.strip()

# Any normalised value that is not a key of risk_codes becomes NaN.
df['Risk Category'] = risk_text.map(risk_codes)


In [6]:
# Encode Gender as 0/1.
gender_codes = {'male': 0, 'female': 1}

# Normalise first: cast to str, lower-case, then trim surrounding whitespace.
gender_text = df['Gender'].astype(str).str.lower().str.strip()

# Any normalised value that is not a key of gender_codes becomes NaN.
df['Gender'] = gender_text.map(gender_codes)


In [7]:
# Sanity check: list the distinct encoded Gender values.
gender_values = df['Gender'].unique()
print(gender_values)


[1 0]


In [8]:
# Every column except the label is treated as a numeric feature.
num_cols = df.columns.drop('Risk Category')

# Convert each feature column to a numeric dtype; values that cannot be
# parsed are turned into NaN (errors='coerce') rather than raising.
coerced_features = df[num_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
df[num_cols] = coerced_features


In [9]:
# Impute missing values according to each column's role.
#
# A blanket df.fillna(df.mean()) would also "impute" the binary label and the
# 0/1-encoded Gender column with their column means, producing fractional
# values that are not valid classes. When nothing is missing this cell leaves
# the data unchanged, exactly like the blanket fill did.
target_col = 'Risk Category'
gender_col = 'Gender'

# Rows without a usable label are dropped instead of being given an invented
# one. .copy() detaches the result so the assignments below write to an
# independent frame.
df = df.dropna(subset=[target_col]).copy()
df[target_col] = df[target_col].astype('int64')

# Gender: fill with the most frequent code so values stay within the encoded
# set. If the column is entirely missing there is no mode; leave it untouched.
gender_modes = df[gender_col].mode(dropna=True)
if not gender_modes.empty:
    df[gender_col] = df[gender_col].fillna(gender_modes.iloc[0]).astype('int64')

# Remaining feature columns: per-column mean imputation, as before.
continuous_cols = [c for c in df.columns if c not in (target_col, gender_col)]
continuous_means = df[continuous_cols].mean(numeric_only=True)
df[continuous_cols] = df[continuous_cols].fillna(continuous_means)


In [10]:
# Total number of missing cells across the whole frame
# (per-column counts first, then summed); displayed as the cell output.
missing_per_column = df.isna().sum()
missing_per_column.sum()


np.int64(0)

In [11]:
# Keep only rows whose vitals lie within the bounds below. Series.between is
# inclusive on both ends by default, so the boundary values are retained.
heart_rate_ok = df['Heart Rate'].between(40, 180)
oxygen_ok = df['Oxygen Saturation'].between(70, 100)
temperature_ok = df['Body Temperature'].between(34, 42)

# A row must satisfy all three checks to survive.
df = df[heart_rate_ok & oxygen_ok & temperature_ok]


In [12]:
# Final verification before export: column dtypes and the total number of
# missing cells remaining in the frame.
column_dtypes = df.dtypes
total_missing = df.isna().sum().sum()
print(column_dtypes)
print(total_missing)


Heart Rate                    int64
Respiratory Rate              int64
Body Temperature            float64
Oxygen Saturation           float64
Systolic Blood Pressure       int64
Diastolic Blood Pressure      int64
Age                           int64
Gender                        int64
Weight (kg)                 float64
Height (m)                  float64
Derived_HRV                 float64
Derived_Pulse_Pressure        int64
Derived_BMI                 float64
Derived_MAP                 float64
Risk Category                 int64
dtype: object
0


In [13]:
# Write the cleaned frame to disk; index=False omits the row index column.
cleaned_csv_path = "cleaned_dataset.csv"
df.to_csv(cleaned_csv_path, index=False)
