In [17]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

In [19]:
# Age bands keyed by display label. Each value is a (lower, upper) pair in
# months; map_age_group tests membership as lower <= age < upper.
# NOTE(review): "Premie" is (0, 0), which that half-open test can never match,
# so no age in months is ever mapped to it — confirm whether this is intended.
age_groups = dict([
    ("Premie", (0, 0)),
    ("0-3 mo", (0, 3)),
    ("3-6 mo", (3, 6)),
    ("6-12 mo", (6, 12)),
    ("1-3 yr", (12, 36)),
    ("3-6 yr", (36, 72)),
    ("6-12 yr", (72, 144)),
    (">12 yr", (144, float('inf'))),
])

# Reference ranges per age band, stored as (low, high) pairs that
# classify_health_status treats as inclusive. "HR" is compared with
# Heart_Rate_BPM, "Temp" with Body_Temperature_C and "Oxygen" with
# Oxygen_Level. Only the heart-rate range differs between bands; the
# temperature and oxygen ranges are the same for every band.
normal_ranges = {
    band: {"HR": hr_range, "Temp": (36.1, 37.5), "Oxygen": (94, 100)}
    for band, hr_range in (
        ("Premie", (120, 170)),
        ("0-3 mo", (100, 150)),
        ("3-6 mo", (90, 120)),
        ("6-12 mo", (80, 120)),
        ("1-3 yr", (70, 110)),
        ("3-6 yr", (65, 110)),
        ("6-12 yr", (60, 95)),
        (">12 yr", (55, 85)),
    )
}

def map_age_group(age_months):
    """Return the ``age_groups`` label whose month range contains ``age_months``.

    Every entry of ``age_groups`` is read as the half-open range
    ``lower <= age_months < upper``; entries are tried in table order and the
    first match wins.

    Args:
        age_months: Age in months (int or float).

    Returns:
        The matching label, or ">12 yr" when the age lies beyond every range
        in the table (for example an infinite age, which the open upper bound
        of the last range excludes).

    Raises:
        ValueError: If ``age_months`` is negative or NaN. Such values match no
            range, and letting them reach the ">12 yr" fallback would silently
            mislabel invalid input as the oldest band.
    """
    # `not (x >= 0)` is true for negatives and also for NaN, because every
    # ordered comparison against NaN evaluates to False.
    if not age_months >= 0:
        raise ValueError(
            f"age_months must be a non-negative number, got {age_months!r}"
        )

    for group, (min_age, max_age) in age_groups.items():
        if min_age <= age_months < max_age:
            return group
    return ">12 yr"

def classify_health_status(row):
    """Label one record "Normal" or "Abnormal" against its age band's ranges.

    The age band is looked up with ``map_age_group(row["Age_Months"])`` and
    its entry in ``normal_ranges`` supplies an inclusive (low, high) range for
    each vital sign.

    Args:
        row: A record indexable by column name (such as a DataFrame row passed
            by ``DataFrame.apply(..., axis=1)``) providing "Age_Months",
            "Heart_Rate_BPM", "Body_Temperature_C" and "Oxygen_Level".

    Returns:
        "Normal" when all three readings lie inside their ranges, otherwise
        "Abnormal". A NaN reading counts as outside its range.
    """
    ranges = normal_ranges[map_age_group(row["Age_Months"])]

    # (column in the record, key in the per-band range table)
    checks = (
        ("Heart_Rate_BPM", "HR"),
        ("Body_Temperature_C", "Temp"),
        ("Oxygen_Level", "Oxygen"),
    )
    for column, range_key in checks:
        low, high = ranges[range_key]
        # Phrased as "not inside the range" rather than "below or above it":
        # both forms agree for real numbers, but a NaN fails this test,
        # whereas `x < low or x > high` is False for NaN and would let a
        # missing reading through as Normal.
        if not (low <= row[column] <= high):
            return "Abnormal"

    return "Normal"

# Build the synthetic readings. Seeding NumPy's global generator makes the
# draws repeatable; the four np.random calls below must stay in this order to
# keep producing the same values.
np.random.seed(42)
sample_size = 5000

# randint excludes its upper bound, so ages are 0-199, heart rates 50-179 and
# oxygen levels 85-99. Temperatures are drawn uniformly from [35, 40) and then
# rounded to one decimal place.
original_data = pd.DataFrame({
    "Age_Months": np.random.randint(low=0, high=200, size=sample_size),
    "Heart_Rate_BPM": np.random.randint(low=50, high=180, size=sample_size),
    "Body_Temperature_C": np.random.uniform(low=35, high=40, size=sample_size).round(1),
    "Oxygen_Level": np.random.randint(low=85, high=100, size=sample_size),
})

# Label every record by running the classifier row by row.
original_data = original_data.assign(
    Health_Status=original_data.apply(classify_health_status, axis=1)
)

# Balance the classes and fix the dataset size in a single step: draw half of
# sample_size from each Health_Status label. Sampling each class separately
# (instead of re-sampling an already combined frame) keeps the split exactly
# even and avoids duplicating rows of a class that already has enough of them.
normal_count = (original_data["Health_Status"] == "Normal").sum()
abnormal_count = (original_data["Health_Status"] == "Abnormal").sum()

if normal_count == 0 or abnormal_count == 0:
    # An empty class cannot be oversampled; fail with a clear message.
    raise ValueError(
        "Cannot balance the dataset: need at least one 'Normal' and one "
        f"'Abnormal' row (got {normal_count} and {abnormal_count})."
    )

class_counts = {"Normal": normal_count, "Abnormal": abnormal_count}
rows_per_class = {
    "Normal": sample_size // 2,
    "Abnormal": sample_size - sample_size // 2,  # takes the extra row if odd
}

# Sample with replacement only when a class has fewer rows than requested.
class_samples = [
    resample(
        original_data[original_data["Health_Status"] == label],
        replace=class_counts[label] < n_rows,
        n_samples=n_rows,
        random_state=42,
    )
    for label, n_rows in rows_per_class.items()
]

# Shuffle so the two classes are interleaved rather than stacked.
balanced_data = (
    pd.concat(class_samples)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Guard the two properties the saved file is meant to have.
assert len(balanced_data) == sample_size
assert balanced_data["Health_Status"].value_counts().to_dict() == rows_per_class

# Save dataset
balanced_data.to_csv("balanced_sensor_data.csv", index=False)

print(f"✅ Balanced dataset with exactly {len(balanced_data)} rows saved as 'balanced_sensor_data.csv'.")

✅ Balanced dataset with exactly 5000 rows saved as 'balanced_sensor_data.csv'.
