# Goal : To predict Diabetes using only Apple watch data



In [1]:
import pandas as pd
df = pd.read_xml("/Users/nilvaghela/Desktop/Research/Diabetes/Data/export.xml")

In [2]:
df.head(-1)

Unnamed: 0,value,HKCharacteristicTypeIdentifierDateOfBirth,HKCharacteristicTypeIdentifierBiologicalSex,HKCharacteristicTypeIdentifierBloodType,HKCharacteristicTypeIdentifierFitzpatrickSkinType,HKCharacteristicTypeIdentifierCardioFitnessMedicationsUse,type,sourceName,sourceVersion,unit,...,activeEnergyBurned,activeEnergyBurnedGoal,activeEnergyBurnedUnit,appleMoveTime,appleMoveTimeGoal,appleExerciseTime,appleExerciseTimeGoal,appleStandHours,appleStandHoursGoal,HeartRateVariabilityMetadataList
0,2025-11-28 22:21:13 -0500,,,,,,,,,,...,,,,,,,,,,
1,,1999-04-06,HKBiologicalSexMale,HKBloodTypeNotSet,HKFitzpatrickSkinTypeNotSet,,,,,,...,,,,,,,,,,
2,250,,,,,,HKQuantityTypeIdentifierDietaryWater,Diet,530,mL,...,,,,,,,,,,
3,5.51181,,,,,,HKQuantityTypeIdentifierHeight,iPhone,12.3.1,ft,...,,,,,,,,,,
4,5.18373,,,,,,HKQuantityTypeIdentifierHeight,Nilrajsinh vaghela’s iPhone,12.3,ft,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2617045,44.7817,,,,,,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Nilrajsinh’s Apple Watch,26.1,ms,...,,,,,,,,,,\n
2617046,26.0156,,,,,,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Nilrajsinh’s Apple Watch,26.1,ms,...,,,,,,,,,,\n
2617047,18.7718,,,,,,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Nilrajsinh’s Apple Watch,26.1,ms,...,,,,,,,,,,\n
2617048,26.0455,,,,,,HKQuantityTypeIdentifierHeartRateVariabilitySDNN,Nilrajsinh’s Apple Watch,26.1,ms,...,,,,,,,,,,\n


In [3]:
from pathlib import Path

test_path = Path("/Users/nilvaghela/Desktop/Research/Diabetes/Data/export.xml")
print("XML exists?:", test_path.exists())

folder_path = Path("/Users/nilvaghela/Desktop/Research/Diabetes/Data/")
print("Folder exists?:", folder_path.exists())


XML exists?: True
Folder exists?: True


In [4]:
import xml.etree.ElementTree as ET
import csv
from pathlib import Path

xml_path = Path("/Users/nilvaghela/Desktop/Research/Diabetes/Data/export.xml")
csv_path = xml_path.with_name("health_records.csv")

print("Reading:", xml_path)
print("Writing:", csv_path)

# Start with empty header
fieldnames = set()

rows_buffer = []

# First pass: collect ALL fieldnames
for event, elem in ET.iterparse(xml_path):
    if elem.tag == "Record":
        for key in elem.attrib.keys():
            fieldnames.add(key)
        rows_buffer.append(elem.attrib.copy())  # store row
        elem.clear()

fieldnames = sorted(list(fieldnames))
print("Columns found:", fieldnames)

# Second pass: write CSV with complete header
with open(csv_path, "w", newline="") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    for row in rows_buffer:
        writer.writerow(row)

print("DONE — health_records.csv created!")
print("Path:", csv_path)


Reading: /Users/nilvaghela/Desktop/Research/Diabetes/Data/export.xml
Writing: /Users/nilvaghela/Desktop/Research/Diabetes/Data/health_records.csv
Columns found: ['creationDate', 'device', 'endDate', 'sourceName', 'sourceVersion', 'startDate', 'type', 'unit', 'value']
DONE — health_records.csv created!
Path: /Users/nilvaghela/Desktop/Research/Diabetes/Data/health_records.csv


In [5]:
import pandas as pd

df = pd.read_csv("/Users/nilvaghela/Desktop/Research/Diabetes/Data/health_records.csv")
df.head()
df.columns


  df = pd.read_csv("/Users/nilvaghela/Desktop/Research/Diabetes/Data/health_records.csv")


Index(['creationDate', 'device', 'endDate', 'sourceName', 'sourceVersion',
       'startDate', 'type', 'unit', 'value'],
      dtype='object')

In [6]:
import pandas as pd
import numpy as np
from pathlib import Path

# ---------- Paths ----------
base_dir = Path("/Users/nilvaghela/Desktop/Research/Diabetes/Data")
records_path = base_dir / "health_records.csv"

print("Loading:", records_path)
df = pd.read_csv(records_path)

# Ensure datetime
df["startDate"] = pd.to_datetime(df["startDate"])
df["endDate"]   = pd.to_datetime(df["endDate"], errors="ignore")

# ---------- Helper to extract a single metric ----------
def extract(type_id, col_name, unit_filter=None):
    """
    Extract one metric from health_records by HealthKit 'type' (and optional 'unit'),
    returning a DataFrame with columns: ['startDate', col_name].
    """
    temp = df[df["type"] == type_id].copy()
    if temp.empty:
        print(f"[WARN] No rows for type: {type_id}")
        return pd.DataFrame(columns=["startDate", col_name])

    if unit_filter is not None:
        temp = temp[temp["unit"] == unit_filter].copy()
        if temp.empty:
            print(f"[WARN] No rows for type: {type_id} with unit={unit_filter}")
            return pd.DataFrame(columns=["startDate", col_name])

    temp = temp[["startDate", "value"]].copy()
    temp.rename(columns={"value": col_name}, inplace=True)
    temp[col_name] = pd.to_numeric(temp[col_name], errors="coerce")
    return temp


# ---------- Core metrics you already had ----------
heart_rate = extract("HKQuantityTypeIdentifierHeartRate", "heart_rate", "count/min")
hrv        = extract("HKQuantityTypeIdentifierHeartRateVariabilitySDNN", "hrv", "ms")
steps      = extract("HKQuantityTypeIdentifierStepCount", "steps", "count")
resting_hr = extract("HKQuantityTypeIdentifierRestingHeartRate", "resting_hr", "count/min")
vo2        = extract("HKQuantityTypeIdentifierVO2Max", "vo2max", "mL/min·kg")

# ---------- Additional high-value metrics ----------
active_energy = extract("HKQuantityTypeIdentifierActiveEnergyBurned", "active_energy_kcal", "kcal")
basal_energy  = extract("HKQuantityTypeIdentifierBasalEnergyBurned", "basal_energy_kcal", "kcal")

distance_walk_run = extract("HKQuantityTypeIdentifierDistanceWalkingRunning",
                            "distance_walk_run_m", None)  # unit often "m"

flights = extract("HKQuantityTypeIdentifierFlightsClimbed", "flights", "count")

exercise_time = extract("HKQuantityTypeIdentifierAppleExerciseTime",
                        "exercise_minutes", "min")

stand_time = extract("HKQuantityTypeIdentifierAppleStandTime",
                     "stand_minutes", "min")

resp_rate = extract("HKQuantityTypeIdentifierRespiratoryRate",
                    "respiratory_rate", "count/min")

spo2 = extract("HKQuantityTypeIdentifierOxygenSaturation",
               "spo2_percent", "%")

wrist_temp = extract("HKQuantityTypeIdentifierBodyTemperature",
                     "wrist_temp_degC", "degC")


# ---------- Sleep (special handling) ----------
sleep = df[df["type"] == "HKCategoryTypeIdentifierSleepAnalysis"].copy()
if not sleep.empty:
    sleep["sleep_minutes"] = (
        pd.to_datetime(sleep["endDate"]) - pd.to_datetime(sleep["startDate"])
    ).dt.total_seconds() / 60
    sleep = sleep[["startDate", "sleep_minutes"]].rename(columns={"startDate": "timestamp"})
else:
    sleep = pd.DataFrame(columns=["timestamp", "sleep_minutes"])


# ---------- Merge everything into ONE DataFrame ----------
# Start with heart_rate as base (if empty, we fallback later)
metric_dfs = [
    heart_rate, hrv, steps, resting_hr, vo2,
    active_energy, basal_energy,
    distance_walk_run, flights,
    exercise_time, stand_time,
    resp_rate, spo2, wrist_temp
]

# Filter out completely empty ones
metric_dfs = [m for m in metric_dfs if not m.empty]

if not metric_dfs:
    raise ValueError("No metric dataframes found; check your health_records types.")

merged = metric_dfs[0]
for m in metric_dfs[1:]:
    merged = merged.merge(m, on="startDate", how="outer")

# Add sleep by merging on timestamp (same as startDate)
merged = merged.sort_values("startDate")
merged = merged.rename(columns={"startDate": "timestamp"})

# Merge sleep (outer join on timestamp)
merged = merged.merge(sleep, on="timestamp", how="left")

# ---------- Clean and save ----------
# Optional: forward/back fill to avoid big NaN gaps within sessions
merged = merged.sort_values("timestamp")
merged = merged.ffill().bfill()

out_path = base_dir / "apple_watch_combined_full.csv"
merged.to_csv(out_path, index=False)

print("\nCombined full CSV saved at:")
print(out_path)
print("\nColumns in final dataset:")
print(merged.columns.tolist())
print("\nPreview:")
print(merged.head())


Loading: /Users/nilvaghela/Desktop/Research/Diabetes/Data/health_records.csv


  df = pd.read_csv(records_path)
  df["endDate"]   = pd.to_datetime(df["endDate"], errors="ignore")


[WARN] No rows for type: HKQuantityTypeIdentifierActiveEnergyBurned with unit=kcal
[WARN] No rows for type: HKQuantityTypeIdentifierBasalEnergyBurned with unit=kcal
[WARN] No rows for type: HKQuantityTypeIdentifierBodyTemperature

Combined full CSV saved at:
/Users/nilvaghela/Desktop/Research/Diabetes/Data/apple_watch_combined_full.csv

Columns in final dataset:
['timestamp', 'heart_rate', 'hrv', 'steps', 'resting_hr', 'vo2max', 'distance_walk_run_m', 'flights', 'exercise_minutes', 'stand_minutes', 'respiratory_rate', 'spo2_percent', 'sleep_minutes']

Preview:
                  timestamp  heart_rate      hrv  steps  resting_hr  vo2max  \
0 2018-02-10 05:37:30-05:00        79.0  132.015   13.0        70.0   34.25   
1 2018-02-10 06:34:33-05:00        79.0  132.015  355.0        70.0   34.25   
2 2018-02-10 06:44:32-05:00        79.0  132.015  157.0        70.0   34.25   
3 2018-02-10 06:54:19-05:00        79.0  132.015   24.0        70.0   34.25   
4 2018-02-10 07:07:11-05:00        79.

In [7]:
import pandas as pd
from pathlib import Path

base_dir = Path("/Users/nilvaghela/Desktop/Research/Diabetes/Data")
records_path = base_dir / "health_records.csv"
df = pd.read_csv(records_path)

# Look at all distinct types
print("Unique types containing 'Energy':")
print(df[df["type"].str.contains("Energy", na=False)][["type", "unit"]].drop_duplicates())

print("\nUnique types containing 'Temperature':")
print(df[df["type"].str.contains("Temperature", na=False)][["type", "unit"]].drop_duplicates())


  df = pd.read_csv(records_path)


Unique types containing 'Energy':
                                               type unit
835842    HKQuantityTypeIdentifierBasalEnergyBurned  Cal
1166078  HKQuantityTypeIdentifierActiveEnergyBurned  Cal

Unique types containing 'Temperature':
                                                      type  unit
2374199  HKQuantityTypeIdentifierAppleSleepingWristTemp...  degF


In [8]:
import pandas as pd
from pathlib import Path

base_dir = Path("/Users/nilvaghela/Desktop/Research/Diabetes/Data")
records_path = base_dir / "health_records.csv"

df = pd.read_csv(records_path, low_memory=False)


In [9]:
import pandas as pd
import numpy as np
from pathlib import Path

# ---------- Paths ----------
base_dir = Path("/Users/nilvaghela/Desktop/Research/Diabetes/Data")
records_path = base_dir / "health_records.csv"

print("Loading:", records_path)
df = pd.read_csv(records_path)

# Ensure datetime
df["startDate"] = pd.to_datetime(df["startDate"])
df["endDate"]   = pd.to_datetime(df["endDate"], errors="ignore")


# ---------- Helper to extract a single numeric metric ----------
def extract(type_id, col_name, unit_filter=None):
    """
    Extract one numeric metric from health_records by HealthKit 'type'
    (and optional 'unit'), returning a DataFrame with columns:
      ['startDate', col_name]
    """
    temp = df[df["type"] == type_id].copy()
    if temp.empty:
        print(f"[WARN] No rows for type: {type_id}")
        return pd.DataFrame(columns=["startDate", col_name])

    if unit_filter is not None:
        temp = temp[temp["unit"] == unit_filter].copy()
        if temp.empty:
            print(f"[WARN] No rows for type: {type_id} with unit={unit_filter}")
            return pd.DataFrame(columns=["startDate", col_name])

    temp = temp[["startDate", "value"]].copy()
    temp.rename(columns={"value": col_name}, inplace=True)
    temp[col_name] = pd.to_numeric(temp[col_name], errors="coerce")
    return temp


# ---------- Core cardiovascular / fitness metrics ----------
heart_rate   = extract("HKQuantityTypeIdentifierHeartRate", "heart_rate", "count/min")
hrv          = extract("HKQuantityTypeIdentifierHeartRateVariabilitySDNN", "hrv", "ms")
resting_hr   = extract("HKQuantityTypeIdentifierRestingHeartRate", "resting_hr", "count/min")
vo2          = extract("HKQuantityTypeIdentifierVO2Max", "vo2max", "mL/min·kg")

# Optional extra HR-related metric (if present)
walking_hr   = extract("HKQuantityTypeIdentifierWalkingHeartRateAverage",
                       "walking_hr", "count/min")

# ---------- Activity / energy ----------
steps        = extract("HKQuantityTypeIdentifierStepCount", "steps", "count")
active_energy = extract("HKQuantityTypeIdentifierActiveEnergyBurned", "active_energy_kcal", "kcal")
basal_energy  = extract("HKQuantityTypeIdentifierBasalEnergyBurned", "basal_energy_kcal", "kcal")
distance_walk_run = extract("HKQuantityTypeIdentifierDistanceWalkingRunning",
                            "distance_walk_run_m", None)  # unit often "m"
flights      = extract("HKQuantityTypeIdentifierFlightsClimbed", "flights", "count")
exercise_time = extract("HKQuantityTypeIdentifierAppleExerciseTime", "exercise_minutes", "min")
stand_time    = extract("HKQuantityTypeIdentifierAppleStandTime", "stand_minutes", "min")

# ---------- Respiratory / oxygen / temp ----------
resp_rate    = extract("HKQuantityTypeIdentifierRespiratoryRate", "respiratory_rate", "count/min")
spo2         = extract("HKQuantityTypeIdentifierOxygenSaturation", "spo2_percent", "%")
wrist_temp   = extract("HKQuantityTypeIdentifierBodyTemperature", "wrist_temp_degC", "degC")

# ---------- Body metrics (very important for diabetes) ----------
# Body mass from Apple Health. Your export is in pounds, so we use "lb".
body_mass_lb = extract("HKQuantityTypeIdentifierBodyMass", "body_mass_lb", "lb")

# Height (if present), usually in meters or centimeters
height_m = extract("HKQuantityTypeIdentifierHeight", "height_m", "m")
if height_m.empty:
    # Some exports use "cm"
    height_m = extract("HKQuantityTypeIdentifierHeight", "height_cm", "cm")
    if not height_m.empty:
        height_m["height_m"] = height_m["height_cm"] / 100.0
        height_m = height_m.drop(columns=["height_cm"])

# ---------- Characteristics: Date of Birth & Biological Sex ----------
dob_rows  = df[df["type"] == "HKCharacteristicTypeIdentifierDateOfBirth"].copy()
sex_rows  = df[df["type"] == "HKCharacteristicTypeIdentifierBiologicalSex"].copy()

dob = None
sex_clean = None
sex_encoded = np.nan

if not dob_rows.empty:
    # Usually the 'value' is a date string like "1998-07-15"
    dob = pd.to_datetime(dob_rows["value"].iloc[0], errors="coerce")

if not sex_rows.empty:
    raw_sex = str(sex_rows["value"].iloc[0])
    # Apple often uses "HKBiologicalSexMale" / "HKBiologicalSexFemale" or numeric codes
    if "Female" in raw_sex or raw_sex == "2":
        sex_clean = "Female"
        sex_encoded = 0
    elif "Male" in raw_sex or raw_sex == "3":
        sex_clean = "Male"
        sex_encoded = 1
    else:
        sex_clean = "Other"
        sex_encoded = 2
else:
    print("[WARN] No biological sex characteristic found.")


# ---------- Sleep (special handling) ----------
sleep = df[df["type"] == "HKCategoryTypeIdentifierSleepAnalysis"].copy()
if not sleep.empty:
    sleep["sleep_minutes"] = (
        pd.to_datetime(sleep["endDate"]) - pd.to_datetime(sleep["startDate"])
    ).dt.total_seconds() / 60
    sleep = sleep[["startDate", "sleep_minutes"]].rename(columns={"startDate": "timestamp"})
else:
    sleep = pd.DataFrame(columns=["timestamp", "sleep_minutes"])


# ---------- Merge everything into ONE DataFrame ----------
metric_dfs = [
    heart_rate, hrv, resting_hr, vo2,
    walking_hr,
    steps,
    active_energy, basal_energy,
    distance_walk_run, flights,
    exercise_time, stand_time,
    resp_rate, spo2, wrist_temp,
    body_mass_lb, height_m
]

# Filter out completely empty ones
metric_dfs = [m for m in metric_dfs if not m.empty]

if not metric_dfs:
    raise ValueError("No metric dataframes found; check your health_records types.")

# Start with the first non-empty DF
merged = metric_dfs[0]
for m in metric_dfs[1:]:
    merged = merged.merge(m, on="startDate", how="outer")

# Sort and rename time column
merged = merged.sort_values("startDate")
merged = merged.rename(columns={"startDate": "timestamp"})

# Merge sleep (outer join on timestamp)
merged = merged.merge(sleep, on="timestamp", how="left")


# ---------- Derived fields: age, sex, weight_kg, BMI ----------
# Age at each timestamp
if dob is not None:
    merged["age_years"] = (merged["timestamp"] - dob).dt.days / 365.25
else:
    merged["age_years"] = np.nan

# Sex (string + encoded numeric)
merged["sex"] = sex_clean
merged["sex_encoded"] = sex_encoded

# Weight in kg
if "body_mass_lb" in merged.columns:
    merged["weight_kg"] = merged["body_mass_lb"] * 0.453592
else:
    merged["weight_kg"] = np.nan

# BMI if height is available
if "height_m" in merged.columns:
    merged["bmi"] = merged["weight_kg"] / (merged["height_m"] ** 2)
else:
    merged["bmi"] = np.nan


# ---------- Optional: fill small gaps ----------
merged = merged.sort_values("timestamp")
merged = merged.ffill().bfill()


# ---------- Save ----------
out_path = base_dir / "apple_watch_combined_full.csv"
merged.to_csv(out_path, index=False)

print("\nCombined full CSV saved at:")
print(out_path)
print("\nColumns in final dataset:")
print(merged.columns.tolist())
print("\nPreview:")
print(merged.head())

Loading: /Users/nilvaghela/Desktop/Research/Diabetes/Data/health_records.csv


  df = pd.read_csv(records_path)
  df["endDate"]   = pd.to_datetime(df["endDate"], errors="ignore")


[WARN] No rows for type: HKQuantityTypeIdentifierActiveEnergyBurned with unit=kcal
[WARN] No rows for type: HKQuantityTypeIdentifierBasalEnergyBurned with unit=kcal
[WARN] No rows for type: HKQuantityTypeIdentifierBodyTemperature
[WARN] No rows for type: HKQuantityTypeIdentifierHeight with unit=m
[WARN] No rows for type: HKQuantityTypeIdentifierHeight with unit=cm
[WARN] No biological sex characteristic found.

Combined full CSV saved at:
/Users/nilvaghela/Desktop/Research/Diabetes/Data/apple_watch_combined_full.csv

Columns in final dataset:
['timestamp', 'heart_rate', 'hrv', 'resting_hr', 'vo2max', 'walking_hr', 'steps', 'distance_walk_run_m', 'flights', 'exercise_minutes', 'stand_minutes', 'respiratory_rate', 'spo2_percent', 'body_mass_lb', 'sleep_minutes', 'age_years', 'sex', 'sex_encoded', 'weight_kg', 'bmi']

Preview:
                  timestamp  heart_rate      hrv  resting_hr  vo2max  \
0 2018-02-10 05:37:30-05:00        79.0  132.015        70.0   34.25   
1 2018-02-10 06:34:3

In [10]:
import pandas as pd
from pathlib import Path

base_dir = Path("/Users/nilvaghela/Desktop/Research/Diabetes/Data")
combined_path = base_dir / "apple_watch_combined_full.csv"

print("Loading:", combined_path)
df = pd.read_csv(combined_path)

# Parse timestamp and extract date
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["date"] = df["timestamp"].dt.date

# --- Define possible aggregations per day (only applied if column exists) ---
agg_spec = {
    # Cardiovascular
    "heart_rate":          ["mean", "min", "max", "std"],
    "hrv":                 ["mean", "min", "max", "std"],
    "resting_hr":          ["mean"],
    "walking_hr":          ["mean"],          # may be missing; handled below
    "vo2max":              ["mean"],
    
    # Activity / movement
    "steps":               ["sum", "max"],
    "distance_walk_run_m": ["sum"],
    "flights":             ["sum"],
    "exercise_minutes":    ["sum"],
    "stand_minutes":       ["sum"],
    
    # Energy
    "active_energy_kcal":  ["sum"],
    "basal_energy_kcal":   ["sum"],
    
    # Respiration / oxygen / temperature
    "respiratory_rate":    ["mean", "min", "max"],
    "spo2_percent":        ["mean", "min"],
    "wrist_temp_degC":     ["mean"],
    
    # Sleep
    "sleep_minutes":       ["sum"],
    
    # Static-ish health features (can still vary slightly over time)
    "weight_kg":           ["mean"],
    "bmi":                 ["mean"],
    "age_years":           ["mean"],
    "sex_encoded":         ["first"],         # constant per user; 'first' is enough
}

# Build aggregation dict only for columns that actually exist in df
agg_dict = {}
for col, funcs in agg_spec.items():
    if col in df.columns:
        agg_dict[col] = funcs
    else:
        print(f"[WARN] Column '{col}' not found in combined data; skipping in daily agg.")

if not agg_dict:
    raise ValueError("No valid columns found for daily aggregation. Check your combined CSV.")

# Group by date and aggregate
daily = df.groupby("date").agg(agg_dict)

# Flatten MultiIndex columns: ('heart_rate','mean') -> 'heart_rate_mean'
daily.columns = [f"{col[0]}_{col[1]}" for col in daily.columns]
daily = daily.reset_index()

out_path = base_dir / "apple_watch_daily_features.csv"
daily.to_csv(out_path, index=False)

print("\nSaved daily features to:", out_path)
print("\nDaily feature columns:")
print(daily.columns.tolist())
print("\nPreview:")
print(daily.head())

Loading: /Users/nilvaghela/Desktop/Research/Diabetes/Data/apple_watch_combined_full.csv
[WARN] Column 'active_energy_kcal' not found in combined data; skipping in daily agg.
[WARN] Column 'basal_energy_kcal' not found in combined data; skipping in daily agg.
[WARN] Column 'wrist_temp_degC' not found in combined data; skipping in daily agg.

Saved daily features to: /Users/nilvaghela/Desktop/Research/Diabetes/Data/apple_watch_daily_features.csv

Daily feature columns:
['date', 'heart_rate_mean', 'heart_rate_min', 'heart_rate_max', 'heart_rate_std', 'hrv_mean', 'hrv_min', 'hrv_max', 'hrv_std', 'resting_hr_mean', 'walking_hr_mean', 'vo2max_mean', 'steps_sum', 'steps_max', 'distance_walk_run_m_sum', 'flights_sum', 'exercise_minutes_sum', 'stand_minutes_sum', 'respiratory_rate_mean', 'respiratory_rate_min', 'respiratory_rate_max', 'spo2_percent_mean', 'spo2_percent_min', 'sleep_minutes_sum', 'weight_kg_mean', 'bmi_mean', 'age_years_mean', 'sex_encoded_first']

Preview:
         date  heart_

In [12]:
import os
import pandas as pd
import numpy as np
from datetime import date, timedelta

# --------- Config --------- #
OUTPUT_DIR = "/Users/nilvaghela/Desktop/Research/Diabetes/Patients_90days"
os.makedirs(OUTPUT_DIR, exist_ok=True)

NUM_DIABETIC = 130      # number of diabetic patients
NUM_NORM = 120          # number of non-diabetic patients
N_DAYS = 90             # days per patient
BASE_DATE = date(2024, 1, 1)  # fixed start reference
RNG = np.random.default_rng(42)


def generate_patient_profile(is_diabetic: bool):
    """
    Generate a 'metabolic profile' for one patient:
    - age_years
    - sex_encoded (0=female, 1=male)
    - bmi
    - weight_kg
    - risk_factor (0..1): higher ~ worse metabolic health
    """
    # Sex ~50/50
    sex_encoded = int(RNG.uniform() < 0.5)  # 0 or 1

    # Base metabolic risk:
    # diabetics shifted higher, but with LARGE overlap
    # (non-diabetics can be unhealthy, diabetics can be relatively fit)
    base_mean = 0.7 if is_diabetic else 0.4
    risk_factor = np.clip(RNG.normal(loc=base_mean, scale=0.18), 0.0, 1.0)

    # Age: diabetics older on average, but overlap
    if is_diabetic:
        age_years = np.clip(RNG.normal(loc=58, scale=10), 30, 85)
    else:
        age_years = np.clip(RNG.normal(loc=45, scale=12), 20, 80)

    # BMI: correlated with risk
    # risk_factor ~0 => BMI ~22–26, risk_factor ~1 => BMI ~30–40
    bmi = np.clip(22 + 18 * risk_factor + RNG.normal(0, 2.5), 18, 45)

    # Assume height uniformly 1.55–1.85 m
    height_m = RNG.uniform(1.55, 1.85)
    weight_kg = bmi * (height_m ** 2)

    return {
        "age_years": float(age_years),
        "sex_encoded": sex_encoded,
        "bmi": float(bmi),
        "weight_kg": float(weight_kg),
        "risk_factor": float(risk_factor),
    }


def generate_random_day(is_diabetic: bool, day_index: int, profile: dict):
    """
    Generate one day's synthetic Apple Watch daily features
    for a single patient using their static profile (age, bmi, risk_factor).
    Distributions overlap between diabetic and non-diabetic.
    """
    day_date = BASE_DATE + timedelta(days=day_index)

    r = profile["risk_factor"]

    # Heart rate:
    # higher risk => higher resting HR and higher mean HR
    resting_hr = np.clip(55 + 25 * r + RNG.normal(0, 3), 45, 110)
    hr_mean = np.clip(resting_hr + RNG.normal(5, 5), 50, 130)

    # HRV:
    # higher risk => lower HRV, but add strong noise and overlap
    hrv_base = np.clip(140 - 90 * r + RNG.normal(0, 15), 10, 200)

    # VO2max:
    # higher risk => lower VO2, but overlap
    vo2_base = np.clip(55 - 20 * r + RNG.normal(0, 4), 15, 60)

    # Steps:
    # higher risk => tends to walk less, but we allow lazy healthy + active diabetics
    steps_base = np.clip(14000 - 9000 * r + RNG.normal(0, 2000), 1000, 20000)

    # Respiratory rate:
    resp_mean = np.clip(16 + 4 * r + RNG.normal(0, 1), 12, 26)

    # SpO2:
    spo2 = np.clip(97.5 - 2.5 * r + RNG.normal(0, 0.4), 90, 100)

    # Sleep temp (F): slightly higher with risk, but overlapped
    sleep_temp = np.clip(96.0 + 1.0 * r + RNG.normal(0, 0.4), 95.0, 99.0)

    # Exercise + stand minutes: correlated with steps and inverse risk but noisy
    ex_minutes = np.clip((steps_base / 200) * (1.0 - 0.3 * r) + RNG.normal(0, 5), 0, 120)
    stand_minutes = np.clip(40 + 25 * (1.0 - r) + RNG.normal(0, 10), 10, 90)

    # Daily noise: add variation day-to-day
    hr_mean_day = np.clip(hr_mean + RNG.normal(0, 3), 45, 135)
    hrv_day = np.clip(hrv_base + RNG.normal(0, 8), 5, 220)
    steps_day = np.clip(steps_base + RNG.normal(0, 1500), 500, 25000)
    vo2_day = np.clip(vo2_base + RNG.normal(0, 2.0), 10, 65)

    # Construct row
    row = {
        "date": day_date,

        # Heart rate stats
        "heart_rate_mean": hr_mean_day,
        "heart_rate_min": np.clip(hr_mean_day - RNG.uniform(5, 15), 35, hr_mean_day),
        "heart_rate_max": np.clip(hr_mean_day + RNG.uniform(10, 30), hr_mean_day, 180),
        "heart_rate_std": np.clip(RNG.uniform(3, 12) + 2 * r, 1, 20),

        # HRV stats
        "hrv_mean": hrv_day,
        "hrv_min": np.clip(hrv_day - RNG.uniform(10, 50), 1, hrv_day),
        "hrv_max": np.clip(hrv_day + RNG.uniform(10, 50), hrv_day, 250),
        "hrv_std": np.clip(RNG.uniform(5, 25) * (1.5 - r), 2, 40),

        # Activity
        "steps_sum": steps_day,
        "steps_max": np.clip(RNG.uniform(300, 1200) * (0.8 + 0.4 * (1.0 - r)), 200, 3000),

        "resting_hr_mean": resting_hr,
        "vo2max_mean": vo2_day,

        # Respiratory rate
        "respiratory_rate_mean": resp_mean,
        "respiratory_rate_min": np.clip(resp_mean - RNG.uniform(0.5, 2.5), 10, resp_mean),
        "respiratory_rate_max": np.clip(resp_mean + RNG.uniform(1.0, 4.0), resp_mean, 30),

        # SpO2
        "spo2_percent_mean": spo2,
        "spo2_percent_min": np.clip(spo2 - RNG.uniform(0.3, 2.5), 80, spo2),

        # Energy
        "active_energy_cal_sum": np.clip(steps_day * RNG.uniform(0.02, 0.05), 100, 1500),
        "basal_energy_cal_sum": np.clip(1500 + RNG.normal(0, 150), 1100, 2200),

        # Distance & movement
        "distance_walk_run_m_sum": np.clip(steps_day * RNG.uniform(0.6, 0.9), 500, 30000),
        "flights_sum": np.clip(RNG.normal(10 + 10 * (1.0 - r), 5), 0, 60),

        # Exercise / stand
        "exercise_minutes_sum": ex_minutes,
        "stand_minutes_sum": stand_minutes,

        # Sleep
        "sleeping_wrist_temp_degF_mean": sleep_temp,
        # fraction of 24h (0.3–0.95) turned loosely into minutes/24h
        "sleep_minutes_sum": np.clip(RNG.uniform(0.3, 0.95), 0.2, 1.0),

        # Static profile per day
        "age_years": profile["age_years"],
        "sex_encoded": profile["sex_encoded"],
        "weight_kg": profile["weight_kg"],
        "bmi": profile["bmi"],

        # Label
        "diabetes": 1 if is_diabetic else 0,
    }

    return row


# ------------- Generate CSVs ------------- #

# Diabetic patients: patient_1_d.csv ... patient_NUM_DIABETIC_d.csv
for pid in range(1, NUM_DIABETIC + 1):
    profile = generate_patient_profile(is_diabetic=True)
    rows = [generate_random_day(True, d, profile) for d in range(N_DAYS)]
    df = pd.DataFrame(rows).sort_values("date")
    file_path = os.path.join(OUTPUT_DIR, f"patient_{pid}_d.csv")
    df.to_csv(file_path, index=False)

# Non-diabetic patients: patient_{NUM_DIABETIC+1}.csv ... patient_{NUM_DIABETIC+NUM_NORM}.csv
for pid in range(NUM_DIABETIC + 1, NUM_DIABETIC + NUM_NORM + 1):
    profile = generate_patient_profile(is_diabetic=False)
    rows = [generate_random_day(False, d, profile) for d in range(N_DAYS)]
    df = pd.DataFrame(rows).sort_values("date")
    file_path = os.path.join(OUTPUT_DIR, f"patient_{pid}.csv")
    df.to_csv(file_path, index=False)

print(f"DONE — created {NUM_DIABETIC + NUM_NORM} patients × {N_DAYS} days each.")
print("Output folder:", OUTPUT_DIR)

DONE — created 250 patients × 90 days each.
Output folder: /Users/nilvaghela/Desktop/Research/Diabetes/Patients_90days
