In [77]:
import numpy as np
import pandas as pd
import random

# --- 1. Static Patient Records (The "Files") ---
# Patient ID, Age, Weight (kg), Base Risk Factor (Genetic)
patient_ids = [101, 102, 103, 104, 105]
static_data = pd.DataFrame({
    "Patient_ID": patient_ids,
    "Age": [65, 30, 50, 75, 40],
    "Weight": [80, 70, 95, 60, 85],
    "Genetic_Risk": [1.2, 1.0, 1.1, 1.5, 1.0] # Multiplier for sepsis risk
})

# --- 2. Streaming Sensor Logs (The "Mess") ---
# Sensors fire roughly every minute, but sometimes gap or drift.
sensor_data = []

start_time = pd.Timestamp("2025-01-01 08:00:00")

for pid in patient_ids:
    # Each patient has ~60 minutes of data
    current_time = start_time
    for _ in range(60): 
        # 1. Random Time Drift (Sensors aren't perfect)
        current_time += pd.Timedelta(seconds=np.random.randint(50, 70))
        
        # 2. Simulate Vitals
        # Patient 104 is the "Code Blue" case (Vitals crashing)
        if pid == 104 and _ > 40: 
            hr = np.random.randint(130, 160) # Tachycardia
            o2 = np.random.randint(80, 90)   # Hypoxia
            temp = np.random.uniform(101.0, 104.0) # Fever
        else:
            hr = np.random.randint(60, 100)
            o2 = np.random.randint(95, 100)
            temp = np.random.uniform(98.0, 99.5)

        # 3. Simulate Sensor Failure (NaNs)
        # 10% chance a sensor drops a reading
        if random.random() < 0.1: hr = np.nan 
        if random.random() < 0.1: o2 = np.nan

        sensor_data.append([current_time, pid, hr, o2, temp])

# Convert to DataFrame
logs = pd.DataFrame(sensor_data, columns=["Timestamp", "Patient_ID", "HR", "O2", "Temp"])


In [78]:
logs["Timestamp"] = pd.to_datetime(logs["Timestamp"])

In [80]:
logs.set_index("Timestamp", inplace=True)

In [82]:
logs.groupby("Patient_ID")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D6B9EA87D0>

In [89]:
mask101 = logs["Patient_ID"] == 101
mask102 = logs["Patient_ID"] == 102
mask103 = logs["Patient_ID"] == 103
mask104 = logs["Patient_ID"] == 104
mask105 = logs["Patient_ID"] == 105


In [90]:
logs_101 = logs[mask101]
logs_102 = logs[mask102]
logs_103 = logs[mask103]
logs_104 = logs[mask104]
logs_105 = logs[mask105]


In [109]:
cleaned_101 = logs_101.resample("1T").mean().interpolate(method="linear")
cleaned_102 = logs_102.resample("1T").mean().interpolate(method="linear")
cleaned_103 = logs_103.resample("1T").mean().interpolate(method="linear")
cleaned_104 = logs_104.resample("1T").mean().interpolate(method="linear")
cleaned_105 = logs_105.resample("1T").mean().interpolate(method="linear")
cleaned_101.dropna(inplace=True)
cleaned_102.dropna(inplace=True)
cleaned_103.dropna(inplace=True)
cleaned_104.dropna(inplace=True)
cleaned_105.dropna(inplace=True)


  cleaned_101 = logs_101.resample("1T").mean().interpolate(method="linear")
  cleaned_102 = logs_102.resample("1T").mean().interpolate(method="linear")
  cleaned_103 = logs_103.resample("1T").mean().interpolate(method="linear")
  cleaned_104 = logs_104.resample("1T").mean().interpolate(method="linear")
  cleaned_105 = logs_105.resample("1T").mean().interpolate(method="linear")


In [111]:
final_logs = pd.concat([cleaned_101, cleaned_102, cleaned_103, cleaned_104, cleaned_105])