In [2]:
import numpy as np
import pandas as pd
import random

# --- 1. Static Patient Records (The "Files") ---
# Patient ID, Age, Weight (kg), Base Risk Factor (Genetic)
patient_ids = [101, 102, 103, 104, 105]
static_data = pd.DataFrame({
    "Patient_ID": patient_ids,
    "Age": [65, 30, 50, 75, 40],
    "Weight": [80, 70, 95, 60, 85],
    "Genetic_Risk": [1.2, 1.0, 1.1, 1.5, 1.0] # Multiplier for sepsis risk
})

# --- 2. Streaming Sensor Logs (The "Mess") ---
# Sensors fire roughly every minute, but sometimes gap or drift.
sensor_data = []

start_time = pd.Timestamp("2025-01-01 08:00:00")

for pid in patient_ids:
    # Each patient has ~60 minutes of data
    current_time = start_time
    for _ in range(60): 
        # 1. Random Time Drift (Sensors aren't perfect)
        current_time += pd.Timedelta(seconds=np.random.randint(50, 70))
        
        # 2. Simulate Vitals
        # Patient 104 is the "Code Blue" case (Vitals crashing)
        if pid == 104 and _ > 40: 
            hr = np.random.randint(130, 160) # Tachycardia
            o2 = np.random.randint(80, 90)   # Hypoxia
            temp = np.random.uniform(101.0, 104.0) # Fever
        else:
            hr = np.random.randint(60, 100)
            o2 = np.random.randint(95, 100)
            temp = np.random.uniform(98.0, 99.5)

        # 3. Simulate Sensor Failure (NaNs)
        # 10% chance a sensor drops a reading
        if random.random() < 0.1: hr = np.nan 
        if random.random() < 0.1: o2 = np.nan

        sensor_data.append([current_time, pid, hr, o2, temp])

# Convert to DataFrame
logs = pd.DataFrame(sensor_data, columns=["Timestamp", "Patient_ID", "HR", "O2", "Temp"])

print("--- Patient Records ---")
print(static_data)
print("\n--- Raw Sensor Logs (First 5 rows) ---")
print(logs)

--- Patient Records ---
   Patient_ID  Age  Weight  Genetic_Risk
0         101   65      80           1.2
1         102   30      70           1.0
2         103   50      95           1.1
3         104   75      60           1.5
4         105   40      85           1.0

--- Raw Sensor Logs (First 5 rows) ---
              Timestamp  Patient_ID    HR    O2       Temp
0   2025-01-01 08:00:50         101  78.0  99.0  98.094319
1   2025-01-01 08:01:42         101  66.0  96.0  99.476023
2   2025-01-01 08:02:39         101   NaN  99.0  98.103176
3   2025-01-01 08:03:33         101  89.0  97.0  99.482519
4   2025-01-01 08:04:35         101  76.0  98.0  98.005762
..                  ...         ...   ...   ...        ...
295 2025-01-01 08:54:01         105  98.0   NaN  99.417490
296 2025-01-01 08:55:01         105  61.0  98.0  99.203715
297 2025-01-01 08:55:58         105  86.0  97.0  98.661151
298 2025-01-01 08:56:55         105  90.0  95.0  99.365562
299 2025-01-01 08:57:56         105  65.0

In [4]:
logs = logs.groupby("Patient_ID")

In [6]:
print(logs)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017357C1E490>
