In [3]:
import numpy as np
import pandas as pd
import random

# --- 1. Static Patient Records (The "Files") ---
# Patient ID, Age, Weight (kg), Base Risk Factor (Genetic)
patient_ids = [101, 102, 103, 104, 105]
static_data = pd.DataFrame({
    "Patient_ID": patient_ids,
    "Age": [65, 30, 50, 75, 40],
    "Weight": [80, 70, 95, 60, 85],
    "Genetic_Risk": [1.2, 1.0, 1.1, 1.5, 1.0] # Multiplier for sepsis risk
})

# --- 2. Streaming Sensor Logs (The "Mess") ---
# Sensors fire roughly every minute, but sometimes gap or drift.
sensor_data = []

start_time = pd.Timestamp("2025-01-01 08:00:00")

for pid in patient_ids:
    # Each patient has ~60 minutes of data
    current_time = start_time
    for _ in range(60): 
        # 1. Random Time Drift (Sensors aren't perfect)
        current_time += pd.Timedelta(seconds=np.random.randint(50, 70))
        
        # 2. Simulate Vitals
        # Patient 104 is the "Code Blue" case (Vitals crashing)
        if pid == 104 and _ > 40: 
            hr = np.random.randint(130, 160) # Tachycardia
            o2 = np.random.randint(80, 90)   # Hypoxia
            temp = np.random.uniform(101.0, 104.0) # Fever
        else:
            hr = np.random.randint(60, 100)
            o2 = np.random.randint(95, 100)
            temp = np.random.uniform(98.0, 99.5)

        # 3. Simulate Sensor Failure (NaNs)
        # 10% chance a sensor drops a reading
        if random.random() < 0.1: hr = np.nan 
        if random.random() < 0.1: o2 = np.nan

        sensor_data.append([current_time, pid, hr, o2, temp])

# Convert to DataFrame
logs = pd.DataFrame(sensor_data, columns=["Timestamp", "Patient_ID", "HR", "O2", "Temp"])
logs["Timestamp"] = pd.to_datetime(logs["Timestamp"])
logs.set_index("Timestamp", inplace=True)


In [8]:
def process_patient(patient_df):
    temp_df = patient_df.drop(columns=["Patient_ID"])
    clean_df = temp_df.resample("1T").mean().interpolate(method="linear")
    return clean_df.dropna()

In [37]:
final_logs =logs.groupby("Patient_ID").apply(process_patient)
final_logs.reset_index()
final_logs.shape
final_logs


  clean_df = temp_df.resample("1T").mean().interpolate(method="linear")
  clean_df = temp_df.resample("1T").mean().interpolate(method="linear")
  clean_df = temp_df.resample("1T").mean().interpolate(method="linear")
  clean_df = temp_df.resample("1T").mean().interpolate(method="linear")
  clean_df = temp_df.resample("1T").mean().interpolate(method="linear")
  final_logs =logs.groupby("Patient_ID").apply(process_patient)


Unnamed: 0_level_0,Unnamed: 1_level_0,HR,O2,Temp
Patient_ID,Timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
101,2025-01-01 08:01:00,95.50,97.00,98.532742
101,2025-01-01 08:02:00,78.25,95.00,98.575718
101,2025-01-01 08:03:00,61.00,95.00,98.699993
101,2025-01-01 08:04:00,69.50,96.00,98.734213
101,2025-01-01 08:05:00,78.00,97.00,98.768433
...,...,...,...,...
105,2025-01-01 08:55:00,99.00,95.00,98.373542
105,2025-01-01 08:56:00,78.00,95.00,98.389560
105,2025-01-01 08:57:00,90.00,97.00,98.730799
105,2025-01-01 08:58:00,75.50,96.75,98.931714


In [13]:
static_data

Unnamed: 0,Patient_ID,Age,Weight,Genetic_Risk
0,101,65,80,1.2
1,102,30,70,1.0
2,103,50,95,1.1
3,104,75,60,1.5
4,105,40,85,1.0


In [16]:
merged_df = pd.merge(final_logs, static_data, on='Patient_ID',how='left')

In [17]:
merged_df

Unnamed: 0,Patient_ID,HR,O2,Temp,Age,Weight,Genetic_Risk
0,101,95.50,97.00,98.532742,65,80,1.2
1,101,78.25,95.00,98.575718,65,80,1.2
2,101,61.00,95.00,98.699993,65,80,1.2
3,101,69.50,96.00,98.734213,65,80,1.2
4,101,78.00,97.00,98.768433,65,80,1.2
...,...,...,...,...,...,...,...
290,105,99.00,95.00,98.373542,40,85,1.0
291,105,78.00,95.00,98.389560,40,85,1.0
292,105,90.00,97.00,98.730799,40,85,1.0
293,105,75.50,96.75,98.931714,40,85,1.0


In [20]:
cols = ["HR", "O2", "Temp"]
for col in cols :
    merged_df[f"{col}_Z"]= (merged_df[col] - merged_df[col].mean()) / merged_df[col].std()


print(merged_df[["HR_Z", "Temp_Z", "O2_Z"]].head())

       HR_Z    Temp_Z      O2_Z
0  0.572304 -0.437484  0.225054
1 -0.287482 -0.394615 -0.334028
2 -1.147268 -0.270652 -0.334028
3 -0.723606 -0.236518 -0.054487
4 -0.299943 -0.202384  0.225054


In [21]:
weights = np.array([0.6, 0.3, -0.5])

In [29]:
risk_matrix = merged_df[["HR_Z", "Temp_Z", "O2_Z"]].values


In [30]:
merged_df["Base_Score"] = risk_matrix @ weights

In [32]:
merged_df["Total_Risk"]= merged_df["Base_Score"] * merged_df["Genetic_Risk"]

In [33]:
merged_df

Unnamed: 0,Patient_ID,HR,O2,Temp,Age,Weight,Genetic_Risk,HR_Z,O2_Z,Temp_Z,Base_Score,Total_Risk
0,101,95.50,97.00,98.532742,65,80,1.2,0.572304,0.225054,-0.437484,0.099610,0.119532
1,101,78.25,95.00,98.575718,65,80,1.2,-0.287482,-0.334028,-0.394615,-0.123860,-0.148632
2,101,61.00,95.00,98.699993,65,80,1.2,-1.147268,-0.334028,-0.270652,-0.602543,-0.723051
3,101,69.50,96.00,98.734213,65,80,1.2,-0.723606,-0.054487,-0.236518,-0.477875,-0.573450
4,101,78.00,97.00,98.768433,65,80,1.2,-0.299943,0.225054,-0.202384,-0.353208,-0.423850
...,...,...,...,...,...,...,...,...,...,...,...,...
290,105,99.00,95.00,98.373542,40,85,1.0,0.746753,-0.334028,-0.596284,0.436181,0.436181
291,105,78.00,95.00,98.389560,40,85,1.0,-0.299943,-0.334028,-0.580307,-0.187044,-0.187044
292,105,90.00,97.00,98.730799,40,85,1.0,0.298169,0.225054,-0.239923,-0.005603,-0.005603
293,105,75.50,96.75,98.931714,40,85,1.0,-0.424550,0.155169,-0.039512,-0.344168,-0.344168


In [36]:
# find row with the maximum total risk
max_risk_row = merged_df.loc[merged_df["Total_Risk"].idxmax()]
print("Patient with highest sepsis risk:")
print(max_risk_row)


Patient with highest sepsis risk:
Patient_ID      104.000000
HR              152.000000
O2               83.000000
Temp            102.940942
Age              75.000000
Weight           60.000000
Genetic_Risk      1.500000
HR_Z              3.388414
O2_Z             -3.688522
Temp_Z            3.959661
Base_Score        5.065208
Total_Risk        7.597812
Name: 235, dtype: float64
