# Stress & Hypoxemia Prediction

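This notebook cleans and merges the provided `bloodgas.csv` and `pulseoximeter.csv` files, derives labels for **stress** and **hypoxemia**, and trains an example classifier for each. The final dataset is aligned with the features streamed from Firebase (`RR`, `dev60_HR`, `saturation`, `pi`, `date`, `time`); of these, only the four numeric signals (`RR`, `dev60_HR`, `saturation`, `pi`) are kept as model inputs.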

In [9]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


## Load Data

In [10]:
# Load blood gas and pulse oximeter data.
# DATA_DIR is the single place to edit when the repository lives elsewhere.
DATA_DIR = Path(r"C:\#PROJECT\MONITOR\openoximetry-repository-1.1.1")

# `encounter_id` is a merge key for both tables further down, so it is read as
# a string in BOTH files; letting pandas infer the dtype for only one of them
# risks a key-type mismatch at merge time.
df_bg = pd.read_csv(DATA_DIR / "bloodgas.csv", dtype={'encounter_id': str})
df_pox = pd.read_csv(DATA_DIR / "pulseoximeter.csv", dtype={'encounter_id': str})

print("Blood gas shape:", df_bg.shape)
print("Pulse oximeter shape:", df_pox.shape)


Blood gas shape: (32877, 44)
Pulse oximeter shape: (89404, 6)


## Clean & Merge Data

In [11]:
# Handle duplicate HR columns (keep main version).
# Every column whose name starts with the base name is treated as a copy of it
# (presumably the suffixed duplicates produced when the CSV header repeats a
# name - confirm against the raw file).
for base in ['dev60_HR', 'dev59_HR', 'dev64_HR']:
    variants = [c for c in df_bg.columns if c.startswith(base)]
    if len(variants) > 1:
        # Row-wise back-fill pulls the first non-null value into the leftmost
        # column, which is then stored under the canonical name.
        df_bg[base] = df_bg[variants].bfill(axis=1).iloc[:, 0]
        df_bg = df_bg.drop(columns=[c for c in variants if c != base])

# Ensure both averaged pulse-oximeter signals are numeric; unparseable entries
# become NaN and are ignored by the mean below.
for signal in ('saturation', 'pi'):
    df_pox[signal] = pd.to_numeric(df_pox[signal], errors='coerce')

# Group by encounter+sample to average across devices
pulse_grp = (
    df_pox.groupby(['encounter_id', 'sample_number'], as_index=False)[['saturation', 'pi']]
    .mean()
    .rename(columns={'sample_number': 'sample'})
)

# Merge. After the groupby each (encounter_id, sample) key is unique on the
# pulse-oximeter side; `validate` makes that assumption explicit so a future
# change cannot silently multiply blood gas rows.
df_merged = pd.merge(
    df_bg, pulse_grp,
    on=['encounter_id', 'sample'],
    how='inner',
    validate='many_to_one',
)

# Drop rows with missing key features
df_merged = df_merged.dropna(subset=['RR', 'dev60_HR', 'saturation', 'pi'])

print("Merged shape:", df_merged.shape)
df_merged.head()


Merged shape: (28037, 43)


Unnamed: 0,patient_id,encounter_id,type,date,time,sample,ph,pco2,po2,so2,...,dev71_ABP_SYS,dev71_ABP_DIA,dev71_ABP_MEAN,dev71_ART_SYS,dev71_ART_DIA,dev71_ART_MEAN,dev73_HR,dev76_HR,saturation,pi
1,55475c97e020e3161c4c98884cacd4ae73bf48629373c6...,01a542fef90c977867f1016bb46772e233006183c5c879...,1,2218-10-03,14:43:00,16.0,7.416,39.3,72.0,95.4,...,,,,126.0,62.0,82.0,93.0,,96.714286,9.266667
2,55475c97e020e3161c4c98884cacd4ae73bf48629373c6...,01a542fef90c977867f1016bb46772e233006183c5c879...,1,2218-10-03,14:44:00,15.0,7.415,39.2,72.5,95.5,...,,,,132.0,64.0,85.0,87.0,,97.0,12.3
3,55475c97e020e3161c4c98884cacd4ae73bf48629373c6...,01a542fef90c977867f1016bb46772e233006183c5c879...,1,2218-10-03,14:45:00,16.0,7.421,38.5,71.6,95.6,...,,,,126.0,62.0,82.0,93.0,,96.714286,9.266667
4,55475c97e020e3161c4c98884cacd4ae73bf48629373c6...,01a542fef90c977867f1016bb46772e233006183c5c879...,1,2218-10-03,14:46:00,17.0,7.418,38.9,71.0,95.2,...,,,,128.0,67.0,66.0,95.0,,96.714286,11.066667
5,55475c97e020e3161c4c98884cacd4ae73bf48629373c6...,01a542fef90c977867f1016bb46772e233006183c5c879...,1,2218-10-03,14:46:00,18.0,7.411,40.2,44.7,82.8,...,,,,129.0,61.0,78.0,100.0,,84.285714,13.5


## Create Labels (Stress & Hypoxemia)

In [12]:
# --- Stress classification ---
def categorize_stress_lactate(val, hr, spo2):
    """Map a lactate value to a stress class.

    Parameters
    ----------
    val : float
        Lactate value.
    hr : float
        Heart-rate reading; 0 marks an invalid reading.
    spo2 : float
        Saturation reading; 0 marks an invalid reading.

    Returns
    -------
    str
        "N/A" when any input is missing or when `hr`/`spo2` is 0,
        "normal" for val < 2, "moderate" for 2 <= val < 4, otherwise "severe".
    """
    # A missing value fails every `<` comparison, so without this guard a NaN
    # lactate would fall through to the final branch and be labelled "severe".
    if pd.isna(val) or pd.isna(hr) or pd.isna(spo2):
        return "N/A"
    if hr == 0 or spo2 == 0:   # invalid reading
        return "N/A"
    elif val < 2:
        return "normal"
    elif val < 4:
        return "moderate"
    else:
        return "severe"

# --- Hypoxemia classification ---
def categorize_po2(val, hr):
    """Map an oxygenation value to a hypoxemia class.

    Despite the name, the caller in this notebook passes the pulse-oximeter
    `saturation` column as `val`.

    Parameters
    ----------
    val : float
        Oxygenation value; 0 marks an invalid reading.
    hr : float
        Heart-rate reading; 0 marks an invalid reading.

    Returns
    -------
    str
        "N/A" when an input is missing or 0, "severe" for val < 80,
        "moderate" for 80 <= val < 85, "mild" for 85 <= val < 93,
        otherwise "normal".
    """
    # A missing value fails every `<` comparison, so without this guard a NaN
    # reading would fall through to the final branch and be labelled "normal".
    if pd.isna(val) or pd.isna(hr):
        return "N/A"
    if val == 0 or hr == 0:   # invalid reading
        return "N/A"
    elif val < 80:
        return "severe"
    elif val < 85:
        return "moderate"
    elif val < 93:
        return "mild"
    else:
        return "normal"

# --- Binary stress (with N/A handling) ---
# A missing lactate is reported as "N/A": `NaN > 2` evaluates to False, which
# would otherwise record "no stress" for rows that simply have no measurement.
# NOTE(review): the binary flag uses lactate > 2, while categorize_stress_lactate
# already calls lactate == 2 "moderate" - confirm which boundary is intended.
df_merged['stress'] = df_merged.apply(
    lambda row: "N/A"
    if row['dev60_HR'] == 0 or row['saturation'] == 0 or pd.isna(row['lactate'])
    else int(row['lactate'] > 2),
    axis=1
)

# --- Stress multiclass ---
df_merged['stress_class'] = df_merged.apply(
    lambda row: categorize_stress_lactate(row['lactate'], row['dev60_HR'], row['saturation']),
    axis=1
)

# NOTE(review): both hypoxemia labels below are computed from `saturation`,
# which is also used as a model feature, so a classifier can reproduce them
# from that one column. The helper's name (categorize_po2) suggests a blood-gas
# value may have been the intended reference - confirm.

# --- Binary hypoxemia (with N/A handling) ---
df_merged['hypoxemia'] = df_merged.apply(
    lambda row: "N/A" if row['saturation'] == 0 or row['dev60_HR'] == 0 else int(row['saturation'] < 90),
    axis=1
)

# --- Hypoxemia multiclass ---
df_merged['hypoxemia_class'] = df_merged.apply(
    lambda row: categorize_po2(row['saturation'], row['dev60_HR']),
    axis=1
)

# --- Final Preview ---
print(df_merged[['RR','dev60_HR','saturation','pi','stress','stress_class','hypoxemia','hypoxemia_class']].head())


     RR  dev60_HR  saturation         pi  stress stress_class  hypoxemia  \
1  20.0      93.0   96.714286   9.266667       0       normal          0   
2  20.0      88.0   97.000000  12.300000       0       normal          0   
3  20.0      93.0   96.714286   9.266667       0       normal          0   
4  20.0      95.0   96.714286  11.066667       0       normal          0   
5  20.0      99.0   84.285714  13.500000       0       normal          1   

  hypoxemia_class  
1          normal  
2          normal  
3          normal  
4          normal  
5        moderate  


## Prepare Final Dataset

In [13]:
# Columns carried into the modelling table: the four input signals first,
# followed by the binary and multiclass labels.
model_columns = [
    'RR', 'dev60_HR', 'saturation', 'pi',
    'stress', 'stress_class', 'hypoxemia', 'hypoxemia_class',
]

# Independent copy, so later edits to final_df never write back into df_merged
final_df = df_merged.loc[:, model_columns].copy()

print("Final dataset shape:", final_df.shape)
final_df.head()


Final dataset shape: (28037, 8)


Unnamed: 0,RR,dev60_HR,saturation,pi,stress,stress_class,hypoxemia,hypoxemia_class
1,20.0,93.0,96.714286,9.266667,0,normal,0,normal
2,20.0,88.0,97.0,12.3,0,normal,0,normal
3,20.0,93.0,96.714286,9.266667,0,normal,0,normal
4,20.0,95.0,96.714286,11.066667,0,normal,0,normal
5,20.0,99.0,84.285714,13.5,0,normal,1,moderate


## Example ML Pipeline

In [14]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Features & labels
# "N/A" is kept as a class only where the features themselves explain it
# (heart rate or saturation equal to 0). Any other "N/A" label - for example
# one assigned because the lab value was unavailable - cannot be predicted
# from the features, so those rows are left out of training.
sensor_invalid = (final_df['dev60_HR'] == 0) | (final_df['saturation'] == 0)
stress_rows = final_df[(final_df['stress_class'] != "N/A") | sensor_invalid]

X_00 = stress_rows[['RR','dev60_HR','saturation','pi']]
y_stress = stress_rows['stress_class']

# Train-test split
# NOTE(review): this split is row-level; rows from the same encounter can land
# on both sides, which may inflate the test score - consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(
    X_00, y_stress, train_size=0.6, stratify=y_stress, random_state=42
)

# Pipeline
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=0))
])

# Train
pipeline.fit(X_train, y_train)

# Evaluate
print("Train score (stress):", pipeline.score(X_train, y_train))
print("Test score (stress):", pipeline.score(X_test, y_test))

# --- Save model ---
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout has no `model/` folder).
Path("model").mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, "model/stress_model.pkl")
print("✅ Model saved as stress_model.pkl")

# --- Load model later ---
# Only load pickles produced by this notebook: unpickling executes arbitrary code.
loaded_model = joblib.load("model/stress_model.pkl")

# Test prediction
print("Sample prediction:", loaded_model.predict(X_test.iloc[:5]))


Train score (stress): 0.9977410533824753
Test score (stress): 0.9804725813642443
✅ Model saved as stress_model.pkl
Sample prediction: ['normal' 'normal' 'normal' 'normal' 'normal']


In [15]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# -------------------------------
# Train Hypoxemia Model
# -------------------------------
# NOTE(review): `hypoxemia_class` is computed directly from `saturation` and
# `dev60_HR`, both of which are features here, so the classifier only has to
# re-learn fixed thresholds. A perfect score is therefore expected and says
# nothing about predictive power on an independent reference measurement.
X_01 = final_df[['RR','dev60_HR','saturation','pi']]
y_hypox = final_df['hypoxemia_class']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_01, y_hypox, train_size=0.6, stratify=y_hypox, random_state=42
)

# Pipeline
pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=0))
])

# Train
pipeline.fit(X_train, y_train)

# Evaluate
print("Train accuracy:", pipeline.score(X_train, y_train))
print("Test accuracy:", pipeline.score(X_test, y_test))

# --- Save model ---
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout has no `model/` folder).
Path("model").mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, "model/hypoxemia_model.pkl")
print("✅ Hypoxemia model saved as hypoxemia_model.pkl")

# --- Load model later ---
# Only load pickles produced by this notebook: unpickling executes arbitrary code.
loaded_hypoxemia_model = joblib.load("model/hypoxemia_model.pkl")

# Test prediction
print("Sample prediction:", loaded_hypoxemia_model.predict(X_test.iloc[:5]))


Train accuracy: 1.0
Test accuracy: 1.0
✅ Hypoxemia model saved as hypoxemia_model.pkl
Sample prediction: ['normal' 'mild' 'mild' 'severe' 'mild']
