In [1]:
import os
import sys
import time
import requests
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, model_selection
import copy
import pickle
import itertools
from datetime import datetime
import pandas as pd
import heapq
import re

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Subset, Dataset
import torch.nn.functional as F

import torchvision
from torchvision import transforms

from tqdm import tqdm

import glob
from collections import defaultdict

In [2]:
dir = "./data/mimic3/benchmark/multitask/test/"

In [33]:
# prepare priori information about mimic3 benchmark dataset
# mimic3 benchmark dataset feature list: [(name, mean_value_from_paper, is_numeric]
feature_list = [
    ("capillary_refill_rate", "0.0", "categorical"),
    ("diastolic_blood_pressure", 59.0, "continuous"),
    ("fraction_inspired_oxygen", 0.21, "continuous"),
    ("glascow_coma_scale_eye_opening", "4 spontaneously", "categorical"),
    ("glascow_coma_scale_motor_response", "6 obeys commands", "categorical"),
    ("glascow_coma_scale_total", "15", "categorical"),
    ("glascow_coma_scale_verbal_response", "5 oriented", "categorical"),
    ("glucose", 128.0, "continuous"),
    ("heart_rate", 86, "continuous"),
    ("height", 170.0, "continuous"),
    ("mean_blood_pressure", 77.0, "continuous"),
    ("oxygen_saturation", 98.0, "continuous"),
    ("respiratory_rate", 19, "continuous"),
    ("systolic_blood_pressure", 118.0, "continuous"),
    ("temperature", 36.6, "continuous"),
    ("weight", 81.0, "continuous"),
    ("ph", 7.4, "continuous"),
]
# form it into dictionary
feature_dict = {}
for name, avg, ftype in feature_list:
    feature_dict[name] = {"avg": avg, "type": ftype}

# for some columns that may contain abnormally big /small values, add boundaries
feature_dict["weight"]["bound"] = (0, 300)

def map_categorical_to_numeric(name, value):
    """
    Map categorical string labels that contain certain substring to integer labels (starting from 1, 0 for unknown)
    """

    if pd.isna(value):
        return np.nan
    substring_mapping = {
        # "capillary_refill_rate": 0.0 and 1.0
        "glascow_coma_scale_eye_opening": {
            "respon": 1,
            "pain": 2,
            "speech": 3,
            "spont": 4,
        },
        "glascow_coma_scale_motor_response": {
            "respon": 1,
            "extens": 2,
            "flex": 3,
            "withd": 4,
            "pain": 5,
            "obey": 6,
        },
        # "glascow_coma_scale_total": 3.0 to 15.0
        "glascow_coma_scale_verbal_response": {
            "respon": 1,
            "trach": 1,
            "incomp": 2,
            "inap": 3,
            "conf": 4,
            "orient": 5,
        },
    }
    if name == "capillary_refill_rate":
        return int(float(value)) + 1
    elif name == "glascow_coma_scale_total":
        return int(float(value)) - 2
    elif name in substring_mapping.keys():
        for substring, label in substring_mapping[name].items():
            if substring in value.lower():
                return label
        return 0
    else:
        return value

In [35]:
print(f"Start preprocessing data under {dir}")
episode_path_list = glob.glob(os.path.join(dir, "*_episode*_timeseries.csv"))
episode_df = {}
# read all dataframes into episode_df, fill all nan cells and convert categorical to numeric
for i, path in enumerate(episode_path_list):
    re_match = re.match(r"(\d+)_episode(\d+)_timeseries.csv", os.path.basename(path))
    if not re_match:
        raise ValueError(f"Error parsing csv file: {path}")
    subject_id, episode_number = map(int, re_match.groups())
    key = f"{subject_id}_episode{episode_number}"
    df = pd.read_csv(path, index_col=0) # "hours" as index
    # rename columns
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    # get rid of out-of-boundary numeric values by replacing with nan
    for name in feature_dict.keys():
        if feature_dict[name]["type"] == "continuous" and "bound" in feature_dict[name].keys():
            lo, hi = feature_dict[name]["bound"]
            df.loc[(df[name] < lo) | (df[name] > hi), name] = np.nan
    # map categorical features to numeric ones
    for name in feature_dict.keys():
        if feature_dict[name]["type"] == "categorical":
            df[name] = df[name].map(lambda x: map_categorical_to_numeric(name, x))
    # get rid of nan values by 1. forward filling 2. impute with reported avgs in paper
    df.ffill(inplace=True) # foward fill
    for name in feature_dict.keys(): # replace remaining nans (at the beginning) with reported avgs in paper
        if feature_dict[name]["type"] == "continuous":
            df[name].fillna(feature_dict[name]["avg"], inplace=True)
        else:
            df[name].fillna(map_categorical_to_numeric(name, feature_dict[name]["avg"]), inplace=True)
    # now df has to be a fully filled time series with all numeric values and no nan
    episode_df[key] = df


Start preprocessing data under ./data/mimic3/benchmark/multitask/test/


In [36]:
print(list(episode_df.values())[0].head())

          capillary_refill_rate  diastolic_blood_pressure  \
Hours                                                       
0.158333                    1.0                      69.0   
0.658333                    1.0                      63.0   
1.708333                    1.0                      63.0   
4.158333                    1.0                      63.0   
4.408333                    1.0                      62.0   

          fraction_inspired_oxygen  glascow_coma_scale_eye_opening  \
Hours                                                                
0.158333                      0.21                             2.0   
0.658333                      0.21                             2.0   
1.708333                      0.21                             2.0   
4.158333                      0.21                             2.0   
4.408333                      0.21                             2.0   

          glascow_coma_scale_motor_response  glascow_coma_scale_total  \
Hours   

In [37]:
df_in_one = pd.concat(episode_df.values(), ignore_index=True)

In [40]:
# normalizing all columns
avg = df_in_one.mean().to_dict()
std = df_in_one.std().to_dict()
print(avg, std)

for df in episode_df.values():
    for name in feature_dict.keys():
        df[name] = (df[name] - avg[name]) / std[name]

{'capillary_refill_rate': 1.0040801853396526, 'diastolic_blood_pressure': 62.56490035137904, 'fraction_inspired_oxygen': 0.3113870799733931, 'glascow_coma_scale_eye_opening': 3.4428173150262436, 'glascow_coma_scale_motor_response': 5.197116145468305, 'glascow_coma_scale_total': 11.123562148159872, 'glascow_coma_scale_verbal_response': 3.0868476195349137, 'glucose': 137.44480773271871, 'heart_rate': 86.30767465131255, 'height': 169.9375290664122, 'mean_blood_pressure': 79.36889965119009, 'oxygen_saturation': 104.34702037001912, 'respiratory_rate': 19.779955583222286, 'systolic_blood_pressure': 121.4659352001362, 'temperature': 36.91911732404078, 'weight': 83.4849075520237, 'ph': 7.179924525567048} {'capillary_refill_rate': 0.06374591957623935, 'diastolic_blood_pressure': 303.6462958831118, 'fraction_inspired_oxygen': 0.1775374918111497, 'glascow_coma_scale_eye_opening': 0.9146847087265197, 'glascow_coma_scale_motor_response': 1.5569547655221985, 'glascow_coma_scale_total': 3.32852735624

In [41]:
print(list(episode_df.values())[0].head())

          capillary_refill_rate  diastolic_blood_pressure  \
Hours                                                       
0.158333              -0.064007                  0.021193   
0.658333              -0.064007                  0.001433   
1.708333              -0.064007                  0.001433   
4.158333              -0.064007                  0.001433   
4.408333              -0.064007                 -0.001860   

          fraction_inspired_oxygen  glascow_coma_scale_eye_opening  \
Hours                                                                
0.158333                 -0.571074                       -1.577393   
0.658333                 -0.571074                       -1.577393   
1.708333                 -0.571074                       -1.577393   
4.158333                 -0.571074                       -1.577393   
4.408333                 -0.571074                       -1.577393   

          glascow_coma_scale_motor_response  glascow_coma_scale_total  \
Hours   