In [1]:
import json
import os

import numpy as np
import pandas as pd
import torch
from tensordict import TensorDict
from torch.utils.data import DataLoader, Dataset, TensorDataset

In [13]:
# Load the sample CheXpert test split into `data` (the parsed JSON object).
# JSON interchange files are UTF-8, so the encoding is stated explicitly instead
# of relying on the platform default.
with open(
    "../data/s3/mimic_iv__multiclass_multioutput__json_files/sample__test_set__chexpert.json",
    encoding="utf-8",
) as f:
    data = json.load(f)

In [3]:
# Columns pulled into the input frame: three identifier columns followed by
# six vital-sign columns.
input_cols = [
    "patient_id", "visit_id", "study_id",
    "temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp",
]
# Columns pulled into the label frame.
output_cols = [
    "atelectasis", "cardiomegaly", "edema",
    "lung_opacity", "pleural_effusion", "pneumonia",
]

# Build both frames from the same parsed JSON, selecting columns by name.
# Only patient_id and study_id are cast on the input side; every label column
# is cast to int.
input_df = pd.DataFrame(data=data, columns=input_cols).astype(
    {"patient_id": int, "study_id": int}
)
output_df = pd.DataFrame(data=data, columns=output_cols).astype(
    dict.fromkeys(output_cols, int)
)

In [4]:
def scale_min_max(df, col_name):
    """
    Min-max scale one column of ``df`` to the [0, 1] range, in place.

    Args:
        df: DataFrame holding the column; the column is overwritten in place.
        col_name: Name of the numeric column to rescale.

    Returns:
        The same ``df`` object that was passed in, so callers can write
        ``df = scale_min_max(df, col_name)``.

    A constant column (max == min) is mapped to zeros instead of being divided
    by a zero range, which would otherwise fill the column with NaN.
    """
    xmin = df[col_name].min()
    xmax = df[col_name].max()
    span = xmax - xmin
    centered = df[col_name] - xmin
    # Skip the division only when the range is exactly zero; NaN ranges fall
    # through and keep propagating NaN as before.
    df[col_name] = centered / span if span != 0 else centered
    return df

In [5]:
# Min-max scale each of these vital-sign columns via scale_min_max.
for vital in ("temperature", "heartrate", "resprate", "sbp", "dbp"):
    input_df = scale_min_max(input_df, vital)
# o2sat is multiplied by 0.01 rather than min-max scaled
# (presumably a percentage turned into a fraction -- TODO confirm).
input_df["o2sat"] = input_df["o2sat"].mul(0.01)

In [6]:
# Convert the whole input frame to a tensor.
# NOTE(review): this includes the patient_id / visit_id / study_id columns,
# which are not scaled (see the recorded output of test[0][0]).
input_tensor = torch.tensor(input_df.to_numpy())

In [7]:
# Convert the integer label frame to a tensor, one row per sample.
output_tensor = torch.tensor(output_df.to_numpy())

In [8]:
# Pair each input row with its label row: test[i] is (inputs, labels).
test = TensorDataset(input_tensor, output_tensor)

In [9]:
# Labels of the first sample; should match the first row of output_df.
test[0][1]

tensor([1, 0, 0, 0, 0, 1])

In [10]:
# First label row of the DataFrame, for comparison with test[0][1].
output_df.head(1)

Unnamed: 0,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia
2561,1,0,0,0,0,1


In [11]:
# Inputs of the first sample; should match the first row of input_df.
test[0][0]

tensor([1.0595e+07, 2.4341e+07, 5.1622e+07, 5.3896e-01, 3.5433e-01, 2.0000e-01,
        9.9000e-01, 8.5279e-01, 4.8485e-01], dtype=torch.float64)

In [12]:
# First input row of the DataFrame, for comparison with test[0][0].
input_df.head(1)

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp
2561,10594962,24340966,51621916,0.538961,0.354331,0.2,0.99,0.852792,0.484848


In [14]:
def get_tabular_df(path):
    """
    Generates the tabular dataframes for one JSON split.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A two-element list ``[input_df, output_df]``. ``input_df`` holds the
        identifier and vital-sign columns (``patient_id`` and ``study_id`` cast
        to int); ``output_df`` holds the six label columns, all cast to int.
        No scaling is applied here.
    """
    # JSON interchange files are UTF-8; do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    input_cols = [
        "patient_id",
        "visit_id",
        "study_id",
        "temperature",
        "heartrate",
        "resprate",
        "o2sat",
        "sbp",
        "dbp",
    ]
    output_cols = [
        "atelectasis",
        "cardiomegaly",
        "edema",
        "lung_opacity",
        "pleural_effusion",
        "pneumonia",
    ]
    # Both frames are cut from the same parsed JSON, selecting columns by name.
    input_df = pd.DataFrame(data=data, columns=input_cols).astype(
        {"patient_id": int, "study_id": int}
    )
    output_df = pd.DataFrame(data=data, columns=output_cols).astype(
        dict.fromkeys(output_cols, int)
    )
    return [input_df, output_df]

In [45]:
def generate_tabular(path, name, all_inputs, output_dir="../data/s3/"):
    """
    Generates the normalized tabular CSV file for one JSON split.

    Args:
        path: Path of the JSON file to read.
        name: File name of the CSV to write inside ``output_dir``.
        all_inputs: DataFrame whose per-column min/max define the scaling range
            for temperature, heartrate, resprate, sbp and dbp.
        output_dir: Directory the CSV is written to. Defaults to the location
            the notebook has always used.

    Returns:
        None. The frame (identifier, vital-sign and label columns) is written
        to ``output_dir/name`` without the index, and a confirmation line is
        printed.

    NOTE(review): the notebook passes the concatenation of every split as
    ``all_inputs``, so test/validation statistics take part in the scaling
    range -- confirm this is intended.
    """
    # JSON interchange files are UTF-8; do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    id_cols = ["patient_id", "visit_id", "study_id"]
    vital_cols = ["temperature", "heartrate", "resprate", "o2sat", "sbp", "dbp"]
    label_cols = [
        "atelectasis",
        "cardiomegaly",
        "edema",
        "lung_opacity",
        "pleural_effusion",
        "pneumonia",
    ]
    tabular_df = pd.DataFrame(data=data, columns=id_cols + vital_cols + label_cols)
    tabular_df = tabular_df.astype(
        {"patient_id": int, "study_id": int, **dict.fromkeys(label_cols, int)}
    )

    # normalize: min-max scale against the range observed in all_inputs
    for col_name in ("temperature", "heartrate", "resprate", "sbp", "dbp"):
        xmin = all_inputs[col_name].min()
        xmax = all_inputs[col_name].max()
        span = xmax - xmin
        centered = tabular_df[col_name] - xmin
        # A zero range would divide by zero (NaN/inf); leave such columns centered.
        tabular_df[col_name] = centered / span if span != 0 else centered
    # o2sat is multiplied by 0.01 instead of being min-max scaled.
    tabular_df["o2sat"] = tabular_df["o2sat"] * 0.01

    tabular_df.to_csv(os.path.join(output_dir, name), index=False)
    print("generated " + name)
In [47]:
# Directory holding the JSON splits, and [output CSV name, input JSON name] pairs.
root = "../data/s3/mimic_iv__multiclass_multioutput__json_files/"
paths = [
    ["tabular_test.csv", "test_set__chexpert.json"],
    ["tabular_train.csv", "train_set__chexpert.json"],
    ["tabular_valid.csv", "validation_set__chexpert.json"],
]

# First pass: gather the unscaled input frame of every split.
inputs = []
outputs = []
for name, path in paths:
    input_df, output_df = get_tabular_df(f"{root}{path}")
    inputs.append(input_df)

# Stack all splits so the scaling range covers every row.
all_inputs = pd.concat(inputs, axis=0)

# Second pass: write one normalized CSV per split.
for name, path in paths:
    generate_tabular(f"{root}{path}", name, all_inputs)

generated tabular_test.csv
generated tabular_train.csv
generated tabular_valid.csv


In [24]:
# Show the input column names as an array.
np.array(input_df.columns)

array(['patient_id', 'visit_id', 'study_id', 'temperature', 'heartrate',
       'resprate', 'o2sat', 'sbp', 'dbp'], dtype=object)

In [25]:
# Show the label column names as an array.
np.array(output_df.columns)

array(['atelectasis', 'cardiomegaly', 'edema', 'lung_opacity',
       'pleural_effusion', 'pneumonia'], dtype=object)

In [34]:
# Inspect the stored label names.
# NOTE(review): in the recorded output every name after the first carries a
# leading space, so the file appears to be ", "-separated; strip the names
# before comparing them with the DataFrame column names.
np.loadtxt("../data/s3/tensors/tabular_label_names.txt", delimiter=",", dtype="str")

array(['atelectasis', ' cardiomegaly', ' edema', ' lung_opacity',
       ' pleural_effusion', ' pneumonia'], dtype='<U17')

In [2]:
from types import SimpleNamespace

import mimic

In [3]:
# Minimal config as a JSON string; the object_hook turns each JSON object into a
# SimpleNamespace so keys are read as attributes (x.data_mode, x.batch_size).
config_test = '{"data_mode": "tabular_numpy_train", "batch_size": 32}'
x = json.loads(config_test, object_hook=lambda d: SimpleNamespace(**d))
# MimicDataLoader comes from the project-local `mimic` module; presumably it
# builds the loaders selected by data_mode -- TODO confirm against mimic.py.
dataloaders = mimic.MimicDataLoader(x)

In [5]:
# Pull a single batch from the training loader to inspect its structure.
next(iter(dataloaders.train_loader))

{'features': tensor([[6.8757e-01, 7.3171e-01, 1.0106e-01, 9.8000e-01, 2.1016e-01, 7.6193e-04],
         [6.5998e-01, 3.2927e-01, 9.0426e-02, 9.5000e-01, 1.9615e-01, 9.8917e-04],
         [6.6402e-01, 5.1220e-01, 2.0745e-01, 9.8000e-01, 1.9965e-01, 1.0293e-03],
         [6.6604e-01, 6.0976e-01, 9.0426e-02, 1.0000e+00, 1.8739e-01, 9.3570e-04],
         [6.6200e-01, 3.4756e-01, 7.9787e-02, 9.9000e-01, 1.9440e-01, 6.2826e-04],
         [6.5595e-01, 3.6585e-01, 7.9787e-02, 1.0000e+00, 2.6445e-01, 1.0827e-03],
         [6.5864e-01, 4.3293e-01, 7.9787e-02, 9.8000e-01, 1.9440e-01, 1.0159e-03],
         [6.6133e-01, 5.2439e-01, 7.9787e-02, 1.0000e+00, 1.9615e-01, 9.3570e-04],
         [6.5797e-01, 3.4146e-01, 9.0426e-02, 9.9000e-01, 2.6270e-01, 1.0159e-03],
         [6.7142e-01, 4.5732e-01, 7.9787e-02, 9.9000e-01, 1.5937e-01, 8.1540e-04],
         [6.6671e-01, 5.0610e-01, 1.0106e-01, 1.0000e+00, 2.3468e-01, 1.1228e-03],
         [6.6469e-01, 3.2927e-01, 1.0106e-01, 1.0000e+00, 2.3993e-01, 1.363