In [1]:
import json

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset

In [2]:
# Load the sample test-set records. `data` is consumed by the DataFrame
# construction further down. The previous `np.asarray(data)` call was removed:
# its result was never bound or displayed, so it did nothing.
with open(
    "../data/s3/mimic_iv__multiclass_multioutput__json_files/sample__test_set__chexpert.json"
) as f:
    data = json.load(f)

In [3]:
# Columns that make up the model input: three identifiers followed by six
# measurements.
input_cols = [
    "patient_id",
    "visit_id",
    "study_id",
    "temperature",
    "heartrate",
    "resprate",
    "o2sat",
    "sbp",
    "dbp",
]
# Columns that make up the multi-label target.
output_cols = [
    "atelectasis",
    "cardiomegaly",
    "edema",
    "lung_opacity",
    "pleural_effusion",
    "pneumonia",
]

# Select the input columns from the loaded records and force the patient and
# study identifiers to integers.
input_df = pd.DataFrame(data=data, columns=input_cols).astype(
    {"patient_id": int, "study_id": int}
)

# Select the label columns and cast every one of them to integer.
output_df = pd.DataFrame(data=data, columns=output_cols).astype(
    {label: int for label in output_cols}
)

In [4]:
def scale_min_max(df, col_name):
    """
    Min-max scale one column of a dataframe to the [0, 1] range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to scale. It is not modified.
    col_name : str
        Name of the column to scale; min and max are taken from this column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``col_name`` is ``(x - min) / (max - min)``.
        If the column is constant (max == min) it is mapped to 0 instead of
        NaN/inf.
    """
    xmin = df[col_name].min()
    xmax = df[col_name].max()
    span = xmax - xmin
    if span == 0:
        # Constant column: dividing by zero would fill it with NaN. Divide by
        # one instead, so every value becomes (x - min) == 0.
        span = 1
    # Work on a copy so re-running a cell never re-scales already scaled data
    # held by the caller.
    scaled = df.copy()
    scaled[col_name] = (df[col_name] - xmin) / span
    return scaled

In [5]:
# Min-max scale each of these columns in turn, using the column's own
# minimum and maximum.
for column in ("temperature", "heartrate", "resprate", "sbp", "dbp"):
    input_df = scale_min_max(input_df, column)

# o2sat is not min-max scaled; it is multiplied by a fixed factor instead
# (presumably it is stored as a percentage — confirm against the data).
input_df["o2sat"] = input_df["o2sat"] * 0.01

In [6]:
# Turn the whole input frame into one tensor. All input_cols are included, so
# the three identifier columns sit next to the scaled measurements.
input_tensor = torch.tensor(input_df.values)

In [7]:
# Label tensor: one row per record, one column per entry of output_cols.
output_tensor = torch.tensor(output_df.values)

In [8]:
# Pair inputs with labels; test[i] yields the tuple (input_row, label_row).
test = TensorDataset(input_tensor, output_tensor)

In [9]:
# Label row of the first sample; it should match the first row of output_df.
test[0][1]

tensor([1, 0, 0, 0, 0, 1])

In [10]:
# First row of the label frame, for comparison with test[0][1].
output_df.head(1)

Unnamed: 0,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia
2561,1,0,0,0,0,1


In [11]:
# Input row of the first sample; it should match the first row of input_df.
test[0][0]

tensor([1.0595e+07, 2.4341e+07, 5.1622e+07, 5.3896e-01, 3.5433e-01, 2.0000e-01,
        9.9000e-01, 8.5279e-01, 4.8485e-01], dtype=torch.float64)

In [12]:
# First row of the (scaled) input frame, for comparison with test[0][0].
input_df.head(1)

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp
2561,10594962,24340966,51621916,0.538961,0.354331,0.2,0.99,0.852792,0.484848


In [16]:
def get_tabular_df(path):
    """
    Generates the tabular dataframes

    Reads the JSON file at ``path`` and splits its records into two frames:
    one with the input columns (identifiers and measurements) and one with
    the label columns. ``patient_id`` and ``study_id`` are cast to int, as is
    every label column. No scaling is applied.

    Returns a two-element list ``[input_df, output_df]``.
    """
    feature_names = [
        "patient_id",
        "visit_id",
        "study_id",
        "temperature",
        "heartrate",
        "resprate",
        "o2sat",
        "sbp",
        "dbp",
    ]
    label_names = [
        "atelectasis",
        "cardiomegaly",
        "edema",
        "lung_opacity",
        "pleural_effusion",
        "pneumonia",
    ]

    with open(path) as handle:
        records = json.load(handle)

    # Column selection happens in the constructor; the casts are chained on.
    features = pd.DataFrame(data=records, columns=feature_names).astype(
        {"patient_id": int, "study_id": int}
    )
    labels = pd.DataFrame(data=records, columns=label_names).astype(
        {label: int for label in label_names}
    )
    return [features, labels]

In [17]:
def generate_tabular(path, name, all_inputs, out_dir="../data/s3/tensors/"):
    """
    Generates the tabular tensor files for loading.

    Reads the JSON records at ``path``, builds the input and label frames,
    min-max scales ``temperature``, ``heartrate``, ``resprate``, ``sbp`` and
    ``dbp`` with the minimum/maximum found in ``all_inputs``, multiplies
    ``o2sat`` by 0.01, and saves ``TensorDataset(inputs, labels)`` with
    ``torch.save``.

    Parameters
    ----------
    path : str
        JSON file holding the records of one split.
    name : str
        File name of the saved dataset inside ``out_dir``.
    all_inputs : pandas.DataFrame
        Frame supplying the per-column minimum and maximum used for scaling.
        It must contain the five scaled columns.
    out_dir : str, optional
        Directory the dataset is written to. Defaults to the location that
        was previously hard-coded.

    Notes
    -----
    * The input tensor holds every input column, so its first three columns
      are the unscaled ``patient_id``, ``visit_id`` and ``study_id``.
    * A column that is constant in ``all_inputs`` is mapped to ``x - min``
      (0 for values equal to that constant) instead of dividing by zero.
    * NOTE(review): the scaling statistics come from whatever the caller
      passes as ``all_inputs``; if that includes validation/test rows, their
      statistics influence the training features — confirm this is intended.
    """
    # Function-scope import so the path join works without touching the
    # notebook's import cell.
    import os

    input_cols = [
        "patient_id",
        "visit_id",
        "study_id",
        "temperature",
        "heartrate",
        "resprate",
        "o2sat",
        "sbp",
        "dbp",
    ]
    output_cols = [
        "atelectasis",
        "cardiomegaly",
        "edema",
        "lung_opacity",
        "pleural_effusion",
        "pneumonia",
    ]
    scaled_cols = ["temperature", "heartrate", "resprate", "sbp", "dbp"]

    with open(path) as f:
        data = json.load(f)

    input_df = pd.DataFrame(data=data, columns=input_cols).astype(
        {"patient_id": int, "study_id": int}
    )
    output_df = pd.DataFrame(data=data, columns=output_cols).astype(
        {col: int for col in output_cols}
    )

    # normalize
    for col in scaled_cols:
        xmin = all_inputs[col].min()
        xmax = all_inputs[col].max()
        span = xmax - xmin
        if span == 0:
            # Constant reference column: avoid 0/0 -> NaN in the saved tensor.
            span = 1
        input_df[col] = (input_df[col] - xmin) / span
    input_df["o2sat"] = input_df["o2sat"] * 0.01

    # generate tensors
    input_tensor = torch.tensor(input_df.values)
    output_tensor = torch.tensor(output_df.values)

    ds = TensorDataset(input_tensor, output_tensor)
    torch.save(ds, os.path.join(out_dir, name))
    print("generated " + name)

In [19]:
# Directory holding the per-split JSON files.
root = "../data/s3/mimic_iv__multiclass_multioutput__json_files/"
# Each entry is [output tensor file name, input JSON file name].
paths = [
    ["tabular_test.pt", "test_set__chexpert.json"],
    ["tabular_train.pt", "train_set__chexpert.json"],
    ["tabular_valid.pt", "validation_set__chexpert.json"],
]
inputs = []
# NOTE(review): `outputs` is never appended to in this cell.
outputs = []
# First pass: load every split only to collect its input frame.
# input_df / output_df are rebound on each iteration, so after the loop they
# hold the last split in `paths`; `name` is unused in this pass.
for [name, path] in paths:
    [input_df, output_df] = get_tabular_df(root + path)
    inputs.append(input_df)

# Stack the input frames of all splits; generate_tabular takes its min/max
# scaling statistics from this frame.
# NOTE(review): this means test and validation rows contribute to the scaling
# applied to the training split — confirm that this is intended.
all_inputs = pd.concat(inputs, axis=0)

# Second pass: re-read each split, scale it and save it as a tensor file.
for [name, path] in paths:
    generate_tabular(root + path, name, all_inputs)

generated tabular_test.pt
generated tabular_train.pt
generated tabular_valid.pt


In [24]:
# Column order of the input frame; generate_tabular builds its input tensor
# from the same column list.
np.array(input_df.columns)

array(['patient_id', 'visit_id', 'study_id', 'temperature', 'heartrate',
       'resprate', 'o2sat', 'sbp', 'dbp'], dtype=object)

In [25]:
# Column order of the label frame; generate_tabular builds its label tensor
# from the same column list.
np.array(output_df.columns)

array(['atelectasis', 'cardiomegaly', 'edema', 'lung_opacity',
       'pleural_effusion', 'pneumonia'], dtype=object)

In [34]:
# Read the comma-separated label names back as strings.
# NOTE: splitting on "," keeps the space that follows each comma, so every
# name after the first comes back with a leading blank (' cardiomegaly', ...).
# Strip the names before comparing them with the dataframe column names.
np.loadtxt("../data/s3/tensors/tabular_label_names.txt", delimiter=",", dtype="str")

array(['atelectasis', ' cardiomegaly', ' edema', ' lung_opacity',
       ' pleural_effusion', ' pneumonia'], dtype='<U17')

In [2]:
from types import SimpleNamespace

import mimic

In [3]:
# Minimal configuration as a JSON string.
config_test = '{"data_mode": "tabular_numpy_train", "batch_size": 32}'
# The object_hook turns each decoded JSON object into a SimpleNamespace, so
# the keys are read as attributes (x.data_mode, x.batch_size).
x = json.loads(config_test, object_hook=lambda d: SimpleNamespace(**d))
# `mimic` is a project module not shown here; presumably the loader reads the
# attributes set above — verify against mimic.MimicDataLoader.
dataloaders = mimic.MimicDataLoader(x)

In [20]:
# Load the training split written by generate_tabular (a pickled TensorDataset).
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which may refuse a pickled TensorDataset — confirm, and
# only load files from a trusted source.
train = torch.load('../data/s3/tensors/tabular_train.pt')

In [21]:
# Load the validation split written by generate_tabular (a pickled TensorDataset).
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which may refuse a pickled TensorDataset — confirm, and
# only load files from a trusted source.
valid = torch.load('../data/s3/tensors/tabular_valid.pt')

In [22]:
# Number of samples in the loaded training dataset.
len(train)

11272

In [23]:
# Number of samples in the loaded validation dataset.
len(valid)

2818

In [26]:
# First ten label rows.
# NOTE(review): output_df is whatever an earlier cell last bound, and the
# execution counts here are out of order, so the split shown depends on the
# order the cells were run in.
output_df.head(10)

Unnamed: 0,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia
0,0,1,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,1,0,0
6,0,0,0,1,0,1
7,0,0,0,1,0,0
8,1,0,0,1,0,0
9,1,0,0,0,0,0


In [30]:
# NOTE: corrwith pairs columns by label. input_df and output_df have no column
# name in common, so every entry of the result is NaN (see the output below).
# A feature-vs-label correlation table needs a different approach, e.g.
# pd.concat([input_df, output_df], axis=1).corr() restricted to the input rows
# and the label columns.
input_df.corrwith(output_df, axis=0)

atelectasis        NaN
cardiomegaly       NaN
dbp                NaN
edema              NaN
heartrate          NaN
lung_opacity       NaN
o2sat              NaN
patient_id         NaN
pleural_effusion   NaN
pneumonia          NaN
resprate           NaN
sbp                NaN
study_id           NaN
temperature        NaN
visit_id           NaN
dtype: float64

In [28]:
# First input row; the values shown are unscaled, so this input_df comes
# straight from get_tabular_df rather than from the scaling cells.
input_df.head(1)

Unnamed: 0,patient_id,visit_id,study_id,temperature,heartrate,resprate,o2sat,sbp,dbp
0,19850525,24462634,59616328,97.0,73.0,22.0,97.0,107.0,47.0


In [29]:
# Label row belonging to the input row displayed above.
output_df.head(1)

Unnamed: 0,atelectasis,cardiomegaly,edema,lung_opacity,pleural_effusion,pneumonia
0,0,1,0,0,0,0


In [32]:
# Rebind input_df / output_df to the training split. get_tabular_df applies no
# scaling, so these frames hold the raw values.
[input_df, output_df] = get_tabular_df(root + "train_set__chexpert.json")

In [33]:
# Column sums of the training labels. These equal the number of positive cases
# per label provided the labels are strictly 0/1 — TODO confirm no other codes
# occur.
output_df.sum()

atelectasis         1451
cardiomegaly        1134
edema                927
lung_opacity        2190
pleural_effusion    1380
pneumonia            735
dtype: int64

In [34]:
# Number of rows in the training label frame.
len(output_df)

11272

In [35]:
# Column sum divided by row count: the share of rows flagged for each label
# (again assuming 0/1 labels).
output_df.sum()/len(output_df)

atelectasis         0.128726
cardiomegaly        0.100603
edema               0.082239
lung_opacity        0.194287
pleural_effusion    0.122427
pneumonia           0.065206
dtype: float64