In [79]:
import pandas as pd
import os
import sys
import pickle
import json

from tqdm import tqdm

sys.path.append('./MultimodalMIMIC')
from data import data_perpare

sys.path.append('./mimic3-benchmarks')
from mimic3benchmark.readers import InHospitalMortalityReader
from mimic3models.preprocessing import Normalizer
from MultimodalMIMIC.preprocessing import Discretizer_multi
from mimic3models import common_utils
from text_utils import *

# Mimic 3 benchmark

In [80]:
listfile = pd.read_csv('./mimic3-benchmarks/data/in-hospital-mortality/val_listfile.csv')

In [81]:
listfile.head()

Unnamed: 0,stay,y_true
0,49750_episode1_timeseries.csv,0
1,23516_episode1_timeseries.csv,0
2,18152_episode1_timeseries.csv,0
3,3808_episode1_timeseries.csv,0
4,65442_episode1_timeseries.csv,1


In [82]:
listfile['y_true'].value_counts()

0    2200
1     353
Name: y_true, dtype: int64

In [83]:
train_folder = os.listdir('./mimic3-benchmarks/data/in-hospital-mortality/train')

In [84]:
sample, label = listfile.loc[0,'stay'], listfile.loc[0,'y_true']

train_folder.index(sample)

10592

In [85]:
lengths = []
hours = []
for file in tqdm(train_folder):
    if 'episode' not in file:
        continue
    temp = pd.read_csv(f'./mimic3-benchmarks/data/in-hospital-mortality/train/{file}')
    hours.append(temp['Hours'].max())
    temp = temp.drop(columns=['Hours'])
    lengths.append(temp.count(axis=1).sum())

100%|██████████| 17904/17904 [01:06<00:00, 267.78it/s]


In [86]:
pd.Series(lengths).describe()

count    17903.000000
mean       446.641792
std        470.382317
min          1.000000
25%        364.000000
50%        414.000000
75%        489.000000
max      19180.000000
dtype: float64

In [87]:
len([x for x in lengths if x>500])

3971

In [88]:
pd.Series(hours).describe()

count    17903.000000
mean        47.342311
std          1.432299
min          2.295556
25%         47.245139
50%         47.544167
75%         47.793194
max         48.000000
dtype: float64

In [89]:
sample_csv = pd.read_csv(f'./mimic3-benchmarks/data/in-hospital-mortality/train/{sample}')

In [90]:
sample_csv.head()

Unnamed: 0,Hours,Capillary refill rate,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,0.098611,,,,,,,,,,,,,,,,,7.37
1,1.098611,,,,,,,,,,,,,,,,,7.37
2,1.315278,,91.0,,,,,,,,,107.0,,,154.0,,,
3,1.331944,,,,,,,,,107.0,,,,32.0,,,,
4,1.348611,,,,,,,,,,,,99.0,,,,,


In [91]:
sample_csv.loc[2,"Hours"]

1.3152777777777778

In [92]:
temp = sample_csv.melt(var_name='features', value_name='value', id_vars=['Hours'])
temp[temp['Hours']==1.3152777777777778]

Unnamed: 0,Hours,features,value
2,1.315278,Capillary refill rate,
83,1.315278,Diastolic blood pressure,91.0
164,1.315278,Fraction inspired oxygen,
245,1.315278,Glascow coma scale eye opening,
326,1.315278,Glascow coma scale motor response,
407,1.315278,Glascow coma scale total,
488,1.315278,Glascow coma scale verbal response,
569,1.315278,Glucose,
650,1.315278,Heart Rate,
731,1.315278,Height,


In [93]:
len(temp)

1377

In [94]:
temp.head(n=1400)

Unnamed: 0,Hours,features,value
0,0.098611,Capillary refill rate,
1,1.098611,Capillary refill rate,
2,1.315278,Capillary refill rate,
3,1.331944,Capillary refill rate,
4,1.348611,Capillary refill rate,
...,...,...,...
1372,44.231944,pH,
1373,44.881944,pH,
1374,45.881944,pH,
1375,46.881944,pH,


In [95]:
times_inp = np.zeros((5, 100), dtype='float32')
times_inp[0,:100] = temp['Hours'].values[:100]

In [96]:
temp

Unnamed: 0,Hours,features,value
0,0.098611,Capillary refill rate,
1,1.098611,Capillary refill rate,
2,1.315278,Capillary refill rate,
3,1.331944,Capillary refill rate,
4,1.348611,Capillary refill rate,
...,...,...,...
1372,44.231944,pH,
1373,44.881944,pH,
1374,45.881944,pH,
1375,46.881944,pH,


In [97]:
times_inp

array([[ 0.09861111,  1.0986111 ,  1.3152778 ,  1.3319445 ,  1.3486111 ,
         1.3819444 ,  1.6152778 ,  1.8819444 ,  2.1319444 ,  2.148611  ,
         2.8819444 ,  3.8819444 ,  3.948611  ,  4.8819447 ,  5.8819447 ,
         6.8819447 ,  7.0652776 ,  7.8819447 ,  8.881945  ,  8.965278  ,
         8.981944  ,  8.998611  ,  9.148611  ,  9.1652775 ,  9.881945  ,
        10.1652775 , 10.281944  , 10.298611  , 10.881945  , 11.881945  ,
        12.281944  , 12.881945  , 13.881945  , 14.881945  , 15.881945  ,
        16.881945  , 17.581944  , 17.881945  , 18.581944  , 18.881945  ,
        19.081944  , 19.881945  , 19.898611  , 20.081944  , 20.881945  ,
        20.898611  , 21.881945  , 22.881945  , 23.881945  , 24.881945  ,
        25.881945  , 26.881945  , 27.881945  , 28.881945  , 29.881945  ,
        29.981945  , 30.881945  , 31.881945  , 32.881943  , 33.881943  ,
        34.881943  , 35.881943  , 36.881943  , 37.181946  , 37.881943  ,
        38.015278  , 38.881943  , 39.881943  , 40.8

In [98]:
temp[temp['value']=='Spontaneously']

Unnamed: 0,Hours,features,value
248,1.381944,Glascow coma scale eye opening,Spontaneously
256,4.881944,Glascow coma scale eye opening,Spontaneously
261,8.881944,Glascow coma scale eye opening,Spontaneously
274,12.881944,Glascow coma scale eye opening,Spontaneously
278,16.881944,Glascow coma scale eye opening,Spontaneously
292,24.881944,Glascow coma scale eye opening,Spontaneously
296,28.881944,Glascow coma scale eye opening,Spontaneously
301,32.881944,Glascow coma scale eye opening,Spontaneously
306,37.181944,Glascow coma scale eye opening,Spontaneously
313,40.981944,Glascow coma scale eye opening,Spontaneously


In [99]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1377 entries, 0 to 1376
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Hours     1377 non-null   float64
 1   features  1377 non-null   object 
 2   value     376 non-null    object 
dtypes: float64(1), object(2)
memory usage: 32.4+ KB


In [100]:
data = './mimic3-benchmarks/data/in-hospital-mortality'
period_length = 48
timestep = 1.0
imputation = 'previous'
__file__ = './MultimodalMIMIC/preprocessing.py'

In [101]:
val_reader = InHospitalMortalityReader(dataset_dir=os.path.join(data, 'train'),
                                            listfile=os.path.join(data, 'val_listfile.csv'),
                                            period_length=period_length)

In [103]:
discretizer = Discretizer_multi(timestep=float(timestep),
                      store_masks=True,
                      impute_strategy='previous',
                      start_time='zero')

discretizer_header = discretizer.transform(val_reader.read_example(0)["X"])[1].split(',')
cont_channels = [i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1]

normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
normalizer_state = '../mimic3-benchmarks/mimic3models/in_hospital_mortality/ihm_ts{}.input_str-{}.start_time-zero.normalizer'.format(timestep, imputation)
normalizer_state = os.path.join(os.path.dirname(__file__), normalizer_state)
normalizer.load_params(normalizer_state)

In [104]:
len(cont_channels)

34

In [105]:
normalizer_state

'./MultimodalMIMIC/../mimic3-benchmarks/mimic3models/in_hospital_mortality/ihm_ts1.0.input_str-previous.start_time-zero.normalizer'

In [106]:
with open(normalizer_state, "rb") as load_file:
    dct = pickle.load(load_file, encoding='latin1')
dct

{'stds': array([4.99566658e-02, 4.99566658e-02, 2.11109641e+02, 1.69005308e-01,
        1.50437819e-01, 2.94692386e-01, 2.67486530e-01, 4.89201304e-01,
        2.17034645e-01, 2.65211343e-01, 4.30612573e-01, 1.83697659e-01,
        1.99881624e-01, 5.52533886e-02, 4.25379791e-02, 1.65512893e-01,
        1.80878450e-01, 2.03746876e-01, 1.53151436e-01, 4.55889931e-01,
        4.49861250e-02, 4.97725756e-01, 2.59191524e-01, 4.97165089e-02,
        1.89678173e-01, 2.21205475e-01, 1.18416298e-01, 7.55246671e-02,
        4.52650397e-01, 1.99045650e-01, 1.88674144e-01, 5.85038029e-02,
        6.30786672e-02, 1.55073203e-01, 1.36098001e-01, 1.51603537e-01,
        1.58810110e-01, 9.89076872e-02, 1.14655232e-01, 1.88075696e-01,
        5.06285454e-02, 4.06564187e-01, 3.31297505e-01, 4.70909214e-01,
        8.42582868e-02, 4.05775833e-01, 2.03187802e-01, 9.70308927e-02,
        5.85278202e-02, 5.71666298e+01, 1.94063061e+01, 5.99642270e+00,
        1.75581014e+02, 1.16712100e+03, 6.65640647e+00, 

In [107]:
val_reader.read_example(0).keys()

dict_keys(['X', 't', 'y', 'header', 'name'])

In [108]:
len(val_reader.read_example(0)['header'])

18

In [109]:
val_reader.read_example(0)['X'].shape

(81, 18)

In [110]:
val_reader.read_example(0)['t']

48

In [112]:
N = val_reader.get_number_of_examples()
ret = common_utils.read_chunk(val_reader, N)
irg_data = ret["X"]
ts = ret["t"]
labels = ret["y"]
names = ret["name"]
# Saved to `outputdir+"ts_"+mode+".pkl"`
reg_data = [discretizer.transform(X, end=t)[0] for (X, t) in zip(irg_data, ts)]

In [113]:
channel_info_file = open('./MultimodalMIMIC/Data/irregular/channel_info.json')
dis_config_file=open('./MultimodalMIMIC/Data/irregular/discretizer_config.json')
channel_info = json.load(channel_info_file)
dis_config=json.load(dis_config_file)
# List of variables
channel_name=dis_config['id_to_channel']
# Dict to check if is categorical
is_catg=dis_config['is_categorical_channel']

In [119]:
data_irregular=[]

for p_id, x, in enumerate(irg_data):
    print(x)
    data_i={}
    tt=[]
    features_list=[]
    features_mask_list=[]
    for t_idx, feature in enumerate(x):
        f_list_per_t=[]
        f_mask_per_t=[]
        for f_idx, val in enumerate(feature):
            if f_idx==0:
                tt.append(round(float(val),2))
            else:
                head=channel_name[f_idx-1]
                if val=='':
                    f_list_per_t.append(0)
                    f_mask_per_t.append(0)
                else:
                    f_mask_per_t.append(1)
                    if is_catg[head]:
                        val=channel_info[head]['values'][val]
                    f_list_per_t.append(float(round(float(val),2)))
        assert len(f_list_per_t)==len(f_mask_per_t)
        features_list.append(f_list_per_t)
        features_mask_list.append(f_mask_per_t)
    assert len(features_list)==len(features_mask_list)==len(tt)
    data_i['reg_ts']=reg_data[p_id]
    data_i['name']=names[p_id]
    data_i['label']=labels[p_id]
    data_i['ts_tt']=tt
    data_i['irg_ts']=np.array(features_list)
    data_i['irg_ts_mask']=np.array(features_mask_list)
    data_irregular.append(data_i)
    break
# with open(dataPath_out, 'wb') as f:
#     pickle.dump(data_irregular, f)

# channel_info_file.close()
# dis_config_file.close()

[['0.09861111111111112' '' '' ... '' '' '7.37']
 ['1.0986111111111112' '' '' ... '' '' '7.37']
 ['1.3152777777777778' '' '91.0' ... '' '' '']
 ...
 ['45.88194444444444' '' '86.0' ... '' '' '']
 ['46.88194444444444' '' '83.0' ... '' '' '']
 ['47.88194444444444' '' '78.0' ... '' '' '']]


In [146]:
with open('./MultimodalMIMIC/Data/ihm/'+'ts_train.pkl', 'rb') as f:
    data=pickle.load(f)

irg_f_num=data[0]['irg_ts'].shape[1]
reg_f_num=data[0]['reg_ts'].shape[1]
irg_feature_list=[[] for _ in range(irg_f_num)]
reg_feature_list=[[] for _ in range(reg_f_num)]
for p_id, p_data in enumerate(data):
    irg_ts=p_data['irg_ts']
    irg_ts_mask=p_data['irg_ts_mask']
    reg_ts=p_data['reg_ts']

    for t_idx, (ts, mask) in enumerate(zip(irg_ts,irg_ts_mask)):
        for f_idx, (val, mask_val) in enumerate(zip(ts, mask)):
            # print(f_idx)
            if mask_val==1:
                irg_feature_list[f_idx].append(val)

    for ts in reg_ts:
        for f_idx, (val, mask_val) in enumerate(zip(ts[:reg_f_num//2], ts[reg_f_num//2:])):
            reg_feature_list[f_idx].append(val)

In [None]:
irg_means=[]
irg_stds=[]
reg_means=[]
reg_stds=[]

for irg_vals,reg_vals in zip(irg_feature_list,reg_feature_list):
    irg_means.append(stat.mean(irg_vals))
    irg_stds.append(stat.stdev(irg_vals))
    reg_means.append(stat.mean(reg_vals))
    reg_stds.append(stat.stdev(reg_vals))
with open(dataPath_out, 'wb') as f:
    pickle.dump((irg_means,irg_stds,reg_means,reg_stds), f)

In [130]:
with open('./MultimodalMIMIC/Data/ihm/ts_val.pkl', "rb") as load_file:
    ts_val = pickle.load(load_file)

In [154]:
len(ts_val)

2553

In [55]:
ts_val[0].keys()

dict_keys(['reg_ts', 'name', 'label', 'ts_tt', 'irg_ts', 'irg_ts_mask'])

In [60]:
ts_val[0]['name']

'49750_episode1_timeseries.csv'

In [56]:
ts_val[0]['reg_ts'].shape

(48, 34)

In [95]:
ts_val[0]['reg_ts']

array([[ 0.  , 59.  ,  0.21, ...,  0.  ,  0.  ,  1.  ],
       [ 0.  , 91.  ,  0.21, ...,  1.  ,  1.  ,  1.  ],
       [ 0.  , 87.  ,  0.7 , ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 0.  , 86.  ,  0.5 , ...,  0.  ,  0.  ,  0.  ],
       [ 0.  , 83.  ,  0.5 , ...,  0.  ,  0.  ,  0.  ],
       [ 0.  , 78.  ,  0.5 , ...,  0.  ,  0.  ,  0.  ]])

In [57]:
len(ts_val[0]['ts_tt'])

81

In [62]:
ts_val[0]['irg_ts'].shape

(81, 17)

In [63]:
ts_val[0]['irg_ts']

array([[ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  7.37],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  7.37],
       [ 0.  , 91.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 0.  , 86.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.  , 83.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.  , 78.  ,  0.  , ...,  0.  ,  0.  ,  0.  ]])

In [65]:
ts_val[0]['irg_ts_mask']

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [59]:
ts_val[0]['ts_tt']

[0.1,
 1.1,
 1.32,
 1.33,
 1.35,
 1.38,
 1.62,
 1.88,
 2.13,
 2.15,
 2.88,
 3.88,
 3.95,
 4.88,
 5.88,
 6.88,
 7.07,
 7.88,
 8.88,
 8.97,
 8.98,
 9.0,
 9.15,
 9.17,
 9.88,
 10.17,
 10.28,
 10.3,
 10.88,
 11.88,
 12.28,
 12.88,
 13.88,
 14.88,
 15.88,
 16.88,
 17.58,
 17.88,
 18.58,
 18.88,
 19.08,
 19.88,
 19.9,
 20.08,
 20.88,
 20.9,
 21.88,
 22.88,
 23.88,
 24.88,
 25.88,
 26.88,
 27.88,
 28.88,
 29.88,
 29.98,
 30.88,
 31.88,
 32.88,
 33.88,
 34.88,
 35.88,
 36.88,
 37.18,
 37.88,
 38.02,
 38.88,
 39.88,
 40.88,
 40.9,
 40.98,
 41.88,
 42.88,
 43.23,
 43.88,
 44.1,
 44.23,
 44.88,
 45.88,
 46.88,
 47.88]

In [64]:
ts_val[0]['irg_ts_mask'].shape

(81, 17)

In [151]:
with open('./MultimodalMIMIC/Data/ihm/norm_ts_val.pkl', "rb") as load_file:
    norm_ts_val = pickle.load(load_file)

In [153]:
len(norm_ts_val)

2553

In [157]:
norm_ts_val[0].keys()

dict_keys(['reg_ts', 'name', 'label', 'ts_tt', 'irg_ts', 'irg_ts_mask'])

In [159]:
textdata_fixed = "./mimic3-benchmarks/data/root/text_fixed/"
starttime_path = "./mimic3-benchmarks/data/starttime.pkl"

text_reader = TextReader(textdata_fixed, starttime_path)

In [162]:
names = [norm_ts_val[0]['name']]
data_text, data_times, data_time = text_reader.read_all_text_append_json(names, period_length)

In [163]:
data_text

{'49750_episode1_timeseries.csv': ['4:28 am chest ( portable ap ) clip # reason : eval for infiltrate medical condition : 65 year old woman with respiratory distress reason for this examination : eval for infiltrate final report indication : 65-year-old female with respiratory distress . comparison : . chest , ap : again seen are changes of right upper lobe wedge resection with chain sutures , staples , and superior retraction of the inferior pulmonary ligament . discoid atelectasis in the left upper lobe has improved . there is no focal consolidation . heart size is normal . there are no pleural effusions or pneumothorax . impression : no acute cardiopulmonary process .',
  '1:28 am chest ( portable ap ) clip # reason : r/o acute process , pulmonary edema admitting diagnosis : respiratory distress medical condition : 65 year old woman s/p vats and stent removal today reason for this examination : r/o acute process , pulmonary edema final report history : status post vats , stent remov

In [164]:
data_times

{'49750_episode1_timeseries.csv': ['2143-07-09 04:28:00',
  '2143-07-10 01:28:00',
  '2143-07-11 05:46:00']}

In [177]:
et = np.datetime64('2143-07-09 08:13:00') + np.timedelta64(49, 'h')
t = np.datetime64('2143-07-09 04:28:00')

In [180]:
(et-t).astype('timedelta64[m]')/60


numpy.timedelta64(52,'m')

In [165]:
data_time

{'49750_episode1_timeseries.csv': '2143-07-09 08:13:00'}

In [167]:
text_reader.episodeToStartTime['49750_1']

'2143-07-09 08:13:00'

In [197]:
with open('./MultimodalMIMIC/Data/ihm/valp2x_data.pkl', "rb") as load_file:
    valp2x = pickle.load(load_file)

In [198]:
valp2x[0].keys()

dict_keys(['reg_ts', 'name', 'label', 'ts_tt', 'irg_ts', 'irg_ts_mask', 'text_data', 'text_time_to_end'])

In [209]:
valp2x[0]['irg_ts'].shape

(81, 17)

In [207]:
len(valp2x[0]['ts_tt'])

81

In [199]:
valp2x[0]['name']

'49750_episode1_timeseries.csv'

In [200]:
valp2x[0]['text_data']

['4:28 am chest ( portable ap ) clip # reason : eval for infiltrate medical condition : 65 year old woman with respiratory distress reason for this examination : eval for infiltrate final report indication : 65-year-old female with respiratory distress . comparison : . chest , ap : again seen are changes of right upper lobe wedge resection with chain sutures , staples , and superior retraction of the inferior pulmonary ligament . discoid atelectasis in the left upper lobe has improved . there is no focal consolidation . heart size is normal . there are no pleural effusions or pneumothorax . impression : no acute cardiopulmonary process .',
 '1:28 am chest ( portable ap ) clip # reason : r/o acute process , pulmonary edema admitting diagnosis : respiratory distress medical condition : 65 year old woman s/p vats and stent removal today reason for this examination : r/o acute process , pulmonary edema final report history : status post vats , stent removal . chest ( portable ap ) film dat

In [201]:
valp2x[0]['text_time_to_end']

[52.75, 31.75, 3.45]

# STraTS dataset

In [21]:
import pickle
import numpy as np
import gzip
from tqdm import tqdm

In [75]:
data_dir = './STraTS_torch/mortality_datasets'
with gzip.GzipFile(f'{data_dir}/val_demos.npy.gz', 'r') as f:
    val_demos = np.load(f)
with gzip.GzipFile(f'{data_dir}/val_times.npy.gz', 'r') as f:
    val_times = np.load(f)
with gzip.GzipFile(f'{data_dir}/val_values.npy.gz', 'r') as f:
    val_values = np.load(f)
with gzip.GzipFile(f'{data_dir}/val_varis.npy.gz', 'r') as f:
    val_varis = np.load(f)
with gzip.GzipFile(f'{data_dir}/val_y.npy.gz', 'r') as f:
    val_y = np.load(f)      

In [78]:
val_values[0]

array([-1.54558349e+00, -1.77108788e+00,  7.09460676e-01,  7.09460676e-01,
        4.83956248e-01, -8.69070292e-01,  4.43889469e-01, -3.91112506e-01,
       -2.98334509e-01,  3.29474024e-02,  1.65555462e-01,  3.29474024e-02,
       -1.92557007e-01, -4.18061435e-01, -1.32007909e+00, -3.12411451e+00,
       -2.22209668e+00, -2.89861012e+00, -2.67310572e+00, -1.09457469e+00,
       -1.92557007e-01, -6.43565834e-01, -4.18061435e-01, -8.69070292e-01,
       -1.09457469e+00, -1.92557007e-01,  1.37166941e+00, -6.43565834e-01,
        1.74278140e+00, -6.72075510e-01, -7.62224495e-01, -6.43565834e-01,
       -6.34808004e-01, -5.49969316e-01, -7.40912497e-01, -1.02843976e+00,
       -8.29134583e-01, -8.29134583e-01, -1.02843976e+00, -1.26004124e+00,
       -4.79861647e-01, -1.18322134e+00, -1.20431101e+00, -1.76390672e+00,
        1.55722547e+00, -1.55318165e+00, -6.14952326e-01, -3.83945137e-01,
       -3.22884440e-01, -5.96241236e-01, -6.21918440e-01, -8.14497888e-01,
       -3.77984554e-01, -

0.001856067

In [24]:
val_values.shape

(7144, 880)

In [29]:
val_times[0]

array([21.933332  , 22.933332  , 23.183332  , 23.433332  , 23.683332  ,
       10.716666  ,  1.6833333 ,  5.983333  , 10.716666  , 20.933332  ,
       18.25      , 23.933332  , 19.933332  , 11.933333  , 18.933332  ,
       18.683332  , 18.516666  , 18.266666  , 18.25      , 17.933332  ,
       16.933332  , 15.933333  , 14.933333  , 13.933333  , 12.933333  ,
       10.933333  , 19.25      , 19.266666  , 20.8       ,  2.9333334 ,
       23.433332  ,  9.933333  , 23.25      ,  2.9333334 , 23.25      ,
        2.9333334 ,  1.4333333 ,  1.1833333 ,  0.93333334, 23.25      ,
        2.9333334 , 23.25      ,  2.9333334 , 23.25      , 21.95      ,
        2.9333334 , 17.933332  ,  2.9333334 ,  2.9333334 , 23.933332  ,
       23.433332  , 21.95      , 20.8       , 19.25      , 18.25      ,
       10.716666  ,  5.983333  ,  1.6833333 , 23.933332  , 18.933332  ,
        8.933333  , 19.266666  ,  6.9333334 ,  2.9333334 ,  1.9333333 ,
        1.4333333 ,  1.1833333 ,  0.93333334, 23.25      ,  2.93

In [35]:
np.argmax(val_times,axis=1)

array([ 11,   0,   3, ...,  21, 128,  74])

In [38]:
val_times[0]

array([21.933332  , 22.933332  , 23.183332  , 23.433332  , 23.683332  ,
       10.716666  ,  1.6833333 ,  5.983333  , 10.716666  , 20.933332  ,
       18.25      , 23.933332  , 19.933332  , 11.933333  , 18.933332  ,
       18.683332  , 18.516666  , 18.266666  , 18.25      , 17.933332  ,
       16.933332  , 15.933333  , 14.933333  , 13.933333  , 12.933333  ,
       10.933333  , 19.25      , 19.266666  , 20.8       ,  2.9333334 ,
       23.433332  ,  9.933333  , 23.25      ,  2.9333334 , 23.25      ,
        2.9333334 ,  1.4333333 ,  1.1833333 ,  0.93333334, 23.25      ,
        2.9333334 , 23.25      ,  2.9333334 , 23.25      , 21.95      ,
        2.9333334 , 17.933332  ,  2.9333334 ,  2.9333334 , 23.933332  ,
       23.433332  , 21.95      , 20.8       , 19.25      , 18.25      ,
       10.716666  ,  5.983333  ,  1.6833333 , 23.933332  , 18.933332  ,
        8.933333  , 19.266666  ,  6.9333334 ,  2.9333334 ,  1.9333333 ,
        1.4333333 ,  1.1833333 ,  0.93333334, 23.25      ,  2.93

In [25]:
val_times.shape

(7144, 880)

In [26]:
val_varis.shape

(7144, 880)

In [107]:
val_times[1]

array([23.6      ,  6.2833333, 11.8      , 11.8      ,  6.2833333,
       23.35     , 19.85     , 20.35     , 20.1      , 19.35     ,
       18.35     , 16.8      , 13.35     ,  6.2833333, 14.35     ,
       15.35     , 22.35     , 11.8      , 14.35     ,  4.35     ,
       23.35     ,  6.2833333, 10.35     , 22.35     , 18.35     ,
       17.35     , 16.35     , 15.35     , 13.35     , 12.35     ,
       11.35     , 10.35     ,  9.35     ,  8.35     ,  7.35     ,
        6.35     ,  5.35     , 23.6      ,  9.35     , 13.35     ,
        7.35     , 18.35     , 17.35     , 16.35     , 15.35     ,
       14.35     , 12.35     , 11.35     , 10.35     ,  9.35     ,
        8.35     ,  7.35     ,  6.35     ,  5.35     ,  4.35     ,
        3.35     , 11.8      ,  2.35     , 19.35     , 19.85     ,
       20.1      , 20.35     ,  6.35     ,  5.35     ,  3.35     ,
        0.85     ,  6.2833333,  6.2833333, 23.6      , 11.8      ,
        8.35     ,  6.2833333,  6.2833333, 11.8      ,  6.2833

In [28]:
np.unique(val_varis, return_counts=True)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129],
       dtype=int32),
 array([3500610,    3261,    3357,    3361,    2389,     431,     636,
           3250,   12816,   13687,   20344,    2146,   13741,     278,
            263,    3379,    6359,   10663,    1549,   10699,     826,
          21983,   15733,     801, 

In [134]:
# Read data.
data, oc, train_ind, valid_ind, test_ind = pickle.load(open('./STraTS_torch/mimic_iii_preprocessed.pkl', 'rb'))

In [135]:
len(data)

78440995

In [136]:
len(oc)

52871

In [189]:
data.head()

Unnamed: 0,ts_ind,hour,variable,value,TABLE,mean,std
0,0,0.0,Age,66.0,,74.805905,54.748198
1,0,0.0,Gender,1.0,,0.436572,0.495965
2,0,0.033333,DBP,-0.56126,chart,60.220559,14.646605
3,0,0.033333,GCS_eye,0.676416,chart,3.282005,1.061469
4,0,0.033333,GCS_motor,0.516602,chart,5.271424,1.410323


In [214]:
data.groupby(by=['ts_ind'])['hour'].count().describe()

count     52871.000000
mean       1483.629873
std        3253.351478
min           3.000000
25%         370.000000
50%         673.000000
75%        1395.000000
max      197371.000000
Name: hour, dtype: float64

In [192]:
data[data['ts_ind']==27706]

Unnamed: 0,ts_ind,hour,variable,value,TABLE,mean,std
46538480,27706,0.000000,Age,65.000000,,74.805905,54.748198
46538481,27706,0.000000,Gender,1.000000,,0.436572,0.495965
46538482,27706,0.083333,Base Excess,-0.007954,lab,0.040568,5.100587
46538483,27706,0.083333,Lactate,0.160851,lab,2.661545,2.725842
46538484,27706,0.083333,PCO2,0.072777,lab,42.215574,10.778417
...,...,...,...,...,...,...,...
46539758,27706,109.866667,HR,-1.800516,chart,87.010048,18.333656
46539759,27706,109.866667,MBP,0.575351,chart,79.181704,17.064872
46539760,27706,109.866667,O2 Saturation,0.258452,chart,96.853895,4.434503
46539761,27706,109.866667,RR,0.060451,chart,19.611439,6.427721


In [190]:
data.groupby(by=['ts_ind'])['hour'].count()

ts_ind
0        1262
1         774
2         722
3         595
4         477
         ... 
52866     120
52867     135
52868     189
52869     223
52870     211
Name: hour, Length: 52871, dtype: int64

In [150]:
data.groupby(by=['variable'])['value'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ALP,70669.0,-8.043614e-18,1.000000,-0.880152,-0.487400,-0.305254,0.076114,21.495328
ALT,72082.0,8.674532e-18,1.000000,-0.313620,-0.286748,-0.259875,-0.169020,13.666560
AST,72069.0,-1.261978e-17,1.000000,-0.276758,-0.251438,-0.227864,-0.158889,18.896536
Age,52871.0,7.480590e+01,54.748198,18.000000,53.000000,66.000000,78.000000,311.000000
Albumin,48509.0,-3.093583e-16,1.000000,-2.904170,-0.760930,0.004513,0.616867,6.281144
...,...,...,...,...,...,...,...,...
Vasopressin,123050.0,8.113064e-17,1.000000,-0.759247,-0.365571,0.027001,0.027001,128.971756
WBC,287957.0,9.899733e-17,1.000000,-1.358214,-0.496552,-0.154096,0.276735,72.170319
Weight,1907928.0,-1.558784e-16,1.000000,-3.314522,-0.647189,-0.164005,0.495954,8.470455
pH Blood,437719.0,-1.210321e-15,1.000000,-89.514052,-0.536327,0.069789,0.675904,7.343172


In [191]:
oc[oc['SUBJECT_ID']==49750]

Unnamed: 0,ts_ind,HADM_ID,SUBJECT_ID,in_hospital_mortality
32437,27706,177711,49750,0


In [138]:
oc.head()

Unnamed: 0,ts_ind,HADM_ID,SUBJECT_ID,in_hospital_mortality
0,0,110404,268,1
1,1,106296,269,0
2,2,188028,270,0
3,3,173727,271,0
4,4,164716,272,0


In [139]:
# data = data[data['ts_ind']<1000]
# oc = oc[oc['ts_ind']<1000]

In [115]:
# Filter labeled data in first 24h.
data = data.loc[data.ts_ind.isin(np.concatenate((train_ind, valid_ind, test_ind), axis=-1))]
data = data.loc[(data.hour>=0)&(data.hour<=24)]
oc = oc.loc[oc.ts_ind.isin(np.concatenate((train_ind, valid_ind, test_ind), axis=-1))]

In [116]:
# Fix age.
data.loc[(data.variable=='Age')&(data.value>200), 'value'] = 91.4

In [117]:
# Get y and N.
y = np.array(oc.sort_values(by='ts_ind')['in_hospital_mortality']).astype('float32')
N = data.ts_ind.max() + 1

In [118]:
# Get static data with mean fill and missingness indicator.
static_varis = ['Age', 'Gender']
ii = data.variable.isin(static_varis)
static_data = data.loc[ii]
# data without static data
data = data.loc[~ii]
def inv_list(l, start=0):
    d = {}
    for i in range(len(l)):
        d[l[i]] = i+start
    return d
# Encoding of static variable `static_var_to_ind = {'Age': 0, 'Gender': 1}`
static_var_to_ind = inv_list(static_varis)
D = len(static_varis)

In [119]:
# 2d array of shape (# of samples , # of static variables)
demo = np.zeros((N, D))
for row in tqdm(static_data.itertuples()):
    # Each cell will have the value of the static variable. Eg. demo[1,1] = Gender of patient ts_ind=1
    demo[row.ts_ind, static_var_to_ind[row.variable]] = row.value
# Patients with no static variable available will default to 0

89624it [00:00, 739330.36it/s]


In [120]:
# Normalize static data.
means = demo.mean(axis=0, keepdims=True)
stds = demo.std(axis=0, keepdims=True)
stds = (stds==0)*1 + (stds!=0)*stds
demo = (demo-means)/stds

In [121]:
demo.shape

(44812, 2)

In [122]:
# Trim to max len.
# Not sure why they included this line? For testing?
data = data.sample(frac=1)
# Trim to 880 variables for every patient. Such that it is same length for all patient to fit into transformer
data = data.groupby('ts_ind').head(880)

In [123]:
# Get N, V, var_to_ind.
N = data.ts_ind.max() + 1
varis = sorted(list(set(data.variable)))
V = len(varis)
def inv_list(l, start=0):
    d = {}
    for i in range(len(l)):
        d[l[i]] = i+start
    return d
# Encoding of variables 
var_to_ind = inv_list(varis, start=1)

In [124]:
data['vind'] = data.variable.map(var_to_ind)
data = data[['ts_ind', 'vind', 'hour', 'value']].sort_values(by=['ts_ind', 'vind', 'hour'])

In [125]:
# Add obs index.
data = data.sort_values(by=['ts_ind']).reset_index(drop=True)
data = data.reset_index().rename(columns={'index':'obs_ind'})
data = data.merge(data.groupby('ts_ind').agg({'obs_ind':'min'}).reset_index().rename(columns={ \
                                                            'obs_ind':'first_obs_ind'}), on='ts_ind')
data['obs_ind'] = data['obs_ind'] - data['first_obs_ind']

In [126]:
data.tail()

Unnamed: 0,obs_ind,ts_ind,vind,hour,value,first_obs_ind
17468774,627,44811,69,4.383333,-0.063892,17468147
17468775,628,44811,69,4.533333,1.587654,17468147
17468776,629,44811,69,5.133333,-0.063892,17468147
17468777,630,44811,69,2.383333,-0.063892,17468147
17468778,631,44811,128,21.333333,0.433458,17468147


In [127]:
# Find max_len.
max_len = data.obs_ind.max()+1
print ('max_len', max_len)

max_len 880


In [128]:
# Generate times_ip and values_ip matrices.
times_inp = np.zeros((N, max_len), dtype='float32')
values_inp = np.zeros((N, max_len), dtype='float32')
varis_inp = np.zeros((N, max_len), dtype='int32')

In [129]:
for row in tqdm(data.itertuples()):
    ts_ind = row.ts_ind
    l = row.obs_ind
    # ts_ind represent the patietn
    # l represent the observation index 
    times_inp[ts_ind, l] = row.hour
    values_inp[ts_ind, l] = row.value
    varis_inp[ts_ind, l] = row.vind
data.drop(columns=['obs_ind', 'first_obs_ind'], inplace=True)

17468779it [00:27, 636993.92it/s]


In [130]:
# Generate 3 sets of inputs and outputs.
train_ip = [ip[train_ind] for ip in [demo, times_inp, values_inp, varis_inp]]
valid_ip = [ip[valid_ind] for ip in [demo, times_inp, values_inp, varis_inp]]
test_ip = [ip[test_ind] for ip in [demo, times_inp, values_inp, varis_inp]]
del times_inp, values_inp, varis_inp
train_op = y[train_ind]
valid_op = y[valid_ind]
test_op = y[test_ind]
del y

In [145]:
train_ip[2].shape

(28790, 880)

In [148]:
train_ip[2].mean(axis=0)

array([ 7.18153222e-03,  7.60831824e-03,  8.39058310e-03, -1.39544148e-03,
        4.65909997e-03, -3.63272266e-04, -5.81754325e-03,  2.49860575e-03,
       -1.88899552e-03,  3.76272760e-03, -3.72506841e-03,  2.35903263e-03,
        5.49702626e-03,  8.17467365e-03,  2.41228584e-02,  1.70477536e-02,
        1.11522805e-02,  1.76167656e-02,  7.10581662e-03,  3.93562578e-03,
       -2.08682800e-03,  1.35587249e-03, -1.18615001e-03,  8.35474289e-04,
       -7.22189620e-03, -9.45465919e-03, -1.41321560e-02, -1.75802372e-02,
       -1.56119643e-02, -1.50559032e-02, -1.97005942e-02, -1.44071495e-02,
       -1.50286769e-02, -3.63209881e-02, -3.30420136e-02, -2.80426294e-02,
       -2.19849981e-02, -1.61494985e-02, -2.32368764e-02, -2.06331369e-02,
       -1.31021030e-02, -1.62418503e-02, -1.86051298e-02, -5.71578089e-03,
       -1.36550888e-02, -4.31562541e-03,  2.31708260e-03, -6.72449591e-03,
       -6.68677269e-03, -4.27733362e-03, -1.09116714e-02, -1.13089355e-02,
       -7.84109754e-04,  

# New Mimic 3

In [195]:
data_dir = './STraTS_torch/mortality_mimic_3_benchmark'
with gzip.GzipFile(f'{data_dir}/val_times.npy.gz', 'r') as f:
    s_val_times = np.load(f)
with gzip.GzipFile(f'{data_dir}/val_values.npy.gz', 'r') as f:
    s_val_values = np.load(f)
with gzip.GzipFile(f'{data_dir}/val_varis.npy.gz', 'r') as f:
    s_val_varis = np.load(f)
with gzip.GzipFile(f'{data_dir}/val_y.npy.gz', 'r') as f:
    s_val_y = np.load(f)      

In [193]:
s_val_times.shape

(11689, 500)

In [188]:
s_val_varis.shape

(2553, 500)

In [189]:
s_val_varis

array([[17, 17, 14, ...,  0,  0,  0],
       [ 2,  6,  7, ...,  0,  0,  0],
       [ 2, 14, 13, ...,  5, 11,  2],
       ...,
       [ 2, 12,  9, ...,  0,  0,  0],
       [13,  9, 11, ...,  0,  0,  0],
       [ 8, 17,  8, ...,  2,  9, 15]], dtype=int32)

In [190]:
np.unique(s_val_varis)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17], dtype=int32)

In [74]:
s_val_values[0]

array([  7.37    ,   7.37    , 154.      , 107.      ,  91.      ,
       107.      ,  32.      ,  99.      ,  29.      ,  36.055557,
        99.      ,   4.      ,   6.      , 105.      ,   5.      ,
        66.3     , 102.      ,  26.      ,  98.      , 103.      ,
        31.      ,  99.      , 109.      , 154.      ,  92.      ,
        97.      , 138.      ,   0.7     ,  32.      ,  99.      ,
        97.      ,  87.      ,  99.      , 103.      ,  24.      ,
       167.      ,  89.      , 104.      ,  37.11111 ,  31.      ,
        75.      ,  88.      ,   5.      ,   4.      ,   6.      ,
        99.      , 127.      , 101.      ,  91.      ,  32.      ,
        72.      ,  81.      ,  99.      , 107.      ,  99.      ,
        96.      , 117.      ,  72.      ,  35.      ,  82.      ,
       133.      ,  92.      , 107.      , 107.      , 151.      ,
        99.      ,  34.      ,   4.      ,   5.      ,   6.      ,
        65.      ,  51.      , 104.      ,  98.      ,  15.   

In [196]:
for i in range(1,18):
    print(f'Mean: {round(s_val_values[s_val_varis==i].mean(),2)}, std: {round(s_val_values[s_val_varis==i].std(),2)}')

Mean: 0.009999999776482582, std: 1.0099999904632568
Mean: -0.0, std: 0.8700000047683716
Mean: 0.029999999329447746, std: 1.0299999713897705
Mean: -0.019999999552965164, std: 1.0099999904632568
Mean: -0.009999999776482582, std: 1.0
Mean: -0.03999999910593033, std: 1.0099999904632568
Mean: -0.0, std: 1.0
Mean: 0.019999999552965164, std: 0.949999988079071
Mean: 0.009999999776482582, std: 0.8999999761581421
Mean: 0.019999999552965164, std: 1.1699999570846558
Mean: 0.0, std: 2.7899999618530273
Mean: -0.0, std: 0.0
Mean: -0.0, std: 0.0
Mean: 0.0, std: 0.7200000286102295
Mean: 0.019999999552965164, std: 1.649999976158142
Mean: 69.58000183105469, std: 4626.60009765625
Mean: 0.019999999552965164, std: 1.850000023841858


In [154]:
s_val_values.shape

(2553, 500)

In [158]:
np.unique(s_val_varis)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
      dtype=int32)

In [163]:
s_val_varis

array([[16, 16, 13, ...,  0,  0,  0],
       [ 1,  5,  6, ...,  0,  0,  0],
       [ 1, 13, 12, ...,  4, 10,  1],
       ...,
       [ 1, 11,  8, ...,  0,  0,  0],
       [12,  8, 10, ...,  0,  0,  0],
       [ 7, 16,  7, ...,  1,  8, 14]], dtype=int32)

In [169]:
s_val_values

array([[  7.37,   7.37, 154.  , ...,   0.  ,   0.  ,   0.  ],
       [ 49.  ,  15.  ,   5.  , ...,   0.  ,   0.  ,   0.  ],
       [ 71.  , 194.  ,  15.  , ...,   4.  ,  75.  ,  55.  ],
       ...,
       [ 78.  ,  96.  , 134.  , ...,   0.  ,   0.  ,   0.  ],
       [ 18.  ,  85.  ,  68.  , ...,   0.  ,   0.  ,   0.  ],
       [ 52.  ,   7.42, 108.  , ...,  54.  , 106.  ,  38.3 ]],
      dtype=float32)

In [173]:
(s_val_varis==0).sum()

225386

In [174]:
s_val_values[s_val_varis==0].shape

(225386,)

In [184]:
a = np.array([[1,2,3],[3,2,1], [2,1,3]])
b = np.array([[1,2,3], [1,2,3], [1,2,3]])

mean_1 = b[a==1].mean()
std_1 = b[a==1].std()

b[a==1] = b[a==1]-mean_1

b

array([[-1,  2,  3],
       [ 1,  2,  1],
       [ 1,  0,  3]])

In [175]:
s_val_values[s_val_varis==0].std()

0.0139707755

In [171]:
s_val_values.mean()

61.58082

In [167]:
np.where(s_val_varis == 0)

(array([   0,    0,    0, ..., 2551, 2551, 2551]),
 array([376, 377, 378, ..., 497, 498, 499]))

In [None]:
mean = np.zeros((17))

In [210]:
import torch

In [213]:
torch.linspace(0, 1., 48)

tensor([0.0000, 0.0213, 0.0426, 0.0638, 0.0851, 0.1064, 0.1277, 0.1489, 0.1702,
        0.1915, 0.2128, 0.2340, 0.2553, 0.2766, 0.2979, 0.3191, 0.3404, 0.3617,
        0.3830, 0.4043, 0.4255, 0.4468, 0.4681, 0.4894, 0.5106, 0.5319, 0.5532,
        0.5745, 0.5957, 0.6170, 0.6383, 0.6596, 0.6809, 0.7021, 0.7234, 0.7447,
        0.7660, 0.7872, 0.8085, 0.8298, 0.8511, 0.8723, 0.8936, 0.9149, 0.9362,
        0.9574, 0.9787, 1.0000])

In [218]:
torch.cat((torch.tensor([[[1,2,3],[3,2,1]]]),torch.tensor([[[4,5,6],[4,5,6]]])),2)

tensor([[[1, 2, 3, 4, 5, 6],
         [3, 2, 1, 4, 5, 6]]])

# Results

In [1]:
import pickle

In [8]:
with open('./MultimodalMIMIC/run/TS/ihm/TS/TS_48/Atten/layer3/batch_seq_feature/irregular_TS_64/irregular_Text_64/0.0004_20_8_128_1_4/result.pkl', "rb") as f:
    results = pickle.load(f)

In [5]:
results

{42: {'auc': {'val': 0.8278205621871391, 'test': 0.8432079231129863},
  'auprc': {'val': 0.4826476642756666, 'test': 0.48514806595578813},
  'f1': {'val': 0.46919431279620855, 'test': 0.4576271186440678}}}

In [9]:
results

{42: {'auc': {'val': 0.826881979206777, 'test': 0.8420710267229254},
  'auprc': {'val': 0.4822337256119204, 'test': 0.4851173219491775},
  'f1': {'val': 0.4755244755244755, 'test': 0.457983193277311}}}