In [None]:
from tqdm import tqdm
import numpy as np
import pickle

In [None]:
# Follow the instructions in the SeFT repository to get access to the P19 data:
# https://github.com/BorgwardtLab/Set_Functions_for_Time_Series/tree/master#available-datasets

In [None]:
import tensorflow_datasets as tfds
import medical_ts_datasets

# Split to load; one of 'train', 'dev', 'test'.
splitname = 'train'
p19, p19_info = tfds.load(
    name='physionet2019',
    split=splitname,
    as_supervised=True,
    with_info=True,
)

In [None]:
# Run this cell separately for each split (after setting `splitname` and
# reloading `p19`).
# Converts every sample of the loaded split into a plain dict of NumPy arrays.

data = []
diag_times = []  # not used in this cell; kept in case other cells refer to it
# Iterate the dataset directly: the enumerate() index was never used.
for x, y in tqdm(p19):
    static, timestamps, diagnosis, mask, l = x
    data.append({
        'static': static.numpy(),
        'timestamps': timestamps.numpy(),
        'diagnosis': diagnosis.numpy(),
        'masks': mask.numpy(),
        'length': l.numpy(),
        # Labels are flattened to 1-D.
        'target': y.numpy().reshape(-1),
    })

In [None]:
# Write the extracted records of the current split to disk;
# the split name is embedded in the file name.
filename = f'p19_{splitname}.pickle'

with open(filename, mode='wb') as out_file:
    pickle.dump(data, out_file)

In [None]:
# Read the pickle named by `filename` back in and show how many records it holds.
with open(filename, mode='rb') as in_file:
    data_p19 = pickle.load(in_file)

len(data_p19)

In [None]:
# Total number of samples across the train, dev and test splits.
sum((25813, 6454, 8066))

In [None]:
# P19: 3 instances were removed by SeFT.
# Time-series variables = 34, static variables = 4.
#          train  - dev    - test
# nsample  25813  - 6454   - 8066
# avgobs   38.458 - 38.314 - 38.690
#
# Collect len() of the second element of every sample in the loaded split and
# print the mean (presumably the `avgobs` figure above for that split).
# The per-sample debug print was removed: it dumped every tensor and flooded
# the cell output.
nobs = []
for p in tqdm(p19):
    # p[1] is the second element of each sample (the `y` of the (x, y) pair).
    nobs.append(len(p[1]))
print(np.mean(nobs))

# Preparing stat files

In [None]:
# Accumulate per-variable statistics over every record in `data_p19`:
#   diag_total          - sum of the 'diagnosis' values (NaNs ignored)
#   count_total         - column sums of the 'masks' array
#   patient_avg_        - sum over records of (mask column sums / time span)
#   min_total/max_total - running extrema of the mask-selected values
#   val_dict            - every mask-selected value, keyed 'idx<i>'
nvar = 34
diag_total = np.zeros(nvar)
count_total = np.zeros(nvar)
count_total_p = np.zeros(nvar)  # never updated in this cell
patient_avg_ = np.zeros(nvar)
min_total = np.full(nvar, np.inf)
max_total = np.full(nvar, -np.inf)
val_dict = {f'idx{i}': [] for i in range(nvar)}

for d in tqdm(data_p19):
    diag = d['diagnosis']
    mask = d['masks']
    timestamp = d['timestamps']
    # Span between the first and last timestamp of this record
    # (presumably hours, going by the variable name - TODO confirm).
    total_hr = timestamp[-1] - timestamp[0]

    obs_per_var = np.sum(mask, axis=0)
    diag_total += np.nansum(diag, axis=0)
    count_total += obs_per_var

    for i, (v, m) in enumerate(zip(diag.T, mask.T)):
        # Select this variable's values once instead of re-indexing repeatedly.
        observed = v[m]
        if observed.size == 0:
            continue
        val_dict[f'idx{i}'] += list(observed)
        max_total[i] = max(max_total[i], observed.max())
        min_total[i] = min(min_total[i], observed.min())

    # A record whose first and last timestamps coincide has a zero span; the
    # division would yield inf/NaN and poison the running sum, so such a
    # record contributes nothing to the rate.
    if total_hr > 0:
        patient_avg_ += obs_per_var / total_hr

In [None]:
# Average over patients of the per-patient, per-variable rate accumulated in
# `patient_avg_`. One term was added per record of `data_p19`, so divide by its
# length rather than a hard-coded train-split size.
patient_avg_ / len(data_p19)

In [None]:
# Save the mean per-patient rate. The divisor is the number of records that
# were accumulated into `patient_avg_`, not a hard-coded train-split size.
with open('sampling_rate_p19.npy', 'wb') as fp:
    np.save(fp, patient_avg_ / len(data_p19))

# Load the array back from the file just written.
with open('sampling_rate_p19.npy', 'rb') as fp:
    sr_p19 = np.load(fp)
# https://numpy.org/doc/stable/reference/generated/numpy.save.html

In [None]:
# Summarise the collected values of each variable as min / max / mean / std.
# Each list is converted to an array once and every statistic is computed a
# single time, then reused for both the printout and the stored dict.
# An empty list still raises ValueError on the min reduction.
final_stat = {}
for k, values in val_dict.items():
    arr = np.asarray(values)
    stats = {"min": arr.min(),
             "max": arr.max(),
             "mean": arr.mean(),
             "std": arr.std()}
    print(k, stats["min"], stats["max"], stats["mean"], stats["std"])
    final_stat[k] = stats

In [None]:
# Persist the per-variable summary statistics.
# Note: this rebinds `filename`, which earlier pointed at the split pickle.
filename = 'p19_stat.pickle'
with open(filename, mode='wb') as out_file:
    pickle.dump(final_stat, out_file)

In [None]:
# Read the statistics pickle back in and show how many entries it contains.
with open(filename, mode='rb') as in_file:
    stat_p19 = pickle.load(in_file)

len(stat_p19)