In [None]:
from collections import Counter
import pandas as pd
import numpy as np
import json

from tqdm import tqdm
import torch

In [None]:
#data.py

class mimicforsetlstm():
    """Dataset that flattens each patient time series into a list of observation events.

    Every non-missing cell of a patient CSV becomes one event described by four
    parallel values: the row timestamp, the feature index, the value (categorical
    values are mapped through ``channel_info.json``) and the time elapsed since
    that feature was first observed for the patient.

    Parameters
    ----------
    datadir : str
        Directory prefix (string-concatenated, so it needs a trailing separator)
        containing ``{name}_listfile.csv`` and the ``train/`` / ``test/`` folders.
    configdir : str
        Directory prefix containing ``discretizer_config.json`` and
        ``channel_info.json``.
    name : str
        Split name. Patient files are read from ``test/`` when it is ``'test'``
        and from ``train/`` otherwise.
    device : torch.device
        Device the returned tensors are moved to.
    normalize : bool
        Stored on the instance; not used by this class.
    padding : bool
        When true, every sequence is padded with ``-1`` or truncated to ``MAX_LEN``.
    load_all : bool
        When false only the first ``num_of_instances`` stays are kept.
    num_of_instances : int
        Number of stays kept when ``load_all`` is false.
    subsample, top : bool
        When ``subsample`` is true only a fixed subset of feature columns is used:
        ``topfive_idx`` if ``top`` else ``bottom_idx``.
    ignore_file : str
        Text file listing one stay filename per line to exclude from the listfile.
    """

    def __init__(self, datadir, configdir, name, device, normalize=False, padding=False, load_all=True,num_of_instances=20, subsample=False, top=False, ignore_file='../ignore_file.txt'):

        self.datadir = datadir
        self.configdir = configdir
        self.name = name
        self.device  = device
        self.do_padding = padding
        self.load_all = load_all
        self.normalize = normalize
        # The max length for IHM is 1569 after removing the 32 patients as done in SeFT
        self.MAX_LEN = 1569
        with open(self.configdir + 'discretizer_config.json', 'r') as json_file:
            self.discretizer_config = json.load(json_file)

        with open(self.configdir + 'channel_info.json', 'r') as json_file:
            self.channelinfo = json.load(json_file)

        # Stay filenames that must be dropped from the listfile.
        with open(ignore_file, 'r') as fp:
            ignore_files = [line.strip() for line in fp]

        self.subsample = subsample
        self.top = top
        # Column positions used when subsampling features.
        # Original notes: "shifted one index", "top_five from desc".
        self.topfive_idx = [12,13,9,14,2,11]
        # Original note: "bottom_five from aesc".
        self.bottom_idx = [1, 10, 16, 3, 17, 6]

        self.df = pd.read_csv(self.datadir + f'{name}_listfile.csv')
        print("originaldflen:", len(self.df))
        self.df = self.df[~self.df.stay.isin(ignore_files)]
        print("afterdeldflen:", len(self.df))

        if not self.load_all:
            self.df = self.df[:num_of_instances]

        self.id_to_channel = self.discretizer_config['id_to_channel']
        self.is_categorical_channel = self.discretizer_config['is_categorical_channel']

        self.possible_values = self.channelinfo

        self.normal_values = self.discretizer_config['normal_values']
        print(self.normal_values.keys())

    def __len__(self):
        """Number of stays kept after filtering."""
        return len(self.df)

    def __iter__(self):
        """Yield every item in listfile order.

        Defined explicitly so that an ``IndexError`` raised while building an
        item surfaces as an error instead of silently ending the iteration, as
        it would with the implicit ``__getitem__``-based iteration protocol.
        """
        for idx in range(len(self)):
            yield self[idx]

    def __labels__(self):
        """Return the ``y_true`` column of the listfile as a list."""
        return list(self.df.y_true)

    def __getitem__(self, idx):
        """Build the event representation of the ``idx``-th stay.

        Returns
        -------
        tuple
            ``(sample, feature_counts, duration, patient_df)`` where ``sample`` is a
            dict with ``'x'`` (4 stacked rows: time, feature index, value, time
            since first observation of the feature), ``'y'`` (label), ``'lx'``
            (number of valid events in ``x``) and ``'pid'`` (patient id encoded as
            ``<subject>.<episode>``); ``feature_counts`` lists the number of
            observations per feature; ``duration`` is last minus first timestamp
            (or the first timestamp when both are equal); ``patient_df`` is the
            (possibly column-subsampled) patient frame.

        Raises
        ------
        IndexError
            If ``idx`` is out of range.
        ValueError
            If the patient file has no rows.
        """
        info = self.df.iloc[idx,:].values
        filename = info[0]
        y = info[1]
        split_dir = 'test' if self.name == 'test' else 'train'
        patient_df = pd.read_csv(self.datadir + f'{split_dir}/{filename}')

        if self.subsample:
            kept_columns = self.topfive_idx if self.top else self.bottom_idx
            patient_df = patient_df.iloc[:, [0] + kept_columns]

        if len(patient_df) == 0:
            raise ValueError(f'patient file {filename!r} contains no rows')

        name_parts = filename.split('_')
        patient_id = float(name_parts[0] + '.' + name_parts[1].split('episode')[-1])

        # Feature index == position among the non-time columns.
        feature_names = list(patient_df.columns[1:])
        feature_count = {pos: 0 for pos in range(len(feature_names))}
        first_seen = {}  # feature index -> timestamp of its first observation
        time_global = []
        type_global = []
        z_global = []
        delt_global = []

        all_timestamps = patient_df.Hours.values
        if all_timestamps[-1] != all_timestamps[0]:
            duration = all_timestamps[-1] - all_timestamps[0]
        else:
            duration = all_timestamps[0]

        # Same per-row values that DataFrame.iterrows would produce, without
        # constructing a Series per row.
        values = patient_df.to_numpy()
        observed = patient_df.notna().to_numpy()[:, 1:]

        for row, row_mask in zip(values, observed):
            timestamp = row[0]
            for feat_key, is_observed in enumerate(row_mask):
                if not is_observed:
                    continue
                af = feature_names[feat_key]
                av = row[feat_key + 1]

                time_global.append(timestamp)
                type_global.append(feat_key)
                feature_count[feat_key] += 1

                if self.is_categorical_channel[af]:
                    # Map the raw categorical value to its code; values that are
                    # not listed verbatim (e.g. "4.0") are retried without the
                    # part after the first dot.
                    vdict = self.possible_values[af]['values']
                    key = str(av)
                    if key not in vdict:
                        key = key.split('.')[0]
                    z_global.append(vdict[key])
                else:
                    z_global.append(av)

                if feat_key not in first_seen:
                    # feature appears for the first time
                    first_seen[feat_key] = timestamp
                    delt_global.append(0)
                else:
                    delt_global.append(timestamp - first_seen[feat_key])

        curr_len = len(z_global)
        if self.do_padding:
            diff = self.MAX_LEN - curr_len
            if diff > 0:
                time_global += [-1]*diff
                type_global += [-1]*diff
                z_global += [-1]*diff
                delt_global += [-1]*diff
            else:
                time_global = time_global[:self.MAX_LEN]
                type_global = type_global[:self.MAX_LEN]
                z_global = z_global[:self.MAX_LEN]
                delt_global = delt_global[:self.MAX_LEN]
                # Keep the reported length consistent with the truncated tensor.
                curr_len = self.MAX_LEN

        stacked_x = torch.vstack((torch.tensor(time_global),
                                  torch.tensor(type_global),
                                  torch.tensor(z_global),
                                  torch.tensor(delt_global)))

        return {'x': stacked_x.to(self.device),
                'y':torch.tensor(y).to(self.device),
                 'lx':torch.tensor(curr_len).to(self.device),
                'pid':torch.tensor(patient_id).to(self.device)}, list(feature_count.values()), duration, patient_df



In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# load_all=True means num_of_instances is ignored by the dataset.
trainset = mimicforsetlstm(
    datadir='./data/in-hospital-mortality/',
    configdir='./mimic3models/resources/',
    name='train',
    device=device,
    load_all=True,
    num_of_instances=10996,
    subsample=False,
    top=False,
    padding=True,
)

all_patient_sampling = []  # per patient: number of observations of each feature
ds = []                    # per patient: duration returned by the dataset
ys = []                    # per patient: label
diag_times = []            # per patient: number of rows in the patient file
trainmiss, trainall = 0, 0 # running totals: missing cells / all cells
save_for_slan = []         # per patient: [missing count per feature..., n_rows, label]

# Index explicitly instead of iterating the dataset object: the progress bar
# gets a total, and an IndexError raised while building one item cannot be
# mistaken for the end of the dataset.
for idx in tqdm(range(len(trainset))):
    sample, feature_counts, d, pdf = trainset[idx]
    y = sample['y']
    label = y.detach().cpu().tolist()

    feature_frame = pdf.iloc[:, 1:]
    r, c = feature_frame.shape
    # Missing cells per feature column, computed once per patient.
    missing_per_feature = feature_frame.isna().to_numpy().sum(axis=0)

    all_patient_sampling.append(feature_counts)
    ds.append(d)
    ys.append(label)
    trainall += r * c
    trainmiss += missing_per_feature.sum()
    diag_times.append(len(pdf.Hours.values))
    save_for_slan.append(list(missing_per_feature) + [r] + [label])
    

In [None]:
# Observations per unit of duration for every patient and feature:
# rows are patients, columns are features.
inverse_duration = 1 / np.array(ds)[:, None]
diagn_per_hour = inverse_duration * np.array(all_patient_sampling)

print(diagn_per_hour.shape)
# Average over patients, one value per feature.
np.mean(diagn_per_hour, axis=0)

In [None]:
#check missingness

#(14604887, 9865879)(4246396, 2857945)(4263175, 2877824)
#a,y = (14604887+4246396+4263175, 9865879+2857945+2877824)

#y,a,a-y

# Statistics file generation

In [None]:
class normalizer():
    """Collect per-feature statistics over every patient file of one split.

    On construction the listfile is loaded and filtered, then every patient CSV is
    read once and the result is stored in ``self.var_stat``: a dict mapping each
    feature name (the keys of ``normal_values`` in ``discretizer_config.json``) to
    ``{'min', 'max', 'sum', 'count', 'is_cat', 'value'}``.

    Parameters
    ----------
    datadir : str
        Directory prefix (string-concatenated, so it needs a trailing separator)
        containing ``{name}_listfile.csv`` and the ``train/`` / ``test/`` folders.
    configdir : str
        Directory prefix containing ``discretizer_config.json`` and
        ``channel_info.json``.
    name : str
        Split name. Patient files are read from ``test/`` when it is ``'test'``
        and from ``train/`` otherwise.
    device, normalize, padding : any
        Stored on the instance; not used by this class.
    num_of_instances : int
        Number of stays kept when ``load_all`` is false.
    load_all : bool
        When false only the first ``num_of_instances`` stays are processed.
    ignore_file : str
        Text file listing one stay filename per line to exclude from the listfile.
    """

    def __init__(self, datadir, configdir, name, device,num_of_instances=20, normalize=False, padding=False, load_all=True, ignore_file='../ignore_file.txt'):

        self.datadir = datadir
        self.configdir = configdir
        self.name = name
        self.device  = device
        self.do_padding = padding
        self.load_all = load_all
        self.num_of_instances = num_of_instances
        self.normalize = normalize
        # The max length for IHM is 1569 after removing the 32 patients as done in SeFT
        self.MAX_LEN = 1569
        with open(self.configdir + 'discretizer_config.json', 'r') as json_file:
            self.discretizer_config = json.load(json_file)

        with open(self.configdir + 'channel_info.json', 'r') as json_file:
            self.channelinfo = json.load(json_file)

        # Stay filenames that must be dropped from the listfile.
        with open(ignore_file, 'r') as fp:
            ignore_files = [line.strip() for line in fp]

        self.df = pd.read_csv(self.datadir + f'{name}_listfile.csv')
        self.df = self.df[~self.df.stay.isin(ignore_files)]

        if not self.load_all:
            self.df = self.df[:num_of_instances]

        self.id_to_channel = self.discretizer_config['id_to_channel']
        self.is_categorical_channel = self.discretizer_config['is_categorical_channel']

        self.possible_values = self.channelinfo

        self.normal_values = self.discretizer_config['normal_values']
        print(self.normal_values)

        self.var_stat = self._normalize_data(self.df, debug=False)

    def sum_max_min(self, vec):
        """Return ``(nan-sum, max, min, number of non-NaN entries)`` of ``vec``.

        NaN entries never replace the running min/max, so for an empty or
        all-NaN input the max is ``-inf`` and the min is ``inf``.
        """
        n_valid = sum([~np.isnan(v) for v in vec])
        total = np.nansum(vec)
        smallest = np.inf
        largest = -np.inf
        for v in vec:
            smallest = min(smallest, v)
            largest = max(largest, v)

        return total, largest, smallest, n_valid

    def _normalize_data(self, patient_df, debug=False):
        """Accumulate per-feature statistics over all patient files.

        Parameters
        ----------
        patient_df : any
            Unused; kept so the signature stays compatible with existing calls.
        debug : bool
            When true, print the raw column and its non-missing count for every
            categorical feature.

        Returns
        -------
        dict
            ``feature name -> {'min', 'max', 'sum', 'count', 'is_cat', 'value'}``.
            ``min``/``max``/``sum`` are only updated for non-categorical features;
            ``value`` holds every non-missing raw value.
        """
        features_names = list(self.normal_values.keys())
        print(features_names)
        # 'is_cat' is known from the configuration, so set it up front: a
        # categorical feature that is never observed must not look numeric.
        feat_dict = {f: {'min':np.inf, 'max':-np.inf, 'sum':0, 'count':0,
                         'is_cat':self.is_categorical_channel.get(f, False), 'value':[]}
                     for f in features_names}

        split_dir = 'test' if self.name == 'test' else 'train'

        for idx in tqdm(range(len(self.df))):
            filename = self.df.iloc[idx,:].values[0]
            frame = pd.read_csv(self.datadir + f'{split_dir}/{filename}')

            # Pair each column with its own header name so the statistics do not
            # depend on the CSV column order matching the config key order. Fall
            # back to positional pairing when a header is not a known feature.
            header_names = list(frame.columns[1:])
            if all(col in feat_dict for col in header_names):
                column_names = header_names
            else:
                column_names = features_names

            columns = frame.iloc[:,1:].to_numpy().T

            for feat, featn in zip(columns, column_names):
                is_cat = self.is_categorical_channel[featn]
                present = feat[pd.notna(feat)]

                if debug and is_cat:
                    print("categorical pass done")
                    print(feat)
                    print(len(present))

                if len(present) == 0:
                    continue

                stats = feat_dict[featn]
                stats['count'] += len(present)
                stats['is_cat'] = is_cat
                stats['value'].extend(present)
                if not is_cat:
                    stats['sum'] += sum(present)
                    stats['min'] = min(stats['min'], min(present))
                    stats['max'] = max(stats['max'], max(present))

        return feat_dict

In [None]:
# Statistics are gathered on the CPU; this rebinds `device` for the cells below.
device = torch.device('cpu')

# Building the object scans every patient file of the split and fills
# `train.var_stat`. With load_all=True the num_of_instances value is ignored.
train = normalizer(
    datadir='./data/in-hospital-mortality/',
    configdir='./mimic3models/resources/',
    name='train',
    device=device,
    num_of_instances=10996,
    padding=True,
    load_all=True,
)

In [None]:
# Condense the accumulated statistics into one JSON-serialisable record per feature.
final_stat_dict = {}

for k, info in train.var_stat.items():
    record = {'is_cat': info['is_cat'], 'mean': None, 'std': None, 'min': None, 'max': None}

    # Numeric statistics only exist for non-categorical features that were
    # observed at least once. Otherwise min/max would still be +/-inf, which
    # json.dump writes as the non-standard tokens Infinity / -Infinity.
    if not info['is_cat'] and info['count'] > 0:
        mean = info['sum'] / info['count']
        std = np.std(info['value'])  # computed once; the value list can be large
        print('mean:', mean)
        print('std:', std)
        print("*"*20)
        record['mean'] = mean
        record['std'] = std
        record['min'] = info['min']
        record['max'] = info['max']

    final_stat_dict[k] = record

# Writing the MIMIC statistics and sampling rates to disk


In [None]:
# Persist the per-feature statistics as JSON.
with open('mimic_stat.json', 'w') as stat_file:
    json.dump(final_stat_dict, stat_file)

# Persist the per-feature mean sampling rate (averaged over patients).
mean_sampling_rate = np.mean(diagn_per_hour, axis=0)
with open('sampling_rate_mimic.npy', 'wb') as rate_file:
    np.save(rate_file, mean_sampling_rate)