In [None]:
import os
from datetime import timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
def dpath(fname):
    """Return the path of ``fname`` under the ``/mnt/data1/mimic/iii`` data directory."""
    base_dir = '/mnt/data1/mimic/iii'
    full_path = os.path.join(base_dir, fname)
    return full_path

def cleaned(fname):
    """Return the path of ``fname`` under the ``cleaned`` sub-directory of the data root."""
    cleaned_dir = '/mnt/data1/mimic/iii/cleaned'
    full_path = os.path.join(cleaned_dir, fname)
    return full_path

def aligned(fname):
    """Return the path of ``fname`` under the ``aligned`` sub-directory of the data root."""
    aligned_dir = '/mnt/data1/mimic/iii/aligned'
    full_path = os.path.join(aligned_dir, fname)
    return full_path

def filter_encs(df, encs):
    """Keep only the rows of ``df`` whose ``HADM_ID`` is in ``encs``.

    Args:
        df: DataFrame with a ``HADM_ID`` column.
        encs: Iterable of encounter ids to keep.

    Returns:
        A new DataFrame with the matching rows and a fresh 0..n-1 index.
    """
    wanted_encs = set(encs)
    keep_row = df['HADM_ID'].isin(wanted_encs)
    kept = df[keep_row]
    return kept.reset_index(drop=True)

def filter_pts(df, pts):
    """Keep only the rows of ``df`` whose ``SUBJECT_ID`` is in ``pts``.

    Args:
        df: DataFrame with a ``SUBJECT_ID`` column.
        pts: Iterable of patient ids to keep.

    Returns:
        A new DataFrame with the matching rows and a fresh 0..n-1 index.
    """
    wanted_pts = set(pts)
    keep_row = df['SUBJECT_ID'].isin(wanted_pts)
    kept = df[keep_row]
    return kept.reset_index(drop=True)

In [None]:
# Progress notes define the cohort: every other table is restricted to the
# encounters (HADM_ID) and patients (SUBJECT_ID) that appear in them.
notes = pd.read_csv(cleaned('Progress_Notes.csv'), parse_dates=['CHARTTIME'])
note_encs = set(notes['HADM_ID'])
note_pts = set(notes['SUBJECT_ID'])

# (file stem, value columns read from that file)
feat_names  = [
    ('HR', ['HR']),
    ('BP', ['SBP', 'DBP']),
    ('RR', ['RR']),
    ('SpO2', ['SpO2']),
    ('GCS', ['GCS']),
]

# One table per feature file, limited to encounters that have notes.
feats = {}
for feat_key, feat_cols in feat_names:
    feats[feat_key] = filter_encs(
        pd.read_csv(cleaned(f'{feat_key}.csv'), parse_dates=['CHARTTIME']),
        note_encs,
    )

adt = filter_encs(
    pd.read_csv(dpath('ADMISSIONS.csv.gz'), parse_dates=['ADMITTIME', 'DISCHTIME', 'DEATHTIME']),
    note_encs,
)
patients = filter_pts(
    pd.read_csv(dpath('PATIENTS.csv.gz'), parse_dates=['DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN']),
    note_pts,
)

In [None]:
# Row count of every feature table after restricting to encounters with notes.
{feat_key: len(feat_df) for feat_key, feat_df in feats.items()}

In [None]:
# Every encounter / patient seen in the notes must also exist in the
# admissions / patients tables (both were filtered to the note ids on load).
n_note_encs = len(note_encs)
n_note_pts = len(note_pts)
assert len(set(adt['HADM_ID'])) == n_note_encs
assert len(set(patients['SUBJECT_ID'])) == n_note_pts
print(f"#Pts: {n_note_pts} #Encs: {n_note_encs}")

In [None]:
# Notes sharing the same encounter and chart time are counted once as "unique".
unique_notes = notes.drop_duplicates(subset=['HADM_ID', 'CHARTTIME'])
n_total_notes = len(notes)
n_unique_notes = len(unique_notes)
print(f"#Total_Notes: {n_total_notes}")
print(f"#Unique_Notes: {n_unique_notes}")

In [None]:
# Hour of day (rounded to the nearest hour) at which each unique note was charted.
note_hours = (
    notes.drop_duplicates(subset=['HADM_ID', 'CHARTTIME'])['CHARTTIME']
    .dt.round('h')  # lowercase alias: uppercase 'H' is deprecated in pandas >= 2.2
    .dt.hour
)

# Explicit figure/axes instead of the pyplot state machine; one bin per hour 0..23.
fig, ax = plt.subplots()
ax.hist(note_hours, bins=np.arange(25))
ax.set_title('Progress Note Time of Day')
ax.set_ylabel('Count')
ax.set_xlabel('Hour')
plt.show()

# Align Timeseries

In [None]:
# Snap every measurement to the end of its 15-minute bin (`next15`) and keep
# only the latest measurement per encounter and bin. Built as a chain that
# returns new frames rather than mutating the frames in place while iterating.
feats = {
    k: (
        df.sort_values(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME'], ascending=True)
        .assign(next15=lambda d: d['CHARTTIME'].dt.ceil("15min"))
        # rows are time-sorted, so keep="last" retains the most recent value in each bin
        .drop_duplicates(subset=['HADM_ID', 'next15'], keep="last")
    )
    for k, df in feats.items()
}

# Notes are snapped to the same 15-minute grid so they can be aligned with the features.
notes = notes.assign(next15=notes['CHARTTIME'].dt.ceil("15min"))

In [None]:
# For every note time of every encounter, write
#   * aligned/notes/<subj>-<enc>-<i>.csv : text of the note(s) in that 15-minute bin
#   * aligned/feats/<subj>-<enc>-<i>.csv : features on a 15-minute grid covering the
#     `rel_window` before the note, indexed by time remaining until the note
# and collect one (SUBJECT_ID, HADM_ID, NOTE_NUM, ALIGNED_TIME) record per pair of files.

# DataFrame.to_csv does not create missing directories, so make sure they exist.
for out_subdir in ('notes', 'feats'):
    os.makedirs(aligned(out_subdir), exist_ok=True)

rel_window = pd.Timedelta("2D")
# Offsets 0, 15min, ... up to (but excluding) rel_window, reversed so the row
# furthest from the note comes first.
window_idx = pd.timedelta_range(start="0D", end=rel_window, freq="15min", closed="left")[::-1]
window_idx.name = "Time to note"

# Output columns: multi-column features keep their column names, single-column
# features are named after the feature key. Loop-invariant, so computed once.
feat_cols = [col if len(cols) > 1 else k for k, cols in feat_names for col in cols]

# Row positions of each encounter, computed once per table instead of scanning
# every table with a boolean mask for every encounter.
note_rows = notes.groupby('HADM_ID').indices
feat_rows = {k: v.groupby('HADM_ID').indices for k, v in feats.items()}
no_rows = np.array([], dtype=np.intp)

data = []
# NOTE: keep iterating `note_encs` itself; the order of `all_records` follows
# this loop and later cells derive the patient split from that order.
for enc in tqdm(note_encs):
    pt_notes = notes.iloc[note_rows[enc]]
    pt_feats = {k: v.iloc[feat_rows[k].get(enc, no_rows)] for k, v in feats.items()}
    subj = pt_notes['SUBJECT_ID'].iloc[0]

    note_times = pt_notes['next15'].drop_duplicates().sort_values().to_list()
    # Each note only sees measurements taken after the previous note. The first
    # note has no predecessor, so its lower bound is the earliest representable
    # timestamp (pd.Timestamp('1500') is out of bounds on nanosecond-only pandas).
    prev_times = [pd.Timestamp.min] + note_times[:-1]
    for i, (start_time, end_time) in enumerate(zip(prev_times, note_times)):
        note_feats = pd.DataFrame(index=window_idx, columns=feat_cols)
        for k, cols in feat_names:
            v = pt_feats[k]
            # measurements in (start_time, end_time]
            before_note = v['next15'].between(start_time, end_time, inclusive='right')
            note_feat = v[before_note]
            time_to_note = end_time - note_feat['next15']
            # discard anything older than the look-back window
            in_window = time_to_note < rel_window
            time_to_note = time_to_note[in_window]
            for col in cols:
                vals = note_feat.loc[in_window, col]
                # re-index by time-to-note so the assignment aligns with window_idx
                vals.index = time_to_note
                note_feats.loc[time_to_note, col if len(cols) > 1 else k] = vals
        # several notes may fall into the same 15-minute bin; they are saved together
        note_dupes = pt_notes.loc[pt_notes['next15'] == end_time, 'TEXT']
        note_dupes.to_csv(aligned(f'notes/{subj}-{enc}-{i}.csv'), index=False)
        note_feats.to_csv(aligned(f'feats/{subj}-{enc}-{i}.csv'))
        data.append((subj, enc, i, end_time))
all_records = pd.DataFrame(data, columns=['SUBJECT_ID', 'HADM_ID', 'NOTE_NUM', 'ALIGNED_TIME'])

In [None]:
# Attach each patient's date of death by SUBJECT_ID. `.map` aligns on the id
# itself, so it does not depend on `all_records` having a default RangeIndex.
dod_by_subject = patients.set_index('SUBJECT_ID')['DOD']
all_records['DOD'] = all_records['SUBJECT_ID'].map(dod_by_subject)
# The note time is floored to the day before subtracting from DOD.
all_records['TIME_TO_DEATH'] = all_records['DOD'] - all_records['ALIGNED_TIME'].dt.floor("1D")
# A missing DOD gives NaT, which compares False and therefore yields label 0.
all_records['60D_MORTALITY'] = (all_records['TIME_TO_DEATH'] < pd.Timedelta('60D')).astype(int)

In [None]:
# Number of note records per 60-day mortality label (1 = TIME_TO_DEATH under 60 days).
all_records['60D_MORTALITY'].value_counts()

In [None]:
# Share of note records falling into each 60-day mortality label.
label_counts = all_records['60D_MORTALITY'].value_counts()
n_records = len(all_records)
label_counts / n_records

In [None]:
# Assign every patient to one of `n_splits` folds so that all records of a
# patient land in the same split.
n_splits = 5
# Unique patients in a reproducible shuffled order (fixed random_state).
temp = all_records['SUBJECT_ID'].drop_duplicates().sample(frac=1, random_state=0).reset_index(drop=True)
pt_split = pd.DataFrame({
    'SUBJECT_ID': temp,
    # round-robin over the shuffled order gives folds of near-equal patient count
    'SPLIT': [i % n_splits for i in range(len(temp))],
})
# Look the split up by SUBJECT_ID and assign a Series (not a one-column
# DataFrame); `.map` aligns on the id rather than on row position.
split_by_subject = pt_split.set_index('SUBJECT_ID')['SPLIT']
all_records['SPLIT'] = all_records['SUBJECT_ID'].map(split_by_subject)

In [None]:
# Number of note records in each split, ordered by split id.
all_records['SPLIT'].value_counts().sort_index()

In [None]:
# 60-day mortality label counts within each split.
all_records.groupby('SPLIT')['60D_MORTALITY'].value_counts()

In [None]:
# Share of each 60-day mortality label within every split. `normalize=True`
# divides by the per-split total directly, avoiding a division between a
# MultiIndex Series and a flat-indexed one whose alignment depends on index names.
all_records.groupby('SPLIT')['60D_MORTALITY'].value_counts(normalize=True)

In [None]:
# Persist the per-note record table (ids, note number, aligned time, label columns, split)
# under the aligned/ directory; the row index is not written.
all_records.to_csv(aligned('all_records.csv'), index=False)

In [None]:
# Reload the saved record table. CSV does not keep dtypes, so ALIGNED_TIME is
# parsed back into datetimes instead of being left as strings.
all_records = pd.read_csv(aligned('all_records.csv'), parse_dates=['ALIGNED_TIME'])

In [None]:
# Remove DOD from the records table (presumably so the later merge with `patients`,
# which also carries DOD, does not produce suffixed duplicates - TODO confirm).
# errors='ignore' keeps the cell re-runnable once the column is already gone.
all_records = all_records.drop(columns='DOD', errors='ignore')

In [None]:
# Inspect the reloaded record table (after the DOD column has been dropped).
all_records

In [None]:
# Attach the patient-level columns to every note record via SUBJECT_ID.
merged = pd.merge(all_records, patients, on='SUBJECT_ID')

In [None]:
# Gender counts over the rows of the patients table (patients that have notes).
patients['GENDER'].value_counts()

In [None]:
# Gender counts over the merged note records, so patients with more records weigh more.
merged['GENDER'].value_counts()

In [None]:
# Age (in 365-day years) at the aligned note time for every merged record.
# `timedelta` is imported in the notebook's import cell.
# The subtraction runs on plain `date` objects rather than Timestamps
# (presumably to avoid nanosecond overflow for DOBs far in the past - TODO confirm).
note_dates = pd.to_datetime(merged['ALIGNED_TIME']).dt.date
birth_dates = pd.to_datetime(merged['DOB']).dt.date
age_years = (note_dates - birth_dates) / timedelta(days=365)
# Arithmetic on object-dtype dates can leave an object-dtype result; coerce to
# a numeric dtype so describe() reports mean/std/quantiles.
age_years = pd.to_numeric(age_years, errors='coerce')
age_years.describe()