In [None]:
from tokenizers import Tokenizer, models
import os
import pandas as pd
import pickle
import numpy as np

In [None]:
# Directory the generated `*_tokenizer.json` files are written to.
OUTPUT_DIR = "data"

# Directory holding the raw MC-MED CSV files read by `read_csv_fn`.
# NOTE(review): `DATA_DIR` was referenced by `read_csv_fn` but never defined in
# this notebook, so a fresh kernel failed with NameError. "data" mirrors the
# root used by the other input paths in this notebook — TODO confirm and point
# this at the real MC-MED location.
DATA_DIR = "data"

MC-MED

In [None]:
# Shared name(s) that each source's selected column(s) are renamed to,
# matched by position.
COLS_NAME = ['event']

# For every source CSV (keyed by file stem), the column(s) to load from it.
COLS = dict(
    labs=['Component_name'],
    numerics=['Measure'],
    orders=['Procedure_ID'],
    rads=['Study'],
)

In [None]:
def read_csv_fn(fn):
    """Load one source CSV and rename its selected column(s) to the shared name(s).

    Args:
        fn: Key into ``COLS``; also the stem of the CSV file read from
            ``DATA_DIR`` (``<DATA_DIR>/<fn>.csv``).

    Returns:
        A DataFrame holding ``CSN`` plus the columns listed in ``COLS[fn]``,
        the latter renamed positionally to the entries of ``COLS_NAME``.
    """
    source_cols = COLS[fn]
    csv_path = os.path.join(DATA_DIR, f'{fn}.csv')
    frame = pd.read_csv(csv_path, usecols=['CSN'] + source_cols)

    # Pair each source column with the shared name at the same position.
    rename_map = {}
    for position, source_col in enumerate(source_cols):
        rename_map[source_col] = COLS_NAME[position]
    return frame.rename(columns=rename_map)

In [None]:
def create_from_data(FNAME, exclude_demo=False):
    """Build and save a WordLevel tokenizer from the MC-MED labs, numerics and orders CSVs.

    The vocabulary is the seven special tokens followed by the distinct tokens
    in sorted order, so the token -> id mapping is identical on every run
    (plain ``set`` iteration order changes between Python processes).

    Args:
        FNAME: Prefix of the output file; the tokenizer is written to
            ``<OUTPUT_DIR>/<FNAME>_tokenizer.json``.
        exclude_demo: When True, the vocabulary is the distinct non-null
            ``event`` values. When False, the frame must additionally carry
            ``input``, ``age``, ``sex`` and ``race`` columns.

    Raises:
        KeyError: If ``exclude_demo`` is False and any of the columns above is
            missing from the concatenated frame.
    """
    labs_df = read_csv_fn('labs')
    vitals_df = read_csv_fn('numerics')
    orders_df = read_csv_fn('orders')
    # NOTE(review): COLS also declares 'rads', which is not loaded here —
    # confirm whether that omission is intentional.
    df = pd.concat([labs_df, vitals_df, orders_df])

    if exclude_demo:
        # Drop missing events BEFORE the string cast; otherwise NaN becomes the
        # literal token "nan" and a later `is not None` filter cannot remove it.
        tokens = set(df['event'].dropna().astype(str))
    else:
        # The frames loaded above only carry 'CSN' and 'event', so fail with an
        # explicit message rather than an opaque KeyError on 'input'.
        required = ['input', 'age', 'sex', 'race']
        missing = [col for col in required if col not in df.columns]
        if missing:
            raise KeyError(
                f"exclude_demo=False needs columns {missing} that the MC-MED "
                "frame does not contain; pass exclude_demo=True or add the "
                "demographic data before calling create_from_data"
            )
        tokens = set().union(*df['input'])
        for col in ('age', 'sex', 'race'):
            tokens.update(df[col].unique())
        tokens.discard(None)

    special_list = ['[UNK]', '[BOS]', '[EOS]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
    reserved = set(special_list)
    # Skip data tokens that collide with a special token: a repeated key would
    # overwrite the special's id and leave a gap in the id range.
    vocab_list = special_list + sorted(
        (tok for tok in tokens if tok not in reserved), key=str
    )

    # Map each token to its position: specials take ids 0-6, data tokens follow.
    vocab = {token: idx for idx, token in enumerate(vocab_list)}

    # WordLevel tokenizer over the custom vocab; unseen tokens map to [UNK].
    tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token="[UNK]"))

    tokenizer.save(os.path.join(OUTPUT_DIR, f'{FNAME}_tokenizer.json'))

In [None]:
def create_from_df(FNAME, exclude_demo=False):
    """Build and save a WordLevel tokenizer from ``data/decomp_data.parquet``.

    The vocabulary is the seven special tokens followed by the distinct
    non-null ``eventval`` strings in sorted order, so token ids are identical
    on every run.

    NOTE(review): a later cell in this notebook defines another
    ``create_from_df``; whichever definition was executed last is the one a
    call resolves to. Consider giving the two variants distinct names.

    Args:
        FNAME: Prefix of the output file; the tokenizer is written to
            ``<OUTPUT_DIR>/<FNAME>_tokenizer.json``.
        exclude_demo: Accepted for signature compatibility; not used here.
    """
    # Only the 'eventval' column is needed to build the vocabulary.
    df = pd.read_parquet('data/decomp_data.parquet', columns=['eventval'])

    # Drop nulls BEFORE the string cast; afterwards they would already be the
    # literal tokens "None"/"nan" and an `is not None` filter is a no-op.
    tokens = set(df['eventval'].dropna().astype(str))

    special_list = ['[UNK]', '[BOS]', '[EOS]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
    reserved = set(special_list)
    # Sorted for a reproducible id assignment; skip collisions with specials so
    # no id in the range is left unused.
    vocab_list = special_list + sorted(tok for tok in tokens if tok not in reserved)

    # Map each token to its position: specials take ids 0-6, data tokens follow.
    vocab = {token: idx for idx, token in enumerate(vocab_list)}

    # WordLevel tokenizer over the custom vocab; unseen tokens map to [UNK].
    tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token="[UNK]"))

    tokenizer.save(os.path.join(OUTPUT_DIR, f'{FNAME}_tokenizer.json'))

In [None]:
def create_from_df(FNAME, exclude_demo=False, n_shards=34):
    """Build and save a WordLevel tokenizer from sharded ``data/output_<i>.parquet`` files.

    The vocabulary is the seven special tokens followed by the distinct
    non-null ``eventval`` strings across all shards, in sorted order, so token
    ids are identical on every run.

    Args:
        FNAME: Prefix of the output file; the tokenizer is written to
            ``<OUTPUT_DIR>/<FNAME>_tokenizer.json``.
        exclude_demo: Accepted for signature compatibility; not used here.
        n_shards: Number of shard files to read, ``output_0.parquet`` through
            ``output_<n_shards - 1>.parquet``. Defaults to the 34 shards the
            notebook was written for.
    """
    tokens = set()
    for shard_idx in range(n_shards):
        # Only the 'eventval' column is needed to build the vocabulary.
        shard = pd.read_parquet(f'data/output_{shard_idx}.parquet', columns=['eventval'])
        # Accumulate directly into one set rather than rebuilding a list each pass.
        tokens.update(shard['eventval'].dropna().astype(str))

    special_list = ['[UNK]', '[BOS]', '[EOS]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
    reserved = set(special_list)
    # Sorted for a reproducible id assignment; skip collisions with specials so
    # no id in the range is left unused.
    vocab_list = special_list + sorted(tok for tok in tokens if tok not in reserved)

    # Map each token to its position: specials take ids 0-6, data tokens follow.
    vocab = {token: idx for idx, token in enumerate(vocab_list)}

    # WordLevel tokenizer over the custom vocab; unseen tokens map to [UNK].
    tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token="[UNK]"))

    tokenizer.save(os.path.join(OUTPUT_DIR, f'{FNAME}_tokenizer.json'))

In [None]:
# Writes <OUTPUT_DIR>/decomp_tokenizer.json.
# NOTE(review): `create_from_df` is defined in more than one cell; this call
# uses whichever definition was executed most recently — confirm it is the
# intended one.
create_from_df(FNAME="decomp", exclude_demo=True)

EHRSHOT

MIMIC: ED

In [None]:
# Load the three CSVs under data/transfer/ (demo, med, numerics), in that order.
demo_df, med_df, numerics_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/transfer/demo.csv',
        'data/transfer/med.csv',
        'data/transfer/numerics.csv',
    )
]

In [None]:
# Distinct demographic values, concatenated in race -> gender -> age column order.
demo_vocab = [
    value
    for column in ('race_str', 'gender_str', 'age_str')
    for value in demo_df[column].unique()
]
# Distinct `eventval` entries of the med and numerics tables.
med_vocab = [*med_df['eventval'].unique()]
num_vocab = [*numerics_df['eventval'].unique()]

In [None]:
# Merge the three vocab sources into one ordered list of unique string tokens.
# - pd.read_csv represents empty cells as NaN (never None), so an `is None`
#   test lets missing values through; pd.notna rejects both None and NaN.
# - dict.fromkeys drops repeats while keeping first-seen order; a token listed
#   twice would otherwise leave a gap when ids are assigned by position.
vocab_list = list(dict.fromkeys(
    str(v) for v in demo_vocab + med_vocab + num_vocab if pd.notna(v)
))

In [None]:
special_list = ['[UNK]', '[BOS]', '[EOS]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
# Put the special tokens first and keep only the first occurrence of every
# token. The de-duplication makes this cell safe to re-run (the specials are
# not stacked a second time, which would move them off ids 0-6) and keeps the
# ids contiguous, because a repeated token would overwrite its earlier index
# in the mapping below.
vocab_list = list(dict.fromkeys(special_list + vocab_list))
# Map each token to its position in the list.
vocab = {token: idx for idx, token in enumerate(vocab_list)}

# WordLevel tokenizer over the custom vocab; unseen tokens map to [UNK].
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token="[UNK]"))

tokenizer.save(os.path.join(OUTPUT_DIR, 'ed_eventval_tokenizer.json'))

MIMIC: HOSP + ICU

In [None]:
# Demographics table for this section; note this rebinds `demo_df`.
demo_df = pd.read_parquet(path='data/mortality/demographics.parquet')

In [None]:
# Distinct demographic values, concatenated in
# gender -> race -> ethnicity -> age column order.
demo_vocab = [
    value
    for column in ('gender_str', 'race_str', 'ethnicity_str', 'age_str')
    for value in demo_df[column].unique()
]

In [None]:
# Collect the distinct non-null `eventval` strings across the numbered shards
# data/mortality/0_final.parquet ... 36_final.parquet.
N_MORTALITY_SHARDS = 37
eventval_set = set()
for i in range(N_MORTALITY_SHARDS):
    df = pd.read_parquet(f'data/mortality/{i}_final.parquet', columns=['eventval'])
    # Drop nulls before the string cast so no "None"/"nan" token is created,
    # and accumulate into one set rather than rebuilding a list each pass.
    eventval_set.update(df['eventval'].dropna().astype(str))
# Sorted so the resulting token ids are identical on every run; plain set
# iteration order changes between Python processes.
eventvals = sorted(eventval_set)

In [None]:
# Merge demographic and event tokens into one ordered list of unique strings.
# Nulls are dropped (pd.notna rejects both None and NaN) and repeats removed
# with first-seen order kept; a value shared by two demographic columns would
# otherwise leave a gap when ids are assigned by position.
vocab_list = list(dict.fromkeys(
    str(v) for v in demo_vocab + eventvals if pd.notna(v)
))

In [None]:
special_list = ['[UNK]', '[BOS]', '[EOS]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
# Put the special tokens first and keep only the first occurrence of every
# token. The de-duplication makes this cell safe to re-run (the specials are
# not stacked a second time, which would move them off ids 0-6) and keeps the
# ids contiguous, because a repeated token would overwrite its earlier index
# in the mapping below.
vocab_list = list(dict.fromkeys(special_list + vocab_list))

# Map each token to its position in the list.
vocab = {token: idx for idx, token in enumerate(vocab_list)}

# WordLevel tokenizer over the custom vocab; unseen tokens map to [UNK].
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token="[UNK]"))

tokenizer.save(os.path.join(OUTPUT_DIR, 'eventval_tokenizer.json'))