In [1]:
# Absolute path of the project checkout. The working directory is switched to
# it below, presumably so the project-local `models.*` imports resolve.
ROOT = "/fs01/home/afallah/odyssey/odyssey"

from typing import Any, Tuple

import torch
import pickle
import os; os.chdir(ROOT)
import pandas as pd

from models.big_bird_cehr.data import PretrainDataset
from models.big_bird_cehr.tokenizer import HuggingFaceConceptTokenizer


# Directory later passed to the tokenizer as `data_dir`.
DATA_ROOT = f"{ROOT}/data/slurm_data/2048/one_month"
# Parquet file inside DATA_ROOT.
DATA_PATH = f"{DATA_ROOT}/fine_test.parquet"
# Labeled patient-sequence parquet, read into `patients` further down.
NEW_DATA_PATH = f"{ROOT}/data/bigbird_data/patient_sequences_2048_labeled.parquet"

In [2]:
# Load the labeled patient sequences and keep only the patients listed under
# the "test" key of the split file.
# NOTE(review): these absolute "/h/afallah/..." paths bypass the ROOT /
# NEW_DATA_PATH constants defined in the first cell; confirm both prefixes
# point at the same checkout before consolidating them.
data = pd.read_parquet("/h/afallah/odyssey/odyssey/data/bigbird_data/patient_sequences_2048_labeled.parquet")

# The split file is a pickle: unpickling can execute arbitrary code, so only
# load it from a trusted location. The context manager closes the file handle,
# which a bare `pickle.load(open(...))` would leave open.
with open('/h/afallah/odyssey/odyssey/data/bigbird_data/dataset_2048_mortality_1month.pkl', 'rb') as split_file:
    patient_ids = pickle.load(split_file)
pre_data = data.loc[data['patient_id'].isin(patient_ids['test'])]

# Build the tokenizer from the vocabulary directory.
tokenizer = HuggingFaceConceptTokenizer(data_dir="/h/afallah/odyssey/odyssey/data/vocab")
tokenizer.fit_on_vocab()

# Load datasets.
# NOTE(review): both datasets are built from the same test-split frame
# (`pre_data`); confirm that is intended and not a placeholder.
train_dataset = PretrainDataset(
    data=pre_data,
    tokenizer=tokenizer,
    max_len=2048,
    mask_prob=0.15,
)

val_dataset = PretrainDataset(
    data=pre_data,
    tokenizer=tokenizer,
    max_len=2048,
    mask_prob=0.15,
)

In [3]:
def mask_tokens(self, sequence: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply BERT-style random masking to a tensor of token ids.

    Every position whose id is greater than the mask token id is selected
    with probability ``self.mask_prob``. Of the selected positions, 80% are
    replaced by the mask token id, 10% by a randomly drawn token id and the
    remaining 10% keep their original id.

    Args:
        sequence: Integer tensor of token ids. It is cloned, not modified.

    Returns:
        A ``(masked_sequence, labels)`` tuple. ``labels`` holds the original
        id at every selected position and -100 everywhere else.
    """
    mask_token_id = self.tokenizer.get_mask_token_id()

    masked_sequence = sequence.clone()

    # Ids <= mask_token_id are never selected (per the original note these are
    # the [PAD], [UNK] and [MASK] tokens).
    # float(): an integer mask_prob (e.g. 1) would make torch.full build an
    # integer tensor, which torch.bernoulli does not accept.
    prob_matrix = torch.full(sequence.shape, float(self.mask_prob))
    prob_matrix[sequence <= mask_token_id] = 0.0
    selected = torch.bernoulli(prob_matrix).bool()

    # 80% of the selected positions become the mask token.
    replaced = torch.bernoulli(torch.full(selected.shape, 0.8)).bool() & selected
    masked_sequence[replaced] = mask_token_id

    # Half of the remaining 20% (i.e. 10% of all selected positions) receive a
    # random token id. Drawing with p=0.1 here would only randomize ~2%.
    randomized = torch.bernoulli(torch.full(selected.shape, 0.5)).bool() & selected & ~replaced
    # NOTE(review): `high` is exclusive in torch.randint, so the id returned by
    # get_last_token_index() itself is never drawn -- confirm this is intended.
    random_idx = torch.randint(low=self.tokenizer.get_first_token_index(),
                               high=self.tokenizer.get_last_token_index(),
                               size=sequence.shape, dtype=torch.long)
    masked_sequence[randomized] = random_idx[randomized]

    # Positions that were not selected get -100 as their label.
    labels = torch.where(selected, sequence, -100)

    return masked_sequence, labels

In [14]:
# Number of distinct values in the 'type_ids' entry of the first dataset sample.
len(set(train_dataset[0]['type_ids'].tolist()))

9

In [None]:
# Read the parquet file at NEW_DATA_PATH and display the resulting frame.
patients = pd.read_parquet(NEW_DATA_PATH)
patients

In [3]:
# Fit a tokenizer on the vocabulary found under DATA_ROOT.
tokenizer = HuggingFaceConceptTokenizer(data_dir=DATA_ROOT)
tokenizer.fit_on_vocab()

# Wrap the full `patients` frame in a pretraining dataset
# (sequence length 2048, 15% masking probability).
train_dataset = PretrainDataset(data=patients, tokenizer=tokenizer, max_len=2048, mask_prob=0.15)

In [6]:
# Two hand-written example sequences; e2 is a prefix of e1.
# NOTE(review): neither name is used again in the visible part of the notebook.
e1 = "[CLS] [VS] 00054853516 00245008201 00338004904 00008084199 00045152510 00006003121"
e2 = "[CLS] [VS] 00054853516 00245008201"

In [38]:
# Call the tokenizer on the first patient's "event_tokens_2048" entry and show the result.
tokenizer(patients["event_tokens_2048"].iloc[0])

{'input_ids': tensor([[    5,     0,     0,  ...,     0,     0,     0],
        [    3,     0,     0,  ...,     0,     0,     0],
        [12809,     0,     0,  ...,     0,     0,     0],
        ...,
        [ 1352,     0,     0,  ...,     0,     0,     0],
        [    4,     0,     0,  ...,     0,     0,     0],
        [    6,     0,     0,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0]])}

In [None]:
# ConceptTokenizer is not among the imports in the notebook's first cell, so
# it is imported here to keep this cell runnable in top-to-bottom order.
from models.big_bird_cehr.tokenizer import ConceptTokenizer

# Drop patients whose "event_tokens_2048" entry is null.
patients = patients[patients["event_tokens_2048"].notnull()]

tokenizer = ConceptTokenizer(data_dir=DATA_ROOT)
tokenizer.fit_on_vocab()

# mask_prob=1 requests masking with probability 1 (useful for inspecting the
# masking output rather than for training -- presumably; confirm).
train_dataset = PretrainDataset(
    data=patients,
    tokenizer=tokenizer,
    max_len=2048,
    mask_prob=1,
)

In [None]:
# Decode the single id 3 (passed as a nested list) to see what it maps to.
tokenizer.decode([[3]])

In [None]:
# Show the second row of the patients frame.
patients.iloc[1]

In [None]:
# Show the first sample produced by the dataset.
train_dataset[0]

In [None]:
# Print the "attention_mask" entry of the first sample.
print(train_dataset[0]["attention_mask"])

In [None]:
# Show the id the tokenizer reports for its mask token.
tokenizer.get_mask_token_id()

In [None]:
# Count how many entries of sample 110's "concept_ids" equal 20569.
# NOTE(review): 20569 is presumably the mask token id for this vocabulary -- confirm.
print(list(train_dataset[110]["concept_ids"]).count(20569))

In [None]:
# Length of sample 110's "concept_ids".
print(len(train_dataset[110]["concept_ids"]))

In [None]:
# Show the id the tokenizer reports for its pad token.
tokenizer.get_pad_token_id()

In [None]:
# Encode the literal "[PAD]" token (presumably to compare against the pad token id).
tokenizer.encode(["[PAD]"])

In [None]:
# Show the first patient's "event_tokens_2048" entry.
patients.iloc[0]["event_tokens_2048"]

In [None]:
ROOT = "/fs01/home/afallah/odyssey/odyssey"

import os


os.chdir(ROOT)
import numpy as np
import pandas as pd
from tqdm import tqdm

from models.big_bird_cehr.data import PretrainDataset
from models.big_bird_cehr.tokenizer import ConceptTokenizer


# Directory later passed to the tokenizer as `data_dir`.
DATA_ROOT = f"{ROOT}/data/slurm_data/2048/one_month"
# Parquet file inside DATA_ROOT that is loaded into `patients` below.
DATA_PATH = f"{DATA_ROOT}/fine_test.parquet"
patients = pd.read_parquet(DATA_PATH)
# NOTE(review): a bare expression is only rendered when it is the last
# statement of a cell; more statements follow here, so it may show nothing.
patients
# Gather every distinct event token (special tokens included) that occurs in
# any patient's sequence, then order the result in descending sort order.
unique_event_tokens = set()

for patient_event_tokens in tqdm(
        patients["event_tokens_2048"].values, desc="Loading Tokens", unit=" Patients",
):
    unique_event_tokens.update(patient_event_tokens)

unique_event_tokens = sorted(unique_event_tokens, reverse=True)

print(
    f"Complete list of unique event tokens\nLength: {len(unique_event_tokens)}\nHead: {unique_event_tokens[:30]}...",
)
# Tokens to exclude from the feature vocabulary.
# NOTE(review): "[VS]" is left out of this list, so it is kept as a feature
# token -- presumably on purpose; confirm.
special_tokens = ["[CLS]", "[PAD]", "[VE]", "[W_0]", "[W_1]", "[W_2]", "[W_3]"]
special_tokens += [f"[M_{i}]" for i in range(0, 13)]
special_tokens.append("[LT]")

# Everything that is not a listed special token counts as a feature token;
# the order of `unique_event_tokens` is preserved.
feature_event_tokens = list(
    filter(lambda token: token not in special_tokens, unique_event_tokens)
)

print(len(feature_event_tokens), feature_event_tokens[:20])
# Per-visit co-occurrence statistics over the feature vocabulary.
#
# token_correlations[i][j]: summed over every visit that contains token i,
#     the number of times token j occurs in that visit.
# token_frequencies[i]: number of visits that contain token i.
#
# A visit is the slice patient[vs:ve] between a "[VS]" marker (included) and
# the "[VE]" marker paired with it by position (excluded).
#
# The statistics are accumulated in a single pass over all visits instead of
# rescanning every patient once per vocabulary token; the resulting matrix
# and frequency list are the same.
patients_event_tokens = patients["event_tokens_2048"]
len_vocab = len(feature_event_tokens)
token2id = {token: i for i, token in enumerate(feature_event_tokens)}
token_correlations = np.zeros(shape=(len_vocab, len_vocab))
visit_counts = np.zeros(len_vocab, dtype=int)

for patient in tqdm(patients_event_tokens, desc="Analyzing... ", unit=" Patients"):

    vs_id = np.where(patient == "[VS]")[0]
    ve_id = np.where(patient == "[VE]")[0]

    for vs, ve in zip(vs_id, ve_id):
        # Raises KeyError if a visit holds a token outside the feature
        # vocabulary, exactly like a direct token2id lookup would.
        visit_ids = [token2id[visit_token] for visit_token in patient[vs:ve]]

        if not visit_ids:
            continue

        # present_ids: distinct tokens of this visit; occurrences: how often
        # each of them appears in the visit.
        present_ids, occurrences = np.unique(visit_ids, return_counts=True)

        visit_counts[present_ids] += 1
        # Each present token's row gains the visit's per-token occurrence
        # counts (broadcast across the selected rows).
        token_correlations[np.ix_(present_ids, present_ids)] += occurrences

# Plain list of ints, ordered like feature_event_tokens.
token_frequencies = visit_counts.tolist()
# Keep only the patients whose "event_tokens_2048" entry is not null.
has_tokens = patients["event_tokens_2048"].notna()
patients = patients.loc[has_tokens]

# Fit the concept tokenizer on the vocabulary under DATA_ROOT.
tokenizer = ConceptTokenizer(data_dir=DATA_ROOT)
tokenizer.fit_on_vocab()

# Pretraining dataset over the filtered patients with a masking probability of 1.
train_dataset = PretrainDataset(data=patients, tokenizer=tokenizer, max_len=2048, mask_prob=1)