## Synthetic dataset generation -- Sequence based
**Author: Lin Lee Cheong <br>
Updated by: Tesfagabir Meharizghi<br>
Date created: 12/12/2020 <br>
Date updated: 02/04/2021 <br>**

The goal of this notebook is to generate synthetic datasets that help us understand how different relationships between tokens affect attention, SHAP, and other interpretability measures. The datasets vary along the following dimensions:
- length of events (30, 300)
- spacing between 2+ coupled events, i.e. order of sequence matters
- amount of noise, i.e. performance vs interpretability
- vocabulary space

### Sequence dataset

The positive label is driven by the order in which the three signal tokens appear in a sequence.
- Each ordering and its probability of a positive label:

Unhelper(U) -> Helper(H) -> Adverse(A) ==> 99%<br>
Unhelper(U) -> Adverse(A) -> Helper(H) ==> 80%<br>
Helper(H) -> Unhelper(U) -> Adverse(A) ==> 60%<br>
Adverse(A) -> Unhelper(U) -> Helper(H) ==> 40%<br>
Helper(H) -> Adverse(A) -> Unhelper(U) ==> 20%<br>
Adverse(A) -> Helper(H) -> Unhelper(U) ==> 1%

In [98]:
%load_ext lab_black

%load_ext autoreload

%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
import os
import random
import string

import numpy as np
import pandas as pd
import yaml

from utils import *

In [100]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [101]:
# YAML file that defines the token groups (read by load_tokens).
TOKEN_NAMES_FP = "./tokens_v2.yaml"

# Number of token positions per generated sequence; shorter event runs are
# left-padded with "<pad>" up to this length.
SEQ_LEN = 300

# Output CSV paths, one sub-directory per sequence length.
TRAIN_FP = "data/seq_final_v2/{}/train.csv".format(SEQ_LEN)
VAL_FP = "data/seq_final_v2/{}/val.csv".format(SEQ_LEN)
TEST_FP = "data/seq_final_v2/{}/test.csv".format(SEQ_LEN)

# Name of the unique-id column appended to every generated row.
UID_COLNAME = "patient_id"

# Rows generated PER sequence pattern; with 6 patterns per split the trailing
# comments give the resulting split totals.
TRAIN_NROWS = 3000  # 18000 rows in total (x6 patterns)
VAL_NROWS = 1000  # 6000 rows in total (x6 patterns)
TEST_NROWS = 1000  # 6000 rows in total (x6 patterns)

# Length argument passed to get_uid for every generated id.
UID_LEN = 10

In [102]:
# Read the token groups from the YAML file and report the size of each group
tokens = load_tokens(TOKEN_NAMES_FP)
for group_name, group_tokens in tokens.items():
    print(f"{group_name}: {len(group_tokens)} tokens")

adverse_tokens: 10 tokens
adverse_helper_tokens: 10 tokens
adverse_unhelper_tokens: 10 tokens
noise_tokens: 15 tokens


In [103]:
# Show the full content of every token group, each followed by a separator rule
separator = "-" * 50
for group_name, group_tokens in tokens.items():
    print(group_name, group_tokens, separator, sep="\n")

adverse_tokens
['Acute_Myocardial_Infarction_A', 'hypertension_A', 'arrhythmia_A', 'congestive_heart_failure_A', 'heart_valve_failure_A', 'pulmonary_embolism_A', 'ventricular_aneurysm_A', 'ventricular_hypertrophy_A', 'cardiomyopathy_A', 'Chronic_Obstructive_Pulmonary_Disease_A']
--------------------------------------------------
adverse_helper_tokens
['sleep_apnea_H', 'pneumonia_H', 'coronary_artery_disease_H', 'edema_H', 'troponin_H', 'Brain_Natriuretic_Peptide_H', 'alchoholism_H', 'metabolic_disorder_H', 'elevated_creatinine_H', 'electrolyte_imbalance_H']
--------------------------------------------------
adverse_unhelper_tokens
['percutaneous_coronary_intervention_U (PCI_U)', 'electrical_cardioversion_U', 'catheter_ablation_U', 'pacemaker_U', 'pacemaker_U', 'sleep_apnea_treatment_U', 'ACE_inhibitors_U', 'ARB_U', 'diuretics_U', 'beta_blockers_U']
--------------------------------------------------
noise_tokens
['eye_exam_N', 'annual_physical_N', 'hay_fever_N', 'headache_N', 'foot_pain

Total number of observations

In [104]:
# Expected class balance of the training split: each of the six patterns
# contributes TRAIN_NROWS rows, weighted by its positive-label probability.
x = TRAIN_NROWS
total = 6 * x
pos_lab = 0.0
for pattern_proba in (0.99, 0.8, 0.6, 0.4, 0.2, 0.01):
    pos_lab += x * pattern_proba
neg_lab = total - pos_lab
print(f"#pos: {pos_lab}, #neg: {neg_lab}")

#pos: 9000.0, #neg: 9000.0


In [105]:
# Probability of a positive label for every ordering of the signal tokens.
# key   --> order in which adverse (A), helper (H) and unhelper (U) appear
# value --> probability of a positive label for that ordering
SEQ_POSITIVE_PROBA = {
    "UHA": 0.99,
    "UAH": 0.80,
    "HUA": 0.60,
    "AUH": 0.40,
    "HAU": 0.20,
    "AHU": 0.01,
}


def make_count_dict(n_rows, positive_proba=SEQ_POSITIVE_PROBA):
    """Build the per-split generation plan.

    Args:
        n_rows (int): number of rows to generate for each ordering.
        positive_proba (dict): ordering -> probability of a positive label.

    Returns:
        dict: ordering -> (probability of positive label, number of rows),
        in the same key order as ``positive_proba``.
    """
    return {seq: (proba, n_rows) for seq, proba in positive_proba.items()}


# One plan per split; only the number of rows per ordering differs.
train_count_dict = make_count_dict(TRAIN_NROWS)
val_count_dict = make_count_dict(VAL_NROWS)
test_count_dict = make_count_dict(TEST_NROWS)

In [106]:
# Abbreviation used in the sequence patterns -> name of its token group
token_mappings = dict(
    A="adverse_tokens",
    H="adverse_helper_tokens",
    U="adverse_unhelper_tokens",
)

In [107]:
# Token abbreviations and token groups
# NOTE(review): this cell repeats the identical `token_mappings` definition from
# the previous cell; re-running it is harmless, and one of the two can be removed.
token_mappings = {
    "A": "adverse_tokens",
    "H": "adverse_helper_tokens",
    "U": "adverse_unhelper_tokens",
}

In [108]:
def get_a_sequence_seq_v2(seq_len, label, tokens, proba, token_mappings, seq_tokens):
    """Create one left-padded token sequence followed by its simulated label.

    One token is drawn for every abbreviation in ``seq_tokens`` and the drawn
    tokens are inserted, in that order, at random positions inside a run of
    noise tokens. The result is left-padded with ``"<pad>"`` up to ``seq_len``.

    Args:
        seq_len (int): number of token positions in the returned sequence.
        label: target label forwarded to ``get_label``.
        tokens (dict): token group name -> list of tokens; must contain every
            group referenced through ``token_mappings`` plus ``"noise_tokens"``.
        proba (float): probability forwarded to ``get_label``.
        token_mappings (dict): abbreviation (e.g. ``"A"``) -> token group name.
        seq_tokens (str): ordered abbreviations of the signal tokens, e.g. ``"UHA"``.

    Returns:
        list: a flat list of ``seq_len`` tokens followed by the simulated label
        (``seq_len + 1`` items in total).

    Raises:
        ValueError: if ``seq_len`` cannot hold the minimum number of events.
            Previously such calls silently returned an over-long sequence.
    """
    n_seq_tokens = len(seq_tokens)

    # At least 10 real events per sequence, and always enough noise slots to
    # give every signal token its own insertion position.
    min_events = max(10, 2 * n_seq_tokens)
    if seq_len < min_events:
        raise ValueError(
            f"seq_len={seq_len} is too short: at least {min_events} positions "
            f"are needed for {n_seq_tokens} signal tokens"
        )

    # Total number of real (non-pad) events, then the noise share of it.
    n_events = max(
        min_events, random.choices(range(n_seq_tokens, seq_len), k=1)[0]
    )
    n_noise = n_events - n_seq_tokens

    # Ascending positions keep the signal tokens in the order of seq_tokens.
    sel_positions = sorted(random.sample(range(n_noise), k=n_seq_tokens))
    sel_tokens = [
        random.choices(tokens[token_mappings[key]])[0] for key in seq_tokens
    ]
    events = get_tokens(seq_len, tokens, "noise_tokens", n_noise)

    for position, event in zip(sel_positions, sel_tokens):
        events.insert(position, event)

    # Left-pad up to the requested length.
    events = ["<pad>"] * (seq_len - len(events)) + events
    sim_lab = get_label(proba, target=label)

    return events + [sim_lab]


def get_sequences_v2(
    seq_len,
    label,
    uid_len,
    uid_colname,
    tokens,
    proba,
    token_mappings,
    seq_tokens,
    n_seq,
):
    """Generate ``n_seq`` sequences of one pattern and return them as a DataFrame.

    Every row holds the sequence produced by ``get_a_sequence_seq_v2`` (tokens
    followed by the label) plus a unique id from ``get_uid``. The token columns
    are named ``str(seq_len - 1)`` down to ``"0"``, followed by ``"label"`` and
    ``uid_colname``.
    """
    rows = []
    for _ in range(n_seq):
        tokens_and_label = get_a_sequence_seq_v2(
            seq_len, label, tokens, proba, token_mappings, seq_tokens
        )
        rows.append([*tokens_and_label, get_uid(uid_len)])
    print("seq based events generated")

    # Positions count down so that column "0" is the last token of the sequence.
    position_cols = list(map(str, reversed(range(seq_len))))

    seq_df = pd.DataFrame(rows)
    seq_df.columns = position_cols + ["label", uid_colname]

    return seq_df


def get_sequence_dataset(
    seq_len, uid_len, uid_colname, count_dict, tokens, token_mappings
):
    """Generate a shuffled toy dataset covering every sequence pattern.

    Arg:
    -----
        seq_len (int) : length of the generated sequence
        uid_len (int) : length of uid token
        uid_colname (str) : name of uid column, usually patient_id
        count_dict (dict) : sequence pattern (e.g. "UHA") ->
            (probability of positive label, number of rows to generate)
        tokens (dict) : dictionary of the various token types
        token_mappings (dict) : pattern abbreviation -> token group name

    Returns:
    --------
        dataset (dataframe) : all generated rows in random order. Besides
            the columns produced by get_sequences_v2 it carries "seq_event"
            (the pattern of the row) and "index" (the row number within
            its pattern block before shuffling).

    """
    target_label = 1
    frames = []
    for pattern, (proba, n_rows) in count_dict.items():
        pattern_df = get_sequences_v2(
            seq_len,
            target_label,
            uid_len,
            uid_colname,
            tokens,
            proba,
            token_mappings,
            pattern,
            n_rows,
        )
        frames.append(pattern_df.assign(seq_event=pattern))

    # reset_index() (without drop) keeps the per-pattern row number as "index".
    dataset = pd.concat(frames, axis=0).reset_index()

    # Shuffle the rows by permuting their positions.
    order = list(range(dataset.shape[0]))
    random.shuffle(order)
    dataset = dataset.iloc[order, :]

    print(f"dataset: {dataset.shape}")
    print(f"ratio:\n{dataset.label.value_counts(normalize=True)}\n")

    return dataset

In [109]:
# Seed the random number generators so that re-running the notebook regenerates
# identical train/val/test splits (and identical CSV files).
# NOTE(review): the helpers imported from `utils` are not visible here; seeding
# both `random` and NumPy covers either generator they may rely on -- confirm.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Arguments shared by all three splits; only the count dict differs.
generation_kwargs = dict(
    seq_len=SEQ_LEN,
    uid_len=UID_LEN,
    uid_colname=UID_COLNAME,
    tokens=tokens,
    token_mappings=token_mappings,
)

df_train = get_sequence_dataset(count_dict=train_count_dict, **generation_kwargs)

df_val = get_sequence_dataset(count_dict=val_count_dict, **generation_kwargs)

df_test = get_sequence_dataset(count_dict=test_count_dict, **generation_kwargs)

seq based events generated
seq based events generated
seq based events generated
seq based events generated
seq based events generated
seq based events generated
dataset: (18000, 304)
ratio:
0    0.502556
1    0.497444
Name: label, dtype: float64

seq based events generated
seq based events generated
seq based events generated
seq based events generated
seq based events generated
seq based events generated
dataset: (6000, 304)
ratio:
0    0.502667
1    0.497333
Name: label, dtype: float64

seq based events generated
seq based events generated
seq based events generated
seq based events generated
seq based events generated
seq based events generated
dataset: (6000, 304)
ratio:
0    0.500333
1    0.499667
Name: label, dtype: float64



In [110]:
# Sanity check: 6 patterns x TRAIN_NROWS rows; SEQ_LEN token columns plus
# "index", "label", the uid column and "seq_event".
print(df_train.shape)
df_train.head()

(18000, 304)


Unnamed: 0,index,299,298,297,296,295,294,293,292,291,...,6,5,4,3,2,1,0,label,patient_id,seq_event
11750,2750,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ingrown_nail_N,foot_pain_N,ACL_tear_N,backache_N,backache_N,ingrown_nail_N,annual_physical_N,0,WZ43OZQ4CU,AUH
6377,377,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,pacemaker_U,pulmonary_embolism_A,ingrown_nail_N,ingrown_nail_N,ankle_sprain_N,backache_N,headache_N,0,LEJOOHMUV9,HUA
14948,2948,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,cut_finger_N,eye_exam_N,annual_physical_N,ACL_tear_N,myopia_N,ingrown_nail_N,quad_injury_N,0,BL7L1RRQTH,HAU
17150,2150,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ingrown_nail_N,hay_fever_N,ankle_sprain_N,annual_physical_N,ingrown_nail_N,ACL_tear_N,backache_N,0,03K19M3DRP,AHU
3717,717,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,peanut_allergy_N,ankle_sprain_N,ankle_sprain_N,ingrown_nail_N,myopia_N,peanut_allergy_N,quad_injury_N,0,QAG79HLPXH,UAH


In [111]:
# Label counts for the UHA pattern (generated with a 0.99 positive probability)
df_train.loc[df_train["seq_event"] == "UHA", "label"].value_counts()

1    2959
0      41
Name: label, dtype: int64

In [112]:
# Peek at the first UHA row of the shuffled training split
is_uha = df_train["seq_event"] == "UHA"
df_train[is_uha].iloc[0]

index                 661
299                 <pad>
298                 <pad>
297                 <pad>
296                 <pad>
                 ...     
1             hay_fever_N
0              ACL_tear_N
label                   1
patient_id     7IP3BT44VF
seq_event             UHA
Name: 661, Length: 304, dtype: object

In [113]:
# Number of training rows per sequence pattern
df_train["seq_event"].value_counts()

AHU    3000
UAH    3000
UHA    3000
AUH    3000
HAU    3000
HUA    3000
Name: seq_event, dtype: int64

In [114]:
# Persist the three splits to their configured CSV paths
for split_df, split_fp in (
    (df_train, TRAIN_FP),
    (df_val, VAL_FP),
    (df_test, TEST_FP),
):
    save_csv(split_df, split_fp)

In [115]:
# Round-trip check: reload the saved training split and confirm its shape
df = pd.read_csv(TRAIN_FP)
print(df.shape)
df.head()

(18000, 304)


Unnamed: 0,index,299,298,297,296,295,294,293,292,291,...,6,5,4,3,2,1,0,label,patient_id,seq_event
0,2750,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ingrown_nail_N,foot_pain_N,ACL_tear_N,backache_N,backache_N,ingrown_nail_N,annual_physical_N,0,WZ43OZQ4CU,AUH
1,377,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,pacemaker_U,pulmonary_embolism_A,ingrown_nail_N,ingrown_nail_N,ankle_sprain_N,backache_N,headache_N,0,LEJOOHMUV9,HUA
2,2948,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,cut_finger_N,eye_exam_N,annual_physical_N,ACL_tear_N,myopia_N,ingrown_nail_N,quad_injury_N,0,BL7L1RRQTH,HAU
3,2150,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ingrown_nail_N,hay_fever_N,ankle_sprain_N,annual_physical_N,ingrown_nail_N,ACL_tear_N,backache_N,0,03K19M3DRP,AHU
4,717,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,peanut_allergy_N,ankle_sprain_N,ankle_sprain_N,ingrown_nail_N,myopia_N,peanut_allergy_N,quad_injury_N,0,QAG79HLPXH,UAH


In [116]:
# Class balance of the reloaded training split
df["label"].value_counts(normalize=True)

0    0.502556
1    0.497444
Name: label, dtype: float64