## Synthetic dataset generation -- Sequence based
Author: Lin Lee Cheong <br>
Date created: 12/12/2020 <br>
Date updated: 1/31/2021 <br> <br>

The goal of this notebook is to create synthetic datasets that help us understand how different relationships between tokens affect attention, SHAP and other interpretability measures. The factors of interest are:
- length of events (30, 300, 900)
- spacing between 2+ coupled events, i.e. order of sequence matters
- amount of noise, i.e. performance vs interpretability
- vocabulary space

In [73]:
import os
import random
import string

import numpy as np
import pandas as pd
import yaml

from utils import *

In [74]:
# Load IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload

# Mode 2: reload modules automatically before each cell is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
# Notebook configuration: every tunable value used by the cells below.
# NOTE(review): no random seed is set in the visible cells, so the generated
# datasets presumably differ between runs -- confirm whether utils seeds the RNG.

# Path of the YAML file passed to load_tokens.
TOKEN_NAMES_FP = "./tokens.yaml"

# Length of each generated sequence; also part of the output directory path.
SEQ_LEN = 300

# Output CSV paths for the train / validation / test splits.
TRAIN_FP = "data/seq/{}/seq_aaa_1/train.csv".format(SEQ_LEN)
VAL_FP = "data/seq/{}/seq_aaa_1/val.csv".format(SEQ_LEN)
TEST_FP = "data/seq/{}/seq_aaa_1/test.csv".format(SEQ_LEN)

# Name of the unique-identifier column in the generated dataframes.
UID_COLNAME = "patient_id"

# Rows generated per sequence category in each split
# (the trailing numbers are presumably alternative, larger settings).
TRAIN_NROWS = 3000 # 18000
VAL_NROWS = 1000 # 6000
TEST_NROWS = 1000 # 6000

# Length argument passed to get_uid when creating identifiers.
UID_LEN = 10

In [76]:
# Load tokens from yaml file path
# Read the token groups from the YAML file and report how many tokens each group has.
tokens = load_tokens(TOKEN_NAMES_FP)
for key, group in tokens.items():
    print(f"{key}: {len(group)} tokens")

adverse_tokens: 4 tokens
adverse_helper_tokens: 6 tokens
adverse_unhelper_tokens: 5 tokens
noise_tokens: 15 tokens
adverse_sequence_tokens: 3 tokens


In [77]:
# Show the members of the 'adverse_tokens' group.
print(tokens['adverse_tokens'])

['AMI_A', 'PH_A', 'ARR_A', 'CHF_A']


In [78]:
# Show the members of the 'adverse_helper_tokens' group.
print(tokens['adverse_helper_tokens'])

['apnea_H', 'furosemide_H', 'pneumonia_H', 'high_creatinine_H', 'tachycardia_H', 'resistent_hyp_H']


In [79]:
# Show the members of the 'adverse_unhelper_tokens' group.
print(tokens['adverse_unhelper_tokens'])

['PCI_U', 'cardiac_rehab_U', 'normal_bmi_U', 'low_salt_diet_U', 'ACE_inhibitors_U']


In [80]:
# Show the ordered token sequence stored under 'adverse_sequence_tokens'.
print(tokens['adverse_sequence_tokens'])

['AMI_A', 'CHF_A', 'ARR_A']


### Sequence dataset

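The positive label is driven by a sequence of tokens:
- positive set:
    - the ordered sequence (AMI_A, CHF_A, ARR_A) --> 99%
    - other sequences containing 1, 2 or 3 adverse (A) tokens --> 55%, 70%, 75% respectively
    - 1 adverse (A) token + 2 helper (H) tokens --> 65%
- negative set:
    - noise (N) tokens only (#and Us)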



Total number of observations

In [81]:
# Rough class-balance estimate for the training split: how many extra negative
# rows are needed to offset the expected number of positive labels.
x = 3000  # rows per category (same value as TRAIN_NROWS)
tot = x * 5  # five non-noise categories are generated
# Expected positives, summing one assumed positive rate per category.
# NOTE(review): the rates used here (0.99, 0.6, 0.7, 0.75, 0.7) do not match the
# proba values passed in get_sequence_dataset (0.99, 0.75, 0.7, 0.55, 0.65) --
# confirm which set is intended.
pos_lab =  x * 0.99 + x * 0.6 + x * 0.7 + x * 0.75 + x * 0.7
neg_lab = tot - pos_lab 
print(pos_lab)
print(neg_lab) # number already negatively labelled
# The difference printed here (7440) is the value hard-coded as the training
# "n_noise_adverse" count in the next code cell.
print(pos_lab - neg_lab)

11220.0
3780.0
7440.0


In [82]:
# Same class-balance estimate for the validation / test splits.
x = 1000  # rows per category (same value as VAL_NROWS and TEST_NROWS)
tot = x * 5  # five non-noise categories are generated
# NOTE(review): the rates used here (0.99, 0.6, 0.7, 0.75, 0.7) do not match the
# proba values passed in get_sequence_dataset (0.99, 0.75, 0.7, 0.55, 0.65) --
# confirm which set is intended.
pos_lab =  x * 0.99 + x * 0.6 + x * 0.7 + x * 0.75 + x * 0.7
neg_lab = tot - pos_lab 
print(neg_lab)
# The difference printed here (2480) is the value hard-coded as the val / test
# "n_noise_adverse" count in the next code cell.
print(pos_lab -  neg_lab)

1260.0
2480.0


In [83]:
def _build_count_dict(n_rows, n_noise):
    """Return the per-category row counts for one data split.

    Every non-noise category gets ``n_rows`` rows; the noise category gets
    ``n_noise`` rows. Key order matches the order in which the categories
    are generated.
    """
    category_keys = (
        "n_aaa_seq_adverse",
        "n_aaa_adverse",
        "n_aa_adverse",
        "n_a_adverse",
        "n_ahh_adverse",
    )
    counts = dict.fromkeys(category_keys, n_rows)
    counts["n_noise_adverse"] = n_noise
    return counts


# Noise row counts are the balance figures printed by the two cells above.
train_count_dict = _build_count_dict(TRAIN_NROWS, 7440)

val_count_dict = _build_count_dict(VAL_NROWS, 2480)

test_count_dict = _build_count_dict(TEST_NROWS, 2480)

In [84]:
def get_idx_tok(seq_len, token_dict, token_key, n_pairs, min_idx=0):
    """Draw ``n_pairs`` random (index, token) tuples.

    Each index is drawn from ``range(min_idx, seq_len)`` and each token from
    ``token_dict[token_key]``; both draws are independent and with replacement.
    """
    pairs = []
    for _ in range(n_pairs):
        # Keep the draw order (index first, then token) so the RNG stream is unchanged.
        position = random.choices(range(min_idx, seq_len), k=1)[0]
        token = random.choices(token_dict[token_key], k=1)[0]
        pairs.append((position, token))
    return pairs


In [85]:
def get_idx_tok_ordered(seq_len, token_dict, token_key, min_idx=0):
    """Pair each token of an ordered sequence with a random, increasing index.

    The tokens in ``token_dict[token_key]`` keep their listed order; the first
    token receives the smallest index and the last token the largest.

    Args:
        seq_len: exclusive upper bound of the candidate indexes.
        token_dict: mapping of token-group name to list of tokens.
        token_key: key of the ordered token sequence in ``token_dict``.
        min_idx: inclusive lower bound of the candidate indexes.

    Returns:
        List of ``(index, token)`` tuples sorted by index.
    """
    seq = token_dict[token_key]
    positions = range(min_idx, seq_len)

    if len(positions) >= len(seq):
        # Draw distinct indexes: callers insert the tokens one at a time, and
        # two tokens sharing an index would end up in reversed order.
        indexes = sorted(random.sample(positions, k=len(seq)))
    else:
        # Too few slots for distinct indexes; fall back to drawing with
        # replacement (ties are then unavoidable).
        indexes = sorted(random.choices(positions, k=len(seq)))

    return list(zip(indexes, seq))

In [86]:
# Quick check: place the ordered adverse sequence at random indexes below 30.
get_idx_tok_ordered(30, tokens, 'adverse_sequence_tokens')

[(7, 'AMI_A'), (17, 'CHF_A'), (24, 'ARR_A')]

In [97]:
def get_a_sequence_seq(
    adverse, helper, unhelper, adverse_seq, seq_len, label, tokens, proba
):
    """Create one left-padded token sequence with its simulated label appended.

    A random unpadded length is drawn from ``range(n_special, seq_len)`` (but at
    least 10), where ``n_special = adverse + helper + unhelper + adverse_seq``.
    Noise tokens fill that length, the requested special tokens are inserted
    at random positions, and the result is left-padded with ``"<pad>"``.

    Args:
        adverse: number of random tokens taken from ``tokens["adverse_tokens"]``.
        helper: number of random tokens from ``tokens["adverse_helper_tokens"]``.
        unhelper: number of random tokens from ``tokens["adverse_unhelper_tokens"]``.
        adverse_seq: if truthy, the ordered ``tokens["adverse_sequence_tokens"]``
            are inserted; the value itself is only used in the length budget,
            so it should equal the length of that token list.
        seq_len: padded length of the returned token sequence.
        label: target label forwarded to ``get_label``.
        tokens: mapping of token-group name to list of tokens.
        proba: probability forwarded to ``get_label``.

    Returns:
        A flat list: the token sequence followed by the simulated label.

    Raises:
        IndexError: from ``random.choices`` when ``n_special >= seq_len``
            (the range to draw the length from is empty).
    """
    n_special = adverse + helper + unhelper + adverse_seq

    # Unpadded length: random within [n_special, seq_len), floored at 10.
    total_len = max(10, random.choices(range(n_special, seq_len), k=1)[0])
    n_noise = total_len - n_special

    sel_adverse, sel_helper, sel_unhelper, sel_ad_seq = [], [], [], []

    if adverse:
        sel_adverse = get_idx_tok(n_noise, tokens, "adverse_tokens", adverse)

    if helper:
        sel_helper = get_idx_tok(n_noise, tokens, "adverse_helper_tokens", helper)

    if unhelper:
        sel_unhelper = get_idx_tok(n_noise, tokens, "adverse_unhelper_tokens", unhelper)

    if adverse_seq:
        sel_ad_seq = get_idx_tok_ordered(n_noise, tokens, 'adverse_sequence_tokens')

    # presumably returns a list of n_noise noise tokens -- get_tokens is defined
    # outside this notebook (TODO confirm)
    sel_noise = get_tokens(seq_len, tokens, "noise_tokens", n_noise)

    # Unordered tokens: insertion order is irrelevant.
    for idx, event in sel_adverse + sel_helper + sel_unhelper:
        sel_noise.insert(idx, event)

    # Ordered tokens arrive sorted by index. Insert them last-to-first so that
    # an earlier token always lands in front of a later one, even when two of
    # them share the same index (ascending insertion would swap them).
    for idx, event in reversed(sel_ad_seq):
        sel_noise.insert(idx, event)

    sel_noise = ["<pad>"] * (seq_len - len(sel_noise)) + sel_noise

    # label depending on proba
    sim_lab = get_label(proba, target=label)

    return sel_noise + [sim_lab]

In [98]:
def get_sequences(
    adverse, helper, unhelper, seq_len, label, uid_len, uid_colname, n_seq, tokens,
    seq_type='event', adverse_seq=0, proba=1.0
):
    """Generate ``n_seq`` sequences of one category as a dataframe.

    Args:
        adverse, helper, unhelper: token counts forwarded to the sequence generator.
        seq_len: padded sequence length; also determines the token column names.
        label: target label forwarded to the sequence generator.
        uid_len: length argument passed to ``get_uid``.
        uid_colname: name of the identifier column.
        n_seq: number of sequences (rows) to generate; 0 yields an empty frame.
        tokens: mapping of token-group name to list of tokens.
        seq_type: ``'event'`` uses ``get_a_sequence``; ``'seq'`` uses
            ``get_a_sequence_seq`` (which also receives ``adverse_seq``).
        adverse_seq: forwarded to ``get_a_sequence_seq`` when ``seq_type == 'seq'``.
        proba: label probability forwarded to the sequence generator.

    Returns:
        Dataframe with token columns named ``str(seq_len - 1)`` down to ``"0"``,
        followed by ``"label"`` and ``uid_colname``.

    Raises:
        ValueError: if ``seq_type`` is neither ``'event'`` nor ``'seq'``.
    """
    # Fail early with a clear message instead of an unbound-variable error later.
    if seq_type not in ('event', 'seq'):
        raise ValueError(f"seq_type must be 'event' or 'seq', got {seq_type!r}")

    if seq_type == 'event':
        sequences = [
            get_a_sequence(
                adverse=adverse,
                helper=helper,
                unhelper=unhelper,
                seq_len=seq_len,
                label=label,
                tokens=tokens,
                proba=proba
            )
            + [get_uid(uid_len)]
            for _ in range(n_seq)
        ]
    else:
        sequences = [
            get_a_sequence_seq(
                adverse=adverse,
                helper=helper,
                unhelper=unhelper,
                adverse_seq=adverse_seq,
                seq_len=seq_len,
                label=label,
                tokens=tokens,
                proba=proba
            )
            + [get_uid(uid_len)]
            for _ in range(n_seq)
        ]
        print("seq based events generated")

    columns = [str(x) for x in range(seq_len - 1, -1, -1)] + [
        "label",
        uid_colname,
    ]

    # Passing the names to the constructor also gives a correctly labelled
    # (empty) frame when n_seq == 0.
    return pd.DataFrame(sequences, columns=columns)

In [99]:
def get_sequence_dataset(seq_len, uid_len, uid_colname, count_dict, tokens):
    """Generate a shuffled dataset made of several categories of sequences.

    Arg:
    -----
        seq_len (int) : length of the generated sequence
        uid_len (int) : length of uid token
        uid_colname (str) : name of uid column, usually patient_id
        count_dict (dict) : number of rows to generate per category; only
            the keys that are present are generated.
            Sequence-based categories:
                n_aaa_seq_adverse, n_aaa_adverse, n_aa_adverse,
                n_a_adverse, n_ahh_adverse, n_noise_adverse
            Event-based categories:
                n_ppp_adverse, n_pp_adverse, n_p_adverse,
                n_nnn_adverse, n_nn_adverse, n_n_adverse
        tokens (dict) : dictionary of the various token types

    Returns:
    --------
        dataset (dataframe) : all generated categories concatenated and
                              randomly shuffled. Besides the columns produced
                              by get_sequences it carries a 'seq_event'
                              category tag and an 'index' column holding each
                              row's position within its category.

    """
    seq_based = "seq"

    # One row per category, listed in generation order:
    # (count_dict key, seq_event tag, adverse, helper, unhelper, label, extra kwargs)
    category_specs = (
        # sequence-based categories
        ("n_aaa_seq_adverse", 'aaa_seq', 0, 0, 0, 1,
         {"seq_type": seq_based, "adverse_seq": 3, "proba": 0.99}),
        ("n_aaa_adverse", 'aaa', 3, 0, 0, 1,
         {"seq_type": seq_based, "adverse_seq": 0, "proba": 0.75}),
        ("n_aa_adverse", 'aa', 2, 0, 0, 1,
         {"seq_type": seq_based, "adverse_seq": 0, "proba": 0.7}),
        ("n_a_adverse", 'a', 1, 0, 0, 1,
         {"seq_type": seq_based, "adverse_seq": 0, "proba": 0.55}),
        ("n_ahh_adverse", 'ahh', 1, 2, 0, 1,
         {"seq_type": seq_based, "adverse_seq": 0, "proba": 0.65}),
        ("n_noise_adverse", 'noise', 0, 0, 0, 0,
         {"seq_type": seq_based, "adverse_seq": 0, "proba": 0.95}),
        # event-triggered categories (rely on get_sequences' defaults)
        ("n_ppp_adverse", 1, 1, 1, 0, 1, {}),
        ("n_pp_adverse", 0, 1, 0, 0, 1, {}),
        ("n_p_adverse", 0, 0, 3, 0, 1, {}),
        ("n_nnn_adverse", 0, 0, 0, 3, 0, {}),
        ("n_nn_adverse", 0, 0, 1, 2, 0, {}),
        ("n_n_adverse", 0, 0, 2, 1, 0, {}),
    )

    cat_lst = []
    for count_key, tag, n_adverse, n_helper, n_unhelper, target, extra in category_specs:
        if count_key not in count_dict:
            continue
        frame = get_sequences(
            adverse=n_adverse,
            helper=n_helper,
            unhelper=n_unhelper,
            seq_len=seq_len,
            label=target,
            uid_len=uid_len,
            uid_colname=uid_colname,
            n_seq=count_dict[count_key],
            tokens=tokens,
            **extra,
        )
        frame['seq_event'] = tag
        cat_lst.append(frame)

    # reset_index() keeps the per-category row number as an 'index' column.
    dataset = pd.concat(cat_lst, axis=0).reset_index()

    # Shuffle the rows by permuting their positions.
    row_order = list(range(dataset.shape[0]))
    random.shuffle(row_order)
    dataset = dataset.iloc[row_order, :]

    print(f"dataset: {dataset.shape}")
    print(f"ratio:\n{dataset.label.value_counts(normalize=True)}\n")

    return dataset

In [101]:
# Generate the train, validation and test splits (in that order) with identical
# settings; only the per-category row counts differ between the splits.
train_simple_data, val_simple_data, test_simple_data = (
    get_sequence_dataset(
        seq_len=SEQ_LEN,
        uid_len=UID_LEN,
        uid_colname=UID_COLNAME,
        count_dict=split_counts,
        tokens=tokens,
    )
    for split_counts in (train_count_dict, val_count_dict, test_count_dict)
)

In [91]:
# Check how many training rows were generated for each sequence category.
train_simple_data.seq_event.value_counts()

noise      7440
aaa_seq    3000
aa         3000
a          3000
ahh        3000
aaa        3000
Name: seq_event, dtype: int64

In [92]:
# Write each split to its configured CSV path (save_csv comes from utils).
save_csv(train_simple_data, TRAIN_FP)
save_csv(val_simple_data, VAL_FP)
save_csv(test_simple_data, TEST_FP)

In [93]:
# Read the saved training split back from disk as a sanity check.
df = pd.read_csv(TRAIN_FP)
print(df.shape)
df.head()

(22440, 304)


Unnamed: 0,index,299,298,297,296,295,294,293,292,291,...,6,5,4,3,2,1,0,label,patient_id,seq_event
0,413,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,peanut_allergy_N,cold_sore_N,annual_physical_N,myopia_N,backache_N,dental_exam_N,myopia_N,1,QYGMG349QI,aaa
1,924,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,dental_exam_N,ankle_sprain_N,peanut_allergy_N,foot_pain_N,cold_sore_N,cold_sore_N,ankle_sprain_N,1,GEX6E0F70W,aaa
2,2289,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ingrown_nail_N,annual_physical_N,ingrown_nail_N,headache_N,annual_physical_N,annual_physical_N,headache_N,1,XDWKV6SBL2,ahh
3,1464,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,hay_fever_N,ACL_tear_N,dental_exam_N,eye_exam_N,peanut_allergy_N,annual_physical_N,ingrown_nail_N,0,YOSLNTMHVO,a
4,1710,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ankle_sprain_N,ACL_tear_N,headache_N,eye_exam_N,annual_physical_N,myopia_N,cut_finger_N,1,8GAD7ENZWG,aaa


In [108]:
# Check the balance between positive and negative labels in the saved training split.
df.label.value_counts()

1    11257
0    11183
Name: label, dtype: int64

In [94]:
# Inspect one randomly chosen row of the reloaded training split.
import numpy as np  # NOTE(review): numpy is already imported in the first code cell
n = np.random.choice(df.shape[0])
df.iloc[n]

index                   140
299                   <pad>
298                   <pad>
297                   <pad>
296                   <pad>
                  ...      
1               hay_fever_N
0             dental_exam_N
label                     0
patient_id       ZZC2MUYUFQ
seq_event               ahh
Name: 20430, Length: 304, dtype: object

In [95]:
# Keep only the rows with a negative (0) label for inspection.
ee = df[df.label == 0]

In [96]:
# Look at one negatively labelled row (position 43 within the filtered frame).
ee.iloc[43]

index               6559
299                <pad>
298                <pad>
297                <pad>
296                <pad>
                 ...    
1               myopia_N
0             backache_N
label                  0
patient_id    MN66YREIS2
seq_event          noise
Name: 84, Length: 304, dtype: object