# Config

In [3]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v2-xxlarge"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=100
    epochs=4
    encoder_lr=8e-6
    decoder_lr=8e-6
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.98)
    batch_size=2
    fc_dropout=0.2
    max_len=280
    weight_decay=0.1
    gradient_accumulation_steps=4
    max_grad_norm=1000
    # MLM setting
    mlm_probability=0.10 # 0.15
    max_seq_length=None
    seed=0
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [4]:
# ====================================================
# Define path
# ====================================================
import os

INPUT_DIR = '../data/'
OUTPUT_DIR = '../output/pretrained/{}/'.format(CFG.model.replace('/', '-'))
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Library

In [6]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForMaskedLM
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers.modeling_outputs import MaskedLMOutput
from transformers import DataCollatorForLanguageModeling
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.11.0
transformers.__version__: 4.16.2


2022-04-14 05:56:50.272931: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


env: TOKENIZERS_PARALLELISM=true


# Utils

In [7]:
# ====================================================
# Utils
# ===================================================
def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

# Trainer

In [8]:
######################################################

# Trainer
def trainer(model, data_loader, optimizer, scheduler, CFG):
    model.train()

    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = []

    optimizer.zero_grad()

    for idx, batch in enumerate(data_loader):
        for k, v in batch.items():
            batch[k] = v.to(device, dtype=torch.long)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            outputs = model(**batch)

        loss = outputs.loss

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (idx + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            CFG.global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        
        losses.append(loss.detach().cpu().item())

        # if (CFG.global_step % CFG.save_step) == 0:
        #     LOGGER.info(
        #         "Epoch {} Step {}: Train Loss {:.4f}, elapsed {:.4f}s".format(
        #             CFG.epoch + 1, CFG.global_step, np.mean(losses), time.time() - start)
        #         )
        #     torch.save(
        #         model.state_dict(),
        #         OUTPUT_DIR + '{}-mlm-step-{}.bin'.format(
        #             CFG.model.replace('/', '-'),
        #             CFG.global_step))
        
        if idx % 500 == 0 or idx == (len(data_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss:.4f} '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, idx, len(data_loader), 
                          remain=timeSince(start, float(idx+1)/len(data_loader)),
                          loss=np.mean(losses),
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    
    return np.mean(losses)

# Model

In [9]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
         'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

In [10]:
def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler=='linear':
        scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler=='cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
    return scheduler

In [11]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.model,
                output_hidden_states=False
                )
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
            self.lm_head = AutoModelForMaskedLM.from_pretrained(cfg.model, config=self.config).cls # [cls, lm_head]
        else:
            self.model = AutoModel(self.config)
            self.lm_head = AutoModelForMaskedLM(self.config).cls # [cls, lm_head]
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
            self, 
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            #position_ids=None,
            inputs_embeds=None,
            labels=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            #position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,)
        
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(loss=masked_lm_loss,
                              logits=prediction_scores,
                              hidden_states=outputs.hidden_states,
                              attentions=outputs.attentions)

# Main

In [12]:
# ====================================================
# data loading
# ====================================================
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
features = pd.read_csv(os.path.join(INPUT_DIR, 'features.csv'))
patient_notes = pd.read_csv(os.path.join(INPUT_DIR, 'patient_notes.csv'))
display(patient_notes.head())
display(patient_notes['pn_history'].nunique(), len(patient_notes))

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


42146

42146

In [13]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
mlm_data = patient_notes[['pn_history']].reset_index(drop=True)
mlm_data = mlm_data.rename(columns={'pn_history': 'text'})
csv_name = f'mlm_data.csv'
mlm_data.to_csv(OUTPUT_DIR+csv_name, index=False)
print(f"Saved mlm data: {csv_name}")
print(f"mlm data: {mlm_data.shape}")

Saved mlm data: mlm_data.csv
mlm data: (42146, 1)


In [15]:
#####################################################

# Training support

def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True)

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // CFG.max_len) * CFG.max_len
    result = {
        k: [t[i : i + CFG.max_len] for i in range(0, total_length, CFG.max_len)]
        for k, t in concatenated_examples.items()
    }
    return result

In [16]:
seed_everything(CFG.seed)

CFG.train_file = f"mlm_data.csv"
data_files = {'train': OUTPUT_DIR+CFG.train_file}
raw_datasets = load_dataset('csv', data_files=data_files)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6cbb5b2c44c7a6a0/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6cbb5b2c44c7a6a0/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
if CFG.max_seq_length is None:
    max_seq_length = tokenizer.model_max_length
else:
    if CFG.max_seq_length > tokenizer.model_max_length:
        max_seq_length = min(CFG.max_seq_length, tokenizer.model_max_length)

In [18]:
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
    load_from_cache_file=not True,
    )
LOGGER.info(f"tokenized_datasets: {tokenized_datasets}")

tokenized_datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 42146
    })
})


In [19]:
tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=4,
    load_from_cache_file=not True,
    )
train_dataset = tokenized_datasets["train"]
LOGGER.info(f"train_dataset: {train_dataset}")

train_dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 28101
})


In [20]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=CFG.mlm_probability
    )
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=CFG.batch_size
    )

In [22]:
model = CustomModel(CFG, config_path=None, pretrained=True)
model.to(device)
optimizer_parameters = get_optimizer_params(
    model,
    encoder_lr=CFG.encoder_lr, 
    decoder_lr=CFG.decoder_lr,
    weight_decay=CFG.weight_decay)
optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

Some weights of the model checkpoint at microsoft/deberta-v2-xxlarge were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-v2-xxlarge were not used when initializing DebertaV2ForMaskedLM: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_h

In [23]:
num_train_steps = int(len(mlm_data) / CFG.batch_size * CFG.epochs / CFG.gradient_accumulation_steps)
scheduler = get_scheduler(CFG, optimizer, num_train_steps)

In [25]:
CFG.global_step = 0
CFG.save_step = 25000

for epoch in range(CFG.epochs):
    CFG.epoch = epoch
    start = time.time()
        
    train_loss = trainer(model, train_loader, optimizer, scheduler, CFG)

    LOGGER.info(
        "Epoch {}: Train Loss {:.4f}, elapsed {:.4f}s".format(
            epoch + 1, train_loss, time.time() - start)
        )
    torch.save(
        model.state_dict(),
        OUTPUT_DIR + '{}-mlm-epoch-{}.bin'.format(
            CFG.model.replace('/', '-'),
            CFG.epoch + 1))

Epoch: [1][0/14051] Elapsed 0m 0s (remain 187m 32s) Loss: 3.2435 Grad: inf  LR: 0.00000000  
Epoch: [1][500/14051] Elapsed 3m 44s (remain 101m 4s) Loss: 2.3588 Grad: 25869.3711  LR: 0.00000800  
Epoch: [1][1000/14051] Elapsed 7m 28s (remain 97m 27s) Loss: 1.7467 Grad: 1567.6235  LR: 0.00000800  
Epoch: [1][1500/14051] Elapsed 11m 12s (remain 93m 42s) Loss: 1.4370 Grad: 1210.4193  LR: 0.00000800  
Epoch: [1][2000/14051] Elapsed 14m 56s (remain 90m 0s) Loss: 1.2511 Grad: 2278.7188  LR: 0.00000799  
Epoch: [1][2500/14051] Elapsed 18m 41s (remain 86m 20s) Loss: 1.1237 Grad: 1816.4919  LR: 0.00000799  
Epoch: [1][3000/14051] Elapsed 22m 26s (remain 82m 39s) Loss: 1.0293 Grad: 1304.2701  LR: 0.00000798  
Epoch: [1][3500/14051] Elapsed 26m 8s (remain 78m 47s) Loss: 0.9561 Grad: 1057.3477  LR: 0.00000797  
Epoch: [1][4000/14051] Elapsed 29m 45s (remain 74m 45s) Loss: 0.8992 Grad: 1431.6154  LR: 0.00000796  
Epoch: [1][4500/14051] Elapsed 33m 21s (remain 70m 47s) Loss: 0.8513 Grad: 1158.0610  L

Epoch 1: Train Loss 0.5347, elapsed 6098.7480s


Epoch: [1][14050/14051] Elapsed 101m 38s (remain 0m 0s) Loss: 0.5347 Grad: 2592.5010  LR: 0.00000749  
Epoch: [2][0/14051] Elapsed 0m 0s (remain 148m 21s) Loss: 0.3482 Grad: 277684.4062  LR: 0.00000749  
Epoch: [2][500/14051] Elapsed 3m 34s (remain 96m 42s) Loss: 0.3410 Grad: 89565.2422  LR: 0.00000745  
Epoch: [2][1000/14051] Elapsed 7m 8s (remain 93m 7s) Loss: 0.3337 Grad: 102147.0391  LR: 0.00000741  
Epoch: [2][1500/14051] Elapsed 10m 43s (remain 89m 36s) Loss: 0.3312 Grad: 69110.0312  LR: 0.00000737  
Epoch: [2][2000/14051] Elapsed 14m 20s (remain 86m 21s) Loss: 0.3320 Grad: 33042.1289  LR: 0.00000733  
Epoch: [2][2500/14051] Elapsed 17m 54s (remain 82m 43s) Loss: 0.3332 Grad: 41190.3945  LR: 0.00000729  
Epoch: [2][3000/14051] Elapsed 21m 29s (remain 79m 9s) Loss: 0.3346 Grad: 28906.2129  LR: 0.00000725  
Epoch: [2][3500/14051] Elapsed 25m 4s (remain 75m 34s) Loss: 0.3337 Grad: 36016.3281  LR: 0.00000720  
Epoch: [2][4000/14051] Elapsed 28m 38s (remain 71m 57s) Loss: 0.3317 Grad:

Epoch 2: Train Loss 0.3250, elapsed 6037.9381s


Epoch: [2][14050/14051] Elapsed 100m 37s (remain 0m 0s) Loss: 0.3250 Grad: 12226.5137  LR: 0.00000603  
Epoch: [3][0/14051] Elapsed 0m 0s (remain 96m 47s) Loss: 0.2941 Grad: 211316.9062  LR: 0.00000603  
Epoch: [3][500/14051] Elapsed 3m 35s (remain 97m 20s) Loss: 0.3083 Grad: 133609.6406  LR: 0.00000597  
Epoch: [3][1000/14051] Elapsed 7m 10s (remain 93m 34s) Loss: 0.3040 Grad: 59099.5117  LR: 0.00000590  
Epoch: [3][1500/14051] Elapsed 10m 44s (remain 89m 51s) Loss: 0.3052 Grad: 59725.1211  LR: 0.00000584  
Epoch: [3][2000/14051] Elapsed 14m 20s (remain 86m 19s) Loss: 0.3053 Grad: 50988.1875  LR: 0.00000577  
Epoch: [3][2500/14051] Elapsed 17m 54s (remain 82m 41s) Loss: 0.3062 Grad: 47101.5859  LR: 0.00000570  
Epoch: [3][3000/14051] Elapsed 21m 29s (remain 79m 7s) Loss: 0.3060 Grad: 25919.3398  LR: 0.00000564  
Epoch: [3][3500/14051] Elapsed 25m 3s (remain 75m 31s) Loss: 0.3056 Grad: 30456.3770  LR: 0.00000557  
Epoch: [3][4000/14051] Elapsed 28m 38s (remain 71m 56s) Loss: 0.3047 Gra

Epoch 3: Train Loss 0.2966, elapsed 6034.1259s


Epoch: [3][14050/14051] Elapsed 100m 34s (remain 0m 0s) Loss: 0.2966 Grad: 4017.9485  LR: 0.00000403  
Epoch: [4][0/14051] Elapsed 0m 0s (remain 95m 1s) Loss: 0.3028 Grad: 257145.0625  LR: 0.00000403  
Epoch: [4][500/14051] Elapsed 3m 35s (remain 97m 7s) Loss: 0.2855 Grad: 195667.5781  LR: 0.00000396  
Epoch: [4][1000/14051] Elapsed 7m 10s (remain 93m 26s) Loss: 0.2863 Grad: 81998.7578  LR: 0.00000388  
Epoch: [4][1500/14051] Elapsed 10m 44s (remain 89m 51s) Loss: 0.2861 Grad: 49286.8906  LR: 0.00000381  
Epoch: [4][2000/14051] Elapsed 14m 18s (remain 86m 11s) Loss: 0.2858 Grad: 28315.8242  LR: 0.00000373  
Epoch: [4][2500/14051] Elapsed 17m 54s (remain 82m 40s) Loss: 0.2838 Grad: 29042.4668  LR: 0.00000366  
Epoch: [4][3000/14051] Elapsed 21m 29s (remain 79m 7s) Loss: 0.2837 Grad: 23706.9473  LR: 0.00000358  
Epoch: [4][3500/14051] Elapsed 25m 3s (remain 75m 30s) Loss: 0.2829 Grad: 15108.6025  LR: 0.00000351  
Epoch: [4][4000/14051] Elapsed 28m 38s (remain 71m 57s) Loss: 0.2817 Grad: 

Epoch 4: Train Loss 0.2751, elapsed 6049.9473s


Epoch: [4][14050/14051] Elapsed 100m 49s (remain 0m 0s) Loss: 0.2751 Grad: 2848.4832  LR: 0.00000202  
