# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

INPUT_DIR = '../input/nbme-score-clinical-patient-notes/'
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    env='kaggle' if os.getcwd() == '/kaggle/working' else 'colab'
    wandb=False # True
    competition='NBME'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-large"
    mlm_dir='./drive/MyDrive/Colab Notebooks/NBME/mlm/kaggle-exp1/' # None
    finetune_dir='./drive/MyDrive/Colab Notebooks/NBME/train/exp135/' # None
    pseudo_dir='./drive/MyDrive/Colab Notebooks/NBME/pseudo/exp135/' # None
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4 # 5
    encoder_lr=2e-5
    decoder_lr=2e-5
    group_step=8
    lr_scale=1e-6
    reinit_layers=0 # https://openreview.net/pdf?id=cO1IH43yUF
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=6 # 8
    test_batch_size=32
    fc_dropout=0.6
    loss='bce' # ['bce', 'ce']
    target_size=1 if loss == 'bce' else 2
    max_len=512
    alpha=1
    gamma=2
    smoothing=0.0001
    p_aug=0.5
    p_aug_epoch=2
    pos_length_weight_decay=0.0
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    # ====================================================
    # MLM
    # ====================================================
    mlm=False # True
    extension='csv'
    pad_to_max_length=True
    max_seq_length=512
    overwrite_cache=True
    mlm_probability=0.15
    # ====================================================
    # pseudo labeling
    # ====================================================
    pseudo=False # True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [3]:
# ====================================================
# colab
# ====================================================
if CFG.env == 'colab':

    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Not connected to a GPU')
    else:
        print(gpu_info)

    from requests import get
    exp = get('http://172.28.0.2:9000/api/sessions').json()[0]['name'].split('.ipynb')[0]
    INPUT_DIR = './drive/MyDrive/Colab Notebooks/NBME/input/nbme-score-clinical-patient-notes/'
    if CFG.mlm:
        OUTPUT_DIR = f'./drive/MyDrive/Colab Notebooks/NBME/mlm/{exp}/'
    elif CFG.pseudo:
        OUTPUT_DIR = f'./drive/MyDrive/Colab Notebooks/NBME/pseudo/{exp}/'
    else:
        OUTPUT_DIR = f'./drive/MyDrive/Colab Notebooks/NBME/train/{exp}/'

    from google.colab import drive
    drive.mount('/content/drive')

Sat Apr 23 04:38:53 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='NBME', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

# Library

In [5]:
def prepare_deverta_v2_v3_tokenizer_fast():
    if CFG.env == 'kaggle':
        transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
        input_dir = Path("../input/deberta-v2-v3-tokenizer-fast")
    elif CFG.env == 'colab':
        transformers_path = Path("/usr/local/lib/python3.7/dist-packages/transformers")
        input_dir = Path("./drive/MyDrive/Colab Notebooks/NBME/input/deberta-v2-v3-tokenizer-fast")
    convert_file = input_dir / "convert_slow_tokenizer.py"
    conversion_path = transformers_path / convert_file.name
    if conversion_path.exists():
        conversion_path.unlink()
    shutil.copy(convert_file, transformers_path)
    deberta_v2_path = transformers_path / "models" / "deberta_v2"
    for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py', "deberta__init__.py"]:
        if str(filename).startswith("deberta"):
            filepath = deberta_v2_path / str(filename).replace("deberta", "")
        else:
            filepath = deberta_v2_path / filename
        if filepath.exists():
            filepath.unlink()
        shutil.copy(input_dir / filename, filepath)

In [6]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')

if CFG.env == 'colab':
    os.system('pip install transformers==4.16.2')
    os.system('pip install tokenizers==0.11.0')
    os.system('pip install sentencepiece==0.1.96')
elif CFG.env == 'kaggle':
    os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
    os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels tokenizers')
os.system('pip install datasets==1.18.3')
prepare_deverta_v2_v3_tokenizer_fast()

import datasets
import tokenizers
import transformers
print(f"datasets.__version__: {datasets.__version__}")
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from datasets import load_dataset
from transformers import AutoModelForMaskedLM
from transformers.modeling_outputs import MaskedLMOutput
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.models.deberta_v2 import DebertaV2TokenizerFast
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.__version__: 1.10.0+cu111
datasets.__version__: 1.18.3
tokenizers.__version__: 0.11.0
transformers.__version__: 4.16.2
env: TOKENIZERS_PARALLELISM=true


# Helper functions for scoring

In [7]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [8]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions


def compute_best_th(truths, char_probs):
    best_th = 0.5
    best_score = 0.
    for th in [0.5]:
        th = np.round(th, 2)
        results = get_results(char_probs, th=th)
        preds = get_predictions(results)
        score = get_score(truths, preds)
        if best_score < score:
            best_th = th
            best_score = score
    return best_th, best_score

# Utils

In [9]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# seed_everything(seed=42)
seed_everything(seed=0)

# Data Loading

In [10]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv(INPUT_DIR+'train.csv')
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv(INPUT_DIR+'features.csv')
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
features = preprocess_features(features)
patient_notes = pd.read_csv(INPUT_DIR+'patient_notes.csv')

# NEW
features['feature_text'] = features['feature_text'].str.lower()
patient_notes['pn_history'] = patient_notes['pn_history'].str.lower()

print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,family-history-of-mi-or-family-history-of-myoc...
1,1,0,family-history-of-thyroid-disorder
2,2,0,chest-pressure
3,3,0,intermittent-symptoms
4,4,0,lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,dillon cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [11]:
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],family-history-of-mi-or-family-history-of-myoc...,hpi: 17yo m presents with palpitations. patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],family-history-of-thyroid-disorder,hpi: 17yo m presents with palpitations. patien...
2,00016_002,0,16,2,[chest pressure],[203 217],chest-pressure,hpi: 17yo m presents with palpitations. patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",intermittent-symptoms,hpi: 17yo m presents with palpitations. patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],lightheaded,hpi: 17yo m presents with palpitations. patien...


In [12]:
# incorrect annotation
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [13]:
train['annotation_length'] = train['annotation'].apply(len)
display(train['annotation_length'].value_counts())

1    8185
0    4399
2    1292
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

# CV split

In [14]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    3575
1    3575
2    3575
3    3575
dtype: int64

In [15]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

In [16]:
# ====================================================
# CV split for mlm
# ====================================================
if CFG.mlm:
        
    pn_tmp = patient_notes[~patient_notes['pn_num'].isin(train['pn_num'].unique())].reset_index(drop=True)
    Fold = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
    for n, (train_index, val_index) in enumerate(Fold.split(pn_tmp, pn_tmp['pn_history'])):
        pn_tmp.loc[val_index, 'fold'] = int(n)
    pn_tmp['fold'] = pn_tmp['fold'].astype(int)

    fold_dict = dict(train.groupby('pn_num').agg({'fold': 'mean'}).reset_index()[['pn_num', 'fold']].values)
    fold_dict.update(dict(pn_tmp[['pn_num', 'fold']].values))

    patient_notes['fold'] = patient_notes['pn_num'].map(fold_dict)
    patient_notes['fold'] = patient_notes['fold'].fillna(-1)
    patient_notes['fold'] = patient_notes['fold'].astype(int)
    display(patient_notes['fold'].value_counts())
    
    if CFG.debug:
        display(patient_notes.groupby('fold').size())
        patient_notes = patient_notes.sample(n=1000, random_state=0).reset_index(drop=True)
        display(patient_notes.groupby('fold').size())
        
    for fold in range(CFG.n_fold):
        trn_mlm_data = patient_notes[patient_notes['fold'] != fold][['pn_history']].reset_index(drop=True)
        val_mlm_data = patient_notes[patient_notes['fold'] == fold][['pn_history']].reset_index(drop=True)
        trn_mlm_data = trn_mlm_data.rename(columns={'pn_history': 'text'})
        val_mlm_data = val_mlm_data.rename(columns={'pn_history': 'text'})
        trn_csv_name = f'trn_mlm_data_fold{fold}.csv'
        val_csv_name = f'val_mlm_data_fold{fold}.csv'
        trn_mlm_data.to_csv(OUTPUT_DIR+trn_csv_name, index=False)
        val_mlm_data.to_csv(OUTPUT_DIR+val_csv_name, index=False)
        print(f"Saved train: {trn_csv_name}  valid: {val_csv_name}")

# tokenizer

In [17]:
# ====================================================
# tokenizer
# ====================================================
if CFG.model.find("deberta-v2") >= 0 or CFG.model.find("deberta-v3") >= 0:
    tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.model, add_prefix_space=False, trim_offsets=False)
else:
    tokenizer = AutoTokenizer.from_pretrained(CFG.model, add_prefix_space=False, trim_offsets=False)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [18]:
# ====================================================
# Define max_len
# ====================================================
for text_col in ['pn_history']:
    pn_history_lengths = []
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        pn_history_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

for text_col in ['feature_text']:
    features_lengths = []
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
CFG.max_seq_length = max(pn_history_lengths) + 3 # cls & sep & sep # mlm
LOGGER.info(f"max_len: {CFG.max_len}")
LOGGER.info(f"max_seq_length: {CFG.max_seq_length}")

  0%|          | 0/42146 [00:00<?, ?it/s]

pn_history max(lengths): 284


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 28
max_len: 315
max_seq_length: 287


In [19]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=cfg.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    weight = np.ones(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
                    if end - start > 1:
                        weight[start_idx:end_idx] = weight[start_idx:end_idx] + (end - start) * CFG.pos_length_weight_decay
    return label, weight


def create_pseudo_label(cfg, text, pseudo_label):
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=cfg.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[:311] = pseudo_label
    weight = np.ones(len(offset_mapping))
    label[ignore_idxes] = -1
    return label, weight


class TrainDataset(Dataset):
    def __init__(self, cfg, df, p_aug=0.):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.pseudo_flags = df['pseudo_flag'].values
        self.pseudo_labels = df[[i for i in range(311)]].values
        self.locations = df['location'].values
        self.p_aug = p_aug

    def __len__(self):
        return len(self.feature_texts)

    def augment_feature_text(self, feature_text):
        if feature_text.find('-or-') >= 0:
            augmented_feature_text = '-or-'.join(np.random.permutation(feature_text.split('-or-')))
        elif feature_text.find('-OR-') >= 0:
            augmented_feature_text = '-OR-'.join(np.random.permutation(feature_text.split('-OR-')))
        else:
            augmented_feature_text = feature_text
        return augmented_feature_text

    def __getitem__(self, item):
        if float(torch.rand(1)) < self.p_aug:
            feature_text = self.augment_feature_text(self.feature_texts[item])
        else:
            feature_text = self.feature_texts[item]
        inputs = prepare_input(self.cfg, 
                               self.pn_historys[item], 
                               feature_text)
        if self.pseudo_flags[item]:
            label, weight = create_pseudo_label(self.cfg, 
                                                self.pn_historys[item], 
                                                self.pseudo_labels[item])
        else:
            label, weight = create_label(self.cfg,
                                         self.pn_historys[item], 
                                         self.annotation_lengths[item], 
                                         self.locations[item])
        if self.cfg.loss == 'bce':
            return inputs, torch.tensor(label, dtype=torch.float), torch.tensor(weight, dtype=torch.float)
        elif self.cfg.loss == 'ce':
            return inputs, torch.tensor(label, dtype=torch.long), torch.tensor(weight, dtype=torch.float)


class ValidDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.pn_historys[item], 
                               self.feature_texts[item])
        label, weight = create_label(self.cfg,
                                     self.pn_historys[item], 
                                     self.annotation_lengths[item], 
                                     self.locations[item])
        if self.cfg.loss == 'bce':
            return inputs, torch.tensor(label, dtype=torch.float), torch.tensor(weight, dtype=torch.float)
        elif self.cfg.loss == 'ce':
            return inputs, torch.tensor(label, dtype=torch.long), torch.tensor(weight, dtype=torch.float)

In [20]:
def prepare_input_for_test(cfg, text, feature_text):
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           # max_length=CFG.max_len,
                           # padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        inputs = prepare_input_for_test(self.cfg, 
                                        self.pn_historys[item], 
                                        self.feature_texts[item])
        return inputs

In [21]:
"""
train['pseudo_flag'] = 0
train[[i for i in range(311)]] = 0
train_dataset = TrainDataset(CFG, train)
inputs, label, weight = train_dataset[0]
print(inputs)
print(label)
print(weight)
"""

"\ntrain['pseudo_flag'] = 0\ntrain[[i for i in range(311)]] = 0\ntrain_dataset = TrainDataset(CFG, train)\ninputs, label, weight = train_dataset[0]\nprint(inputs)\nprint(label)\nprint(weight)\n"

# Pseudo Sampling

In [22]:
def get_sampled_pseudo_df(fold, sample_score=0.5, positive_th=0.5, max_samples=1000):
    # calcurate oof scores
    oof_df = pd.read_pickle(CFG.finetune_dir+'oof_df.pkl')
    scores = {}
    for feature_text, oof_tmp in tqdm(oof_df.groupby('feature_text')):
        oof_tmp = oof_tmp.reset_index(drop=True)
        truths = create_labels_for_scoring(oof_tmp)
        char_probs = get_char_probs(oof_tmp['pn_history'].values, oof_tmp[[i for i in range(CFG.max_len)]].values, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        try:
            score = get_score(preds, truths)
        except:
            score = np.nan
        scores[feature_text] = {'score': score, 'num_labels': len(oof_tmp[oof_tmp['annotation_length'] > 0])}
    scores = sorted(scores.items(), key=lambda i: i[1]['score'])
    # sample pseudo labels
    pseudo_df = pd.read_pickle(CFG.pseudo_dir+f'pseudo_df_fold{fold}.pkl')
    pseudo_pn_num_list = []
    for text, d in tqdm([(text, d) for text, d in scores if d['score'] < sample_score]):
        pseudo_df_tmp = pseudo_df[pseudo_df['feature_text'] == text]
        pseudo_df_tmp['positive_count'] = (pseudo_df_tmp[[i for i in range(311)]] > positive_th).sum(1)
        pseudo_pn_num = pseudo_df_tmp[pseudo_df_tmp['positive_count'] > 0]['pn_num'].values.tolist()
        if len(pseudo_pn_num) > 0:
            print(f"{text}({d}) has {len(pseudo_pn_num)} pseudo positive labels")
            if len(pseudo_pn_num) > max_samples:
                pseudo_pn_num = random.sample(pseudo_pn_num, max_samples)
                print(f"sampled by max_samples -> {text}({d}) has {len(pseudo_pn_num)} pseudo positive labels")
                pseudo_pn_num_list.append(pseudo_pn_num)
            else:
                pseudo_pn_num_list.append(pseudo_pn_num)
    pseudo_pn_num_list = sum(pseudo_pn_num_list, [])
    sampled_pseudo_df = pseudo_df[pseudo_df['pn_num'].isin(pseudo_pn_num_list)].reset_index(drop=True)
    print(f"sampled_pseudo_df.shape: {sampled_pseudo_df.shape}")
    return sampled_pseudo_df

In [23]:
"""
pseudo_df = get_sampled_pseudo_df(fold, sample_score=0.5)
train['pseudo_flag'] = 0
pseudo_df['pseudo_flag'] = 1
train_dataset = TrainDataset(CFG, pd.concat([train.head(5), pseudo_df]).reset_index(drop=True))
inputs, label, weight = train_dataset[6]
print(inputs)
print(label)
print(weight)
"""

"\npseudo_df = get_sampled_pseudo_df(fold, sample_score=0.5)\ntrain['pseudo_flag'] = 0\npseudo_df['pseudo_flag'] = 1\ntrain_dataset = TrainDataset(CFG, pd.concat([train.head(5), pseudo_df]).reset_index(drop=True))\ninputs, label, weight = train_dataset[6]\nprint(inputs)\nprint(label)\nprint(weight)\n"

# Model

In [24]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if cfg.reinit_layers != 0:
            reinit_encoder_layer = AutoModel.from_config(self.config).encoder.layer
            for i in np.arange(self.config.num_hidden_layers)[cfg.reinit_layers:]:
                LOGGER.info(f"reinit encoder layer #{i}")
                self.model.encoder.layer[i] = reinit_encoder_layer[i]
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [25]:
class CustomModelForMaskedLM(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if cfg.reinit_layers != 0:
            reinit_encoder_layer = AutoModel.from_config(self.config).encoder.layer
            for i in np.arange(self.config.num_hidden_layers)[cfg.reinit_layers:]:
                LOGGER.info(f"reinit encoder layer #{i}")
                self.model.encoder.layer[i] = reinit_encoder_layer[i]
        self.cls = AutoModelForMaskedLM.from_pretrained(cfg.model, config=self.config).cls

    def forward(self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)
        
        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
        
        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# Loss

In [26]:
class FocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2):
        super().__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * bce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss


class SmoothFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2, smoothing=0.0):
        super().__init__()
        self.reduction = reduction
        self.focal_loss = FocalLoss(reduction='none', alpha=alpha, gamma=gamma)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothFocalLoss._smooth(targets, self.smoothing)
        loss = self.focal_loss(inputs, targets)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

    
class CEFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2):
        super(CEFocalLoss, self).__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * ce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

    
class SmoothCEFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2, smoothing=0.0):
        super(SmoothCEFocalLoss, self).__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma
        self.smoothing = smoothing

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', label_smoothing=self.smoothing) # torch >= 1.10.0
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * ce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [27]:
"""
train_dataset = TrainDataset(CFG, train)
train_loader = DataLoader(train_dataset,
                          batch_size=2,
                          shuffle=True,
                          num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
model = CustomModel(CFG, config_path=None, pretrained=False)
model.to(device)
if CFG.loss == 'bce':
    criterion = SmoothFocalLoss(reduction='none', alpha=CFG.alpha, gamma=CFG.gamma, smoothing=CFG.smoothing)
elif CFG.loss == 'ce':
    criterion = CEFocalLoss(reduction='none', alpha=CFG.alpha, gamma=CFG.gamma)
for step, (inputs, labels, weights) in enumerate(train_loader):
    for k, v in inputs.items():
        inputs[k] = v.to(device)
    labels = labels.to(device)
    weights = weights.to(device)
    batch_size = labels.size(0)
    y_preds = model(inputs)
    if CFG.loss == 'bce':
        loss_mask = labels.view(-1, 1) != -1
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = (loss.view(batch_size, -1) * weights.view(batch_size, -1)).view(-1, 1)[loss_mask]
        loss = loss.mean()
    elif CFG.loss == 'ce':
        loss_mask = labels.view(-1) != -1
        loss = criterion(y_preds.view(-1, CFG.target_size), labels.view(-1))
        loss = (loss.view(batch_size, -1) * weights.view(batch_size, -1)).view(-1)[loss_mask]
        loss = loss.mean()
    break
"""

"\ntrain_dataset = TrainDataset(CFG, train)\ntrain_loader = DataLoader(train_dataset,\n                          batch_size=2,\n                          shuffle=True,\n                          num_workers=CFG.num_workers, pin_memory=True, drop_last=True)\nmodel = CustomModel(CFG, config_path=None, pretrained=False)\nmodel.to(device)\nif CFG.loss == 'bce':\n    criterion = SmoothFocalLoss(reduction='none', alpha=CFG.alpha, gamma=CFG.gamma, smoothing=CFG.smoothing)\nelif CFG.loss == 'ce':\n    criterion = CEFocalLoss(reduction='none', alpha=CFG.alpha, gamma=CFG.gamma)\nfor step, (inputs, labels, weights) in enumerate(train_loader):\n    for k, v in inputs.items():\n        inputs[k] = v.to(device)\n    labels = labels.to(device)\n    weights = weights.to(device)\n    batch_size = labels.size(0)\n    y_preds = model(inputs)\n    if CFG.loss == 'bce':\n        loss_mask = labels.view(-1, 1) != -1\n        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))\n        loss = (loss.vie

# Helpler functions

In [28]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels, weights) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        weights = weights.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        if CFG.loss == 'bce':
            loss_mask = labels.view(-1, 1) != -1
            loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
            loss = (loss.view(batch_size, -1) * weights.view(batch_size, -1)).view(-1, 1)[loss_mask]
            loss = loss.mean()
        elif CFG.loss == 'ce':
            loss_mask = labels.view(-1) != -1
            loss = criterion(y_preds.view(-1, CFG.target_size), labels.view(-1))
            loss = (loss.view(batch_size, -1) * weights.view(batch_size, -1)).view(-1)[loss_mask]
            loss = loss.mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels, weights) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        weights = weights.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        if CFG.loss == 'bce':
            loss_mask = labels.view(-1, 1) != -1
            loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
            loss = (loss.view(batch_size, -1) * weights.view(batch_size, -1)).view(-1, 1)[loss_mask]
            loss = loss.mean()
        elif CFG.loss == 'ce':
            loss_mask = labels.view(-1) != -1
            loss = criterion(y_preds.view(-1, CFG.target_size), labels.view(-1))
            loss = (loss.view(batch_size, -1) * weights.view(batch_size, -1)).view(-1)[loss_mask]
            loss = loss.mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        if CFG.loss == 'bce':
            preds.append(y_preds.sigmoid().to('cpu').numpy())
        elif CFG.loss == 'ce':
            preds.append(y_preds.softmax(2).to('cpu').numpy()[:,:,1])
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        if CFG.loss == 'bce':
            preds.append(y_preds.sigmoid().to('cpu').numpy())
        elif CFG.loss == 'ce':
            preds.append(y_preds.softmax(2).to('cpu').numpy()[:,:,1])
    predictions = preds.copy()
    max_len = max([pred.shape[1] for pred in preds])
    for i, pred in enumerate(preds):
        bs = pred.shape[0]
        p = np.zeros((bs, max_len, 1))
        p[:,:pred.shape[1],:] = pred
        predictions[i] = p
    predictions = np.concatenate(predictions)
    predictions = predictions.reshape((-1, max_len))
    return predictions

In [29]:
def train_mlm_fn(fold, train_loader, model, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, inputs in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        batch_size = inputs['labels'].size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            outputs = model(**inputs)
        loss = outputs.loss
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_mlm_fn(valid_loader, model, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, inputs in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        batch_size = inputs['labels'].size(0)
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs.loss
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    return losses.avg

In [30]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)

    pseudo_df = get_sampled_pseudo_df(fold, sample_score=1.0, positive_th=0.5, max_samples=10)
    train_folds['pseudo_flag'] = 0
    pseudo_df['pseudo_flag'] = 1
    train_folds = pd.concat([train_folds, pseudo_df]).reset_index(drop=True)
    
    train_dataset = TrainDataset(CFG, train_folds, p_aug=CFG.p_aug)
    valid_dataset = ValidDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    if CFG.mlm_dir is not None:
        model = CustomModel(CFG, config_path=None, pretrained=False)
        state = torch.load(CFG.mlm_dir+f"{CFG.model.replace('/', '-')}_mlm_fold{fold}_best.pth", map_location=torch.device('cpu'))
        model.load_state_dict(state['model'], strict=False)
        del state; gc.collect()
    else:
        model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, group_step=1, lr_scale=1.0, weight_decay=0.0):
        num_hidden_layers = model.config.num_hidden_layers
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        groups = np.array([f'layer.{i}.' for i in range(num_hidden_layers)]).reshape(-1, group_step).tolist()
        optimizer_parameters = []
        for i, group in enumerate(groups):
            lr_factor = (num_hidden_layers - (i * lr_scale)) / num_hidden_layers
            optimizer_parameters.append(
                {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group)], 'weight_decay': weight_decay,
                 'lr': encoder_lr * lr_factor}
            )
            optimizer_parameters.append(
                {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group)], 'weight_decay': 0.0,
                 'lr': encoder_lr * lr_factor}
            )
        optimizer_parameters.append(
            {'params': [p for n, p in model.named_parameters() if "model" not in n], 'weight_decay': 0.0,
             'lr': decoder_lr, "momentum" : 0.99}
        )
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                group_step=CFG.group_step,
                                                lr_scale=CFG.lr_scale,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    if CFG.loss == 'bce':
        criterion = SmoothFocalLoss(reduction='none', alpha=CFG.alpha, gamma=CFG.gamma, smoothing=CFG.smoothing)
    elif CFG.loss == 'ce':
        criterion = CEFocalLoss(reduction='none', alpha=CFG.alpha, gamma=CFG.gamma)
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        if CFG.p_aug_epoch < epoch + 1:
            train_dataset = TrainDataset(CFG, train_folds, p_aug=0.)
            train_loader = DataLoader(
                train_dataset,
                batch_size=CFG.batch_size,
                shuffle=True,
                num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        
        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))
        
        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        th, score = compute_best_th(valid_labels, char_probs)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  th: {th}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [31]:
# ====================================================
# train mlm loop
# ====================================================
def train_mlm_loop(fold):

    # ====================================================
    # loader
    # ====================================================
    args = CFG()
    args.train_file = f"trn_mlm_data_fold{fold}.csv"
    args.validation_file = f"val_mlm_data_fold{fold}.csv"
    data_files = {'train': OUTPUT_DIR+args.train_file, 'validation': OUTPUT_DIR+args.validation_file}
    raw_datasets = load_dataset(args.extension, data_files=data_files)
    LOGGER.info(f'raw_datasets: {raw_datasets}')
    
    max_seq_length = args.max_seq_length
    LOGGER.info(f"max_seq_length: {max_seq_length}")

    def tokenize_function(examples):
        return tokenizer(examples["text"], return_special_tokens_mask=True)

    tokenized_datasets = raw_datasets.map(tokenize_function,
                                          batched=True,
                                          num_proc=args.num_workers,
                                          remove_columns=["text"],
                                          load_from_cache_file=not args.overwrite_cache)
    LOGGER.info(f"tokenized_datasets: {tokenized_datasets}")
    
    def group_texts(examples):
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        total_length = (total_length // max_seq_length) * max_seq_length
        result = {
            k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
            for k, t in concatenated_examples.items()
        }
        return result

    tokenized_datasets = tokenized_datasets.map(group_texts,
                                                batched=True,
                                                num_proc=args.num_workers,
                                                load_from_cache_file=not args.overwrite_cache)
    train_dataset = tokenized_datasets["train"]
    valid_dataset = tokenized_datasets["validation"]
    LOGGER.info(f"train_dataset: {train_dataset}  valid_dataset: {valid_dataset}")

    data_collator = DataCollatorForLanguageModeling(tokenizer=args.tokenizer,
                                                    mlm_probability=args.mlm_probability)
    train_loader = DataLoader(train_dataset,
                              shuffle=True,
                              collate_fn=data_collator,
                              batch_size=args.batch_size)
    valid_loader = DataLoader(valid_dataset,
                              shuffle=False,
                              collate_fn=data_collator,
                              batch_size=args.batch_size)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModelForMaskedLM(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, group_step=1, lr_scale=1.0, weight_decay=0.0):
        num_hidden_layers = model.config.num_hidden_layers
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        groups = np.array([f'layer.{i}.' for i in range(num_hidden_layers)]).reshape(-1, group_step).tolist()
        optimizer_parameters = []
        for i, group in enumerate(groups):
            lr_factor = (num_hidden_layers - (i * lr_scale)) / num_hidden_layers
            optimizer_parameters.append(
                {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group)], 'weight_decay': weight_decay,
                 'lr': encoder_lr * lr_factor}
            )
            optimizer_parameters.append(
                {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group)], 'weight_decay': 0.0,
                 'lr': encoder_lr * lr_factor}
            )
        optimizer_parameters.append(
            {'params': [p for n, p in model.named_parameters() if "model" not in n], 'weight_decay': 0.0,
             'lr': decoder_lr, "momentum" : 0.99}
        )
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                group_step=CFG.group_step,
                                                lr_scale=CFG.lr_scale,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_dataset) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    best_loss = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_mlm_fn(fold, train_loader, model, optimizer, epoch, scheduler, device)
        
        # eval
        avg_val_loss = valid_mlm_fn(valid_loader, model, device)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss})
        
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
            torch.save({'model': model.state_dict()},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_mlm_fold{fold}_best.pth")

    torch.cuda.empty_cache()
    gc.collect()

In [32]:
# ====================================================
# pseudo labeling
# ====================================================
def pseudo_labeling(fold, debug=False):

    # ====================================================
    # loader
    # ====================================================
    if debug:
        oof_df = pd.read_pickle(CFG.finetune_dir+'oof_df.pkl')

        # NEW
        oof_df['feature_text'] = oof_df['feature_text'].str.lower()
        oof_df['pn_history'] = oof_df['pn_history'].str.lower()

        oof_df = oof_df[oof_df['fold'] == fold].reset_index(drop=True)
        oof_df['tokenize_length'] = [len(CFG.tokenizer(text, feature_text)['input_ids']) for text, feature_text 
                                        in zip(oof_df['pn_history'].values, oof_df['feature_text'].values)]
        oof_df = oof_df.sort_values('tokenize_length', ascending=True).reset_index(drop=True)

        truths = create_labels_for_scoring(oof_df)
        char_probs = get_char_probs(oof_df['pn_history'].values, oof_df[[i for i in range(CFG.max_len)]].values, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(preds, truths)
        LOGGER.info(f"Fold{fold} Saved Score: {score:.5f}")

        test_dataset = TestDataset(CFG, oof_df)
        test_loader = DataLoader(
            test_dataset,
            batch_size=CFG.test_batch_size,
            shuffle=False,
            collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
            num_workers=CFG.num_workers, pin_memory=True, drop_last=False,
        )
    else:
        pseudo_df = pd.read_pickle('./drive/MyDrive/Colab Notebooks/NBME/pseudo/pseudo_plain.pkl')

        # NEW
        pseudo_df['feature_text'] = pseudo_df['feature_text'].str.lower()
        pseudo_df['pn_history'] = pseudo_df['pn_history'].str.lower()

        pseudo_df['tokenize_length'] = [len(CFG.tokenizer(text, feature_text)['input_ids']) for text, feature_text 
                                        in zip(pseudo_df['pn_history'].values, pseudo_df['feature_text'].values)]
        pseudo_df = pseudo_df.sort_values('tokenize_length', ascending=True).reset_index(drop=True)
        LOGGER.info(f"pseudo_df['tokenize_length'].max(): {pseudo_df['tokenize_length'].max()}")

        test_dataset = TestDataset(CFG, pseudo_df)
        test_loader = DataLoader(
            test_dataset,
            batch_size=CFG.test_batch_size,
            shuffle=False,
            collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
            num_workers=CFG.num_workers, pin_memory=True, drop_last=False,
        )

    # ====================================================
    # model
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=False)
    state = torch.load(CFG.finetune_dir+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", map_location=torch.device('cpu'))
    model.load_state_dict(state['model'], strict=False)
    del state; gc.collect()
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    # ====================================================
    # inference
    # ====================================================
    predictions = inference_fn(test_loader, model, device)

    if debug:
        oof_df[[i for i in range(predictions.shape[1])]] = predictions
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(preds, truths)
        LOGGER.info(f"Fold{fold} Reproduced Score: {score:.5f}")
    else:
        pseudo_df[[i for i in range(predictions.shape[1])]] = predictions
        pseudo_df.to_pickle(OUTPUT_DIR+f'pseudo_df_fold{fold}.pkl')

    del model, predictions
    torch.cuda.empty_cache()
    gc.collect()

In [33]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        th, score = compute_best_th(labels, char_probs)
        LOGGER.info(f'Score: {score:<.4f}  th: {th}')
        
    if CFG.mlm:
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                train_mlm_loop(fold)
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')

    if CFG.pseudo:
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                pseudo_labeling(fold, debug=False)

    if CFG.wandb:
        wandb.finish()



  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

hallucinations-after-taking-ambien({'score': 0.0, 'num_labels': 1}) has 9 pseudo positive labels
stress({'score': 0.3636363636363636, 'num_labels': 1}) has 17 pseudo positive labels
sampled by max_samples -> stress({'score': 0.3636363636363636, 'num_labels': 1}) has 10 pseudo positive labels
no-premenstrual-symptoms({'score': 0.384, 'num_labels': 11}) has 108 pseudo positive labels
sampled by max_samples -> no-premenstrual-symptoms({'score': 0.384, 'num_labels': 11}) has 10 pseudo positive labels
meningococcal-vaccine-status-unknown({'score': 0.4403669724770642, 'num_labels': 2}) has 107 pseudo positive labels
sampled by max_samples -> meningococcal-vaccine-status-unknown({'score': 0.4403669724770642, 'num_labels': 2}) has 10 pseudo positive labels
prior-episodes-of-diarrhea({'score': 0.67720424796246, 'num_labels': 96}) has 705 pseudo positive labels
sampled by max_samples -> prior-episodes-of-diarrhea({'score': 0.67720424796246, 'num_labels': 96}) has 10 pseudo positive labels
unsucc

Epoch 1 - avg_train_loss: 0.0040  avg_val_loss: 0.0047  time: 1700s
Epoch 1 - Score: 0.8882  th: 0.5
Epoch 1 - Save Best Score: 0.8882 Model


Epoch: [2][0/5008] Elapsed 0m 0s (remain 65m 25s) Loss: 0.0008(0.0008) Grad: 2697.8252  LR: 0.00001707  
Epoch: [2][100/5008] Elapsed 0m 34s (remain 27m 42s) Loss: 0.0004(0.0021) Grad: 559.9632  LR: 0.00001696  
Epoch: [2][200/5008] Elapsed 1m 6s (remain 26m 39s) Loss: 0.0014(0.0020) Grad: 5364.1489  LR: 0.00001685  
Epoch: [2][300/5008] Elapsed 1m 38s (remain 25m 47s) Loss: 0.0012(0.0019) Grad: 1353.7357  LR: 0.00001673  
Epoch: [2][400/5008] Elapsed 2m 11s (remain 25m 7s) Loss: 0.0017(0.0020) Grad: 1186.6547  LR: 0.00001661  
Epoch: [2][500/5008] Elapsed 2m 43s (remain 24m 28s) Loss: 0.0026(0.0020) Grad: 7999.1724  LR: 0.00001650  
Epoch: [2][600/5008] Elapsed 3m 15s (remain 23m 51s) Loss: 0.0010(0.0020) Grad: 2395.2588  LR: 0.00001638  
Epoch: [2][700/5008] Elapsed 3m 47s (remain 23m 16s) Loss: 0.0006(0.0020) Grad: 2449.5466  LR: 0.00001625  
Epoch: [2][800/5008] Elapsed 4m 19s (remain 22m 42s) Loss: 0.0036(0.0020) Grad: 2165.0696  LR: 0.00001613  
Epoch: [2][900/5008] Elapsed 4m 51

Epoch 2 - avg_train_loss: 0.0019  avg_val_loss: 0.0045  time: 1703s
Epoch 2 - Score: 0.8874  th: 0.5


Epoch: [3][0/5008] Elapsed 0m 0s (remain 59m 16s) Loss: 0.0008(0.0008) Grad: 568.2717  LR: 0.00001000  
Epoch: [3][100/5008] Elapsed 0m 32s (remain 26m 26s) Loss: 0.0003(0.0019) Grad: 299.2139  LR: 0.00000984  
Epoch: [3][200/5008] Elapsed 1m 4s (remain 25m 44s) Loss: 0.0047(0.0017) Grad: 6128.8687  LR: 0.00000969  
Epoch: [3][300/5008] Elapsed 1m 36s (remain 25m 9s) Loss: 0.0001(0.0016) Grad: 364.9572  LR: 0.00000953  
Epoch: [3][400/5008] Elapsed 2m 8s (remain 24m 35s) Loss: 0.0001(0.0016) Grad: 361.6916  LR: 0.00000937  
Epoch: [3][500/5008] Elapsed 2m 40s (remain 24m 2s) Loss: 0.0002(0.0015) Grad: 359.1866  LR: 0.00000922  
Epoch: [3][600/5008] Elapsed 3m 12s (remain 23m 30s) Loss: 0.0002(0.0016) Grad: 556.8608  LR: 0.00000906  
Epoch: [3][700/5008] Elapsed 3m 44s (remain 22m 59s) Loss: 0.0002(0.0016) Grad: 221.8153  LR: 0.00000891  
Epoch: [3][800/5008] Elapsed 4m 16s (remain 22m 26s) Loss: 0.0020(0.0017) Grad: 3425.7085  LR: 0.00000875  
Epoch: [3][900/5008] Elapsed 4m 48s (remai

Epoch 3 - avg_train_loss: 0.0016  avg_val_loss: 0.0045  time: 1700s
Epoch 3 - Score: 0.8914  th: 0.5
Epoch 3 - Save Best Score: 0.8914 Model


Epoch: [4][0/5008] Elapsed 0m 0s (remain 68m 31s) Loss: 0.0010(0.0010) Grad: 925.6389  LR: 0.00000293  
Epoch: [4][100/5008] Elapsed 0m 33s (remain 27m 19s) Loss: 0.0003(0.0011) Grad: 291.9099  LR: 0.00000282  
Epoch: [4][200/5008] Elapsed 1m 6s (remain 26m 20s) Loss: 0.0050(0.0012) Grad: 2844.8813  LR: 0.00000271  
Epoch: [4][300/5008] Elapsed 1m 37s (remain 25m 32s) Loss: 0.0032(0.0014) Grad: 777.5075  LR: 0.00000261  
Epoch: [4][400/5008] Elapsed 2m 9s (remain 24m 52s) Loss: 0.0006(0.0015) Grad: 278.2370  LR: 0.00000250  
Epoch: [4][500/5008] Elapsed 2m 41s (remain 24m 16s) Loss: 0.0017(0.0015) Grad: 895.6863  LR: 0.00000240  
Epoch: [4][600/5008] Elapsed 3m 13s (remain 23m 40s) Loss: 0.0009(0.0015) Grad: 1932.6871  LR: 0.00000230  
Epoch: [4][700/5008] Elapsed 3m 45s (remain 23m 6s) Loss: 0.0005(0.0014) Grad: 366.2957  LR: 0.00000220  
Epoch: [4][800/5008] Elapsed 4m 17s (remain 22m 33s) Loss: 0.0030(0.0014) Grad: 7188.7598  LR: 0.00000210  
Epoch: [4][900/5008] Elapsed 4m 49s (rem

Epoch 4 - avg_train_loss: 0.0014  avg_val_loss: 0.0047  time: 1698s
Epoch 4 - Score: 0.8908  th: 0.5
Score: 0.8914  th: 0.5


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

hallucinations-after-taking-ambien({'score': 0.0, 'num_labels': 1}) has 4094 pseudo positive labels
sampled by max_samples -> hallucinations-after-taking-ambien({'score': 0.0, 'num_labels': 1}) has 10 pseudo positive labels
stress({'score': 0.3636363636363636, 'num_labels': 1}) has 26 pseudo positive labels
sampled by max_samples -> stress({'score': 0.3636363636363636, 'num_labels': 1}) has 10 pseudo positive labels
no-premenstrual-symptoms({'score': 0.384, 'num_labels': 11}) has 1789 pseudo positive labels
sampled by max_samples -> no-premenstrual-symptoms({'score': 0.384, 'num_labels': 11}) has 10 pseudo positive labels
meningococcal-vaccine-status-unknown({'score': 0.4403669724770642, 'num_labels': 2}) has 101 pseudo positive labels
sampled by max_samples -> meningococcal-vaccine-status-unknown({'score': 0.4403669724770642, 'num_labels': 2}) has 10 pseudo positive labels
prior-episodes-of-diarrhea({'score': 0.67720424796246, 'num_labels': 96}) has 708 pseudo positive labels
sampled 

Epoch 1 - avg_train_loss: 0.0039  avg_val_loss: 0.0042  time: 1692s
Epoch 1 - Score: 0.8800  th: 0.5
Epoch 1 - Save Best Score: 0.8800 Model


Epoch: [2][0/4982] Elapsed 0m 0s (remain 67m 55s) Loss: 0.0037(0.0037) Grad: 7431.6943  LR: 0.00001707  
Epoch: [2][100/4982] Elapsed 0m 33s (remain 26m 54s) Loss: 0.0001(0.0017) Grad: 376.5298  LR: 0.00001696  
Epoch: [2][200/4982] Elapsed 1m 5s (remain 26m 7s) Loss: 0.0012(0.0018) Grad: 2828.0093  LR: 0.00001684  
Epoch: [2][300/4982] Elapsed 1m 37s (remain 25m 22s) Loss: 0.0016(0.0017) Grad: 1295.5312  LR: 0.00001673  
Epoch: [2][400/4982] Elapsed 2m 9s (remain 24m 43s) Loss: 0.0002(0.0018) Grad: 461.6565  LR: 0.00001661  
Epoch: [2][500/4982] Elapsed 2m 42s (remain 24m 10s) Loss: 0.0005(0.0018) Grad: 571.7423  LR: 0.00001649  
Epoch: [2][600/4982] Elapsed 3m 14s (remain 23m 34s) Loss: 0.0025(0.0018) Grad: 1381.9872  LR: 0.00001637  
Epoch: [2][700/4982] Elapsed 3m 46s (remain 23m 0s) Loss: 0.0012(0.0017) Grad: 3846.6846  LR: 0.00001625  
Epoch: [2][800/4982] Elapsed 4m 18s (remain 22m 26s) Loss: 0.0003(0.0017) Grad: 300.2839  LR: 0.00001612  
Epoch: [2][900/4982] Elapsed 4m 49s (re

Epoch 2 - avg_train_loss: 0.0018  avg_val_loss: 0.0050  time: 1695s
Epoch 2 - Score: 0.8823  th: 0.5
Epoch 2 - Save Best Score: 0.8823 Model


Epoch: [3][0/4982] Elapsed 0m 0s (remain 66m 20s) Loss: 0.0006(0.0006) Grad: 1727.1110  LR: 0.00001000  
Epoch: [3][100/4982] Elapsed 0m 33s (remain 27m 17s) Loss: 0.0019(0.0016) Grad: 2582.2483  LR: 0.00000984  
Epoch: [3][200/4982] Elapsed 1m 6s (remain 26m 16s) Loss: 0.0024(0.0015) Grad: 2125.7864  LR: 0.00000968  
Epoch: [3][300/4982] Elapsed 1m 38s (remain 25m 27s) Loss: 0.0043(0.0014) Grad: 7016.9507  LR: 0.00000953  
Epoch: [3][400/4982] Elapsed 2m 10s (remain 24m 46s) Loss: 0.0021(0.0015) Grad: 618.3480  LR: 0.00000937  
Epoch: [3][500/4982] Elapsed 2m 42s (remain 24m 9s) Loss: 0.0004(0.0015) Grad: 3430.7124  LR: 0.00000921  
Epoch: [3][600/4982] Elapsed 3m 13s (remain 23m 33s) Loss: 0.0008(0.0015) Grad: 313.3532  LR: 0.00000905  
Epoch: [3][700/4982] Elapsed 3m 45s (remain 22m 59s) Loss: 0.0010(0.0015) Grad: 1013.9958  LR: 0.00000890  
Epoch: [3][800/4982] Elapsed 4m 17s (remain 22m 25s) Loss: 0.0003(0.0015) Grad: 301.8095  LR: 0.00000874  
Epoch: [3][900/4982] Elapsed 4m 49s 

Epoch 3 - avg_train_loss: 0.0016  avg_val_loss: 0.0049  time: 1690s
Epoch 3 - Score: 0.8841  th: 0.5
Epoch 3 - Save Best Score: 0.8841 Model


Epoch: [4][0/4982] Elapsed 0m 0s (remain 66m 56s) Loss: 0.0022(0.0022) Grad: 5220.1138  LR: 0.00000293  
Epoch: [4][100/4982] Elapsed 0m 33s (remain 27m 19s) Loss: 0.0002(0.0016) Grad: 258.0656  LR: 0.00000282  
Epoch: [4][200/4982] Elapsed 1m 6s (remain 26m 22s) Loss: 0.0006(0.0016) Grad: 698.4146  LR: 0.00000271  
Epoch: [4][300/4982] Elapsed 1m 38s (remain 25m 32s) Loss: 0.0008(0.0015) Grad: 422.6472  LR: 0.00000260  
Epoch: [4][400/4982] Elapsed 2m 10s (remain 24m 51s) Loss: 0.0012(0.0015) Grad: 249.7075  LR: 0.00000250  
Epoch: [4][500/4982] Elapsed 2m 42s (remain 24m 13s) Loss: 0.0072(0.0015) Grad: 7883.2354  LR: 0.00000239  
Epoch: [4][600/4982] Elapsed 3m 14s (remain 23m 38s) Loss: 0.0007(0.0015) Grad: 941.4023  LR: 0.00000229  
Epoch: [4][700/4982] Elapsed 3m 46s (remain 23m 3s) Loss: 0.0001(0.0015) Grad: 265.5974  LR: 0.00000219  
Epoch: [4][800/4982] Elapsed 4m 18s (remain 22m 29s) Loss: 0.0011(0.0015) Grad: 2068.0684  LR: 0.00000210  
Epoch: [4][900/4982] Elapsed 4m 50s (re

Epoch 4 - avg_train_loss: 0.0014  avg_val_loss: 0.0051  time: 1693s
Epoch 4 - Score: 0.8847  th: 0.5
Epoch 4 - Save Best Score: 0.8847 Model
Score: 0.8847  th: 0.5


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

hallucinations-after-taking-ambien({'score': 0.0, 'num_labels': 1}) has 840 pseudo positive labels
sampled by max_samples -> hallucinations-after-taking-ambien({'score': 0.0, 'num_labels': 1}) has 10 pseudo positive labels
stress({'score': 0.3636363636363636, 'num_labels': 1}) has 34 pseudo positive labels
sampled by max_samples -> stress({'score': 0.3636363636363636, 'num_labels': 1}) has 10 pseudo positive labels
no-premenstrual-symptoms({'score': 0.384, 'num_labels': 11}) has 875 pseudo positive labels
sampled by max_samples -> no-premenstrual-symptoms({'score': 0.384, 'num_labels': 11}) has 10 pseudo positive labels
meningococcal-vaccine-status-unknown({'score': 0.4403669724770642, 'num_labels': 2}) has 4582 pseudo positive labels
sampled by max_samples -> meningococcal-vaccine-status-unknown({'score': 0.4403669724770642, 'num_labels': 2}) has 10 pseudo positive labels
prior-episodes-of-diarrhea({'score': 0.67720424796246, 'num_labels': 96}) has 706 pseudo positive labels
sampled b

Epoch 1 - avg_train_loss: 0.0040  avg_val_loss: 0.0040  time: 1714s
Epoch 1 - Score: 0.8868  th: 0.5
Epoch 1 - Save Best Score: 0.8868 Model


Epoch: [2][0/5042] Elapsed 0m 0s (remain 70m 36s) Loss: 0.0012(0.0012) Grad: 413.3121  LR: 0.00001707  
Epoch: [2][100/5042] Elapsed 0m 33s (remain 27m 38s) Loss: 0.0026(0.0019) Grad: 3194.6289  LR: 0.00001696  
Epoch: [2][200/5042] Elapsed 1m 6s (remain 26m 41s) Loss: 0.0005(0.0019) Grad: 675.7823  LR: 0.00001685  
Epoch: [2][300/5042] Elapsed 1m 38s (remain 25m 52s) Loss: 0.0002(0.0018) Grad: 634.8412  LR: 0.00001673  
Epoch: [2][400/5042] Elapsed 2m 10s (remain 25m 11s) Loss: 0.0022(0.0019) Grad: 2991.7053  LR: 0.00001662  
Epoch: [2][500/5042] Elapsed 2m 42s (remain 24m 36s) Loss: 0.0010(0.0018) Grad: 3319.1350  LR: 0.00001650  
Epoch: [2][600/5042] Elapsed 3m 14s (remain 24m 0s) Loss: 0.0016(0.0019) Grad: 1873.3019  LR: 0.00001638  
Epoch: [2][700/5042] Elapsed 3m 46s (remain 23m 25s) Loss: 0.0007(0.0018) Grad: 3032.3638  LR: 0.00001626  
Epoch: [2][800/5042] Elapsed 4m 18s (remain 22m 50s) Loss: 0.0013(0.0019) Grad: 4132.5068  LR: 0.00001614  
Epoch: [2][900/5042] Elapsed 4m 50s 

Epoch 2 - avg_train_loss: 0.0018  avg_val_loss: 0.0038  time: 1717s
Epoch 2 - Score: 0.8937  th: 0.5
Epoch 2 - Save Best Score: 0.8937 Model


Epoch: [3][0/5042] Elapsed 0m 0s (remain 65m 42s) Loss: 0.0006(0.0006) Grad: 333.0442  LR: 0.00001000  
Epoch: [3][100/5042] Elapsed 0m 34s (remain 27m 43s) Loss: 0.0021(0.0015) Grad: 5987.0947  LR: 0.00000984  
Epoch: [3][200/5042] Elapsed 1m 6s (remain 26m 45s) Loss: 0.0006(0.0015) Grad: 2646.1179  LR: 0.00000969  
Epoch: [3][300/5042] Elapsed 1m 38s (remain 25m 54s) Loss: 0.0010(0.0015) Grad: 895.1393  LR: 0.00000953  
Epoch: [3][400/5042] Elapsed 2m 10s (remain 25m 12s) Loss: 0.0029(0.0015) Grad: 1378.4325  LR: 0.00000938  
Epoch: [3][500/5042] Elapsed 2m 42s (remain 24m 34s) Loss: 0.0002(0.0015) Grad: 446.3468  LR: 0.00000922  
Epoch: [3][600/5042] Elapsed 3m 14s (remain 23m 59s) Loss: 0.0015(0.0016) Grad: 2924.4875  LR: 0.00000907  
Epoch: [3][700/5042] Elapsed 3m 46s (remain 23m 24s) Loss: 0.0032(0.0015) Grad: 2038.1512  LR: 0.00000891  
Epoch: [3][800/5042] Elapsed 4m 18s (remain 22m 50s) Loss: 0.0013(0.0015) Grad: 746.7685  LR: 0.00000876  
Epoch: [3][900/5042] Elapsed 4m 51s 

Epoch 3 - avg_train_loss: 0.0016  avg_val_loss: 0.0039  time: 1716s
Epoch 3 - Score: 0.8927  th: 0.5


Epoch: [4][0/5042] Elapsed 0m 0s (remain 63m 39s) Loss: 0.0010(0.0010) Grad: 818.1074  LR: 0.00000293  
Epoch: [4][100/5042] Elapsed 0m 32s (remain 26m 44s) Loss: 0.0016(0.0013) Grad: 502.5338  LR: 0.00000282  
Epoch: [4][200/5042] Elapsed 1m 4s (remain 26m 1s) Loss: 0.0010(0.0013) Grad: 1442.9409  LR: 0.00000271  
Epoch: [4][300/5042] Elapsed 1m 36s (remain 25m 25s) Loss: 0.0019(0.0014) Grad: 662.0673  LR: 0.00000261  
Epoch: [4][400/5042] Elapsed 2m 8s (remain 24m 51s) Loss: 0.0004(0.0014) Grad: 316.5270  LR: 0.00000250  
Epoch: [4][500/5042] Elapsed 2m 40s (remain 24m 18s) Loss: 0.0001(0.0014) Grad: 177.3092  LR: 0.00000240  
Epoch: [4][600/5042] Elapsed 3m 12s (remain 23m 46s) Loss: 0.0008(0.0014) Grad: 799.3568  LR: 0.00000230  
Epoch: [4][700/5042] Elapsed 3m 45s (remain 23m 13s) Loss: 0.0004(0.0014) Grad: 400.3608  LR: 0.00000220  
Epoch: [4][800/5042] Elapsed 4m 17s (remain 22m 41s) Loss: 0.0004(0.0014) Grad: 292.0172  LR: 0.00000210  
Epoch: [4][900/5042] Elapsed 4m 49s (remai

Epoch 4 - avg_train_loss: 0.0015  avg_val_loss: 0.0039  time: 1715s
Epoch 4 - Score: 0.8935  th: 0.5
Score: 0.8937  th: 0.5


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

hallucinations-after-taking-ambien({'score': 0.0, 'num_labels': 1}) has 1341 pseudo positive labels
sampled by max_samples -> hallucinations-after-taking-ambien({'score': 0.0, 'num_labels': 1}) has 10 pseudo positive labels
stress({'score': 0.3636363636363636, 'num_labels': 1}) has 17 pseudo positive labels
sampled by max_samples -> stress({'score': 0.3636363636363636, 'num_labels': 1}) has 10 pseudo positive labels
no-premenstrual-symptoms({'score': 0.384, 'num_labels': 11}) has 176 pseudo positive labels
sampled by max_samples -> no-premenstrual-symptoms({'score': 0.384, 'num_labels': 11}) has 10 pseudo positive labels
meningococcal-vaccine-status-unknown({'score': 0.4403669724770642, 'num_labels': 2}) has 158 pseudo positive labels
sampled by max_samples -> meningococcal-vaccine-status-unknown({'score': 0.4403669724770642, 'num_labels': 2}) has 10 pseudo positive labels
prior-episodes-of-diarrhea({'score': 0.67720424796246, 'num_labels': 96}) has 700 pseudo positive labels
sampled b

Epoch 1 - avg_train_loss: 0.0040  avg_val_loss: 0.0045  time: 1702s
Epoch 1 - Score: 0.8868  th: 0.5
Epoch 1 - Save Best Score: 0.8868 Model


Epoch: [2][0/4993] Elapsed 0m 0s (remain 73m 24s) Loss: 0.0002(0.0002) Grad: 250.7558  LR: 0.00001707  
Epoch: [2][100/4993] Elapsed 0m 33s (remain 27m 19s) Loss: 0.0014(0.0015) Grad: 480.4092  LR: 0.00001696  
Epoch: [2][200/4993] Elapsed 1m 6s (remain 26m 27s) Loss: 0.0042(0.0017) Grad: 3715.0344  LR: 0.00001684  
Epoch: [2][300/4993] Elapsed 1m 38s (remain 25m 38s) Loss: 0.0006(0.0017) Grad: 418.0899  LR: 0.00001673  
Epoch: [2][400/4993] Elapsed 2m 10s (remain 24m 56s) Loss: 0.0084(0.0018) Grad: 6137.1016  LR: 0.00001661  
Epoch: [2][500/4993] Elapsed 2m 42s (remain 24m 19s) Loss: 0.0018(0.0019) Grad: 792.0134  LR: 0.00001649  
Epoch: [2][600/4993] Elapsed 3m 15s (remain 23m 45s) Loss: 0.0018(0.0019) Grad: 3023.2935  LR: 0.00001637  
Epoch: [2][700/4993] Elapsed 3m 47s (remain 23m 11s) Loss: 0.0006(0.0020) Grad: 906.6857  LR: 0.00001625  
Epoch: [2][800/4993] Elapsed 4m 19s (remain 22m 37s) Loss: 0.0005(0.0019) Grad: 1131.8177  LR: 0.00001613  
Epoch: [2][900/4993] Elapsed 4m 51s (

Epoch 2 - avg_train_loss: 0.0018  avg_val_loss: 0.0038  time: 1703s
Epoch 2 - Score: 0.8910  th: 0.5
Epoch 2 - Save Best Score: 0.8910 Model


Epoch: [3][0/4993] Elapsed 0m 0s (remain 71m 43s) Loss: 0.0009(0.0009) Grad: 656.5357  LR: 0.00001000  
Epoch: [3][100/4993] Elapsed 0m 34s (remain 27m 35s) Loss: 0.0005(0.0016) Grad: 322.2943  LR: 0.00000984  
Epoch: [3][200/4993] Elapsed 1m 6s (remain 26m 35s) Loss: 0.0005(0.0016) Grad: 587.0912  LR: 0.00000969  
Epoch: [3][300/4993] Elapsed 1m 39s (remain 25m 43s) Loss: 0.0027(0.0017) Grad: 1319.6512  LR: 0.00000953  
Epoch: [3][400/4993] Elapsed 2m 11s (remain 25m 0s) Loss: 0.0005(0.0017) Grad: 517.8616  LR: 0.00000937  
Epoch: [3][500/4993] Elapsed 2m 43s (remain 24m 22s) Loss: 0.0015(0.0017) Grad: 2068.9551  LR: 0.00000921  
Epoch: [3][600/4993] Elapsed 3m 15s (remain 23m 46s) Loss: 0.0007(0.0016) Grad: 5159.2568  LR: 0.00000906  
Epoch: [3][700/4993] Elapsed 3m 47s (remain 23m 11s) Loss: 0.0017(0.0016) Grad: 1296.6718  LR: 0.00000890  
Epoch: [3][800/4993] Elapsed 4m 19s (remain 22m 37s) Loss: 0.0010(0.0016) Grad: 1204.8062  LR: 0.00000875  
Epoch: [3][900/4993] Elapsed 4m 51s (

Epoch 3 - avg_train_loss: 0.0016  avg_val_loss: 0.0040  time: 1703s
Epoch 3 - Score: 0.8943  th: 0.5
Epoch 3 - Save Best Score: 0.8943 Model


Epoch: [4][0/4993] Elapsed 0m 0s (remain 62m 56s) Loss: 0.0026(0.0026) Grad: 5008.5405  LR: 0.00000293  
Epoch: [4][100/4993] Elapsed 0m 34s (remain 27m 42s) Loss: 0.0002(0.0013) Grad: 638.1159  LR: 0.00000282  
Epoch: [4][200/4993] Elapsed 1m 7s (remain 26m 39s) Loss: 0.0021(0.0014) Grad: 3135.9512  LR: 0.00000271  
Epoch: [4][300/4993] Elapsed 1m 39s (remain 25m 45s) Loss: 0.0016(0.0014) Grad: 2765.9355  LR: 0.00000260  
Epoch: [4][400/4993] Elapsed 2m 11s (remain 25m 1s) Loss: 0.0040(0.0014) Grad: 6905.7939  LR: 0.00000250  
Epoch: [4][500/4993] Elapsed 2m 43s (remain 24m 23s) Loss: 0.0012(0.0014) Grad: 2824.9089  LR: 0.00000240  
Epoch: [4][600/4993] Elapsed 3m 15s (remain 23m 47s) Loss: 0.0030(0.0013) Grad: 1465.7169  LR: 0.00000229  
Epoch: [4][700/4993] Elapsed 3m 47s (remain 23m 12s) Loss: 0.0006(0.0014) Grad: 715.0070  LR: 0.00000220  
Epoch: [4][800/4993] Elapsed 4m 19s (remain 22m 37s) Loss: 0.0003(0.0014) Grad: 2908.9473  LR: 0.00000210  
Epoch: [4][900/4993] Elapsed 4m 51s

Epoch 4 - avg_train_loss: 0.0015  avg_val_loss: 0.0040  time: 1703s
Epoch 4 - Score: 0.8946  th: 0.5
Epoch 4 - Save Best Score: 0.8946 Model
Score: 0.8946  th: 0.5
Score: 0.8911  th: 0.5
