## Vinh Nguyen - 300473488
<br>
<br>

# Library

In [4]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')

import tokenizers
import transformers
from transformers import AutoTokenizer, PreTrainedTokenizerFast, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Found existing installation: transformers 4.12.5
Uninstalling transformers-4.12.5:
  Successfully uninstalled transformers-4.12.5




Looking in links: ../input/nbme-pip-wheels
Processing /kaggle/input/nbme-pip-wheels/transformers-4.16.2-py3-none-any.whl
Installing collected packages: transformers


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.8.0 requires transformers<4.13,>=4.1, but you have transformers 4.16.2 which is incompatible.


Successfully installed transformers-4.16.2
env: TOKENIZERS_PARALLELISM=true


# Directory settings

In [1]:
OUTPUT_DIR = './results/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
class CFG:
    wandb=True
    competition='NBME'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=1000
    num_workers=4
    model="microsoft/deberta-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=12
    fc_dropout=0.2
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=20
    n_fold=3
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 1
    CFG.trn_fold = [0]

In [3]:
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"

    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='NBME-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


# Helper functions for scoring

In [5]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """

    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [6]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions

# Utils

In [7]:
def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [8]:
features = pd.read_csv('../input/nbme-processed/features.csv')
features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"

patient_notes = pd.read_csv('../input/nbme-processed/patient_notes.csv')

In [9]:
train = pd.read_csv('../input/nbme-processed/train_processed.csv')
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)

# CV split

In [10]:
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    3531
1    3535
2    3538
dtype: int64

In [11]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

# Dataset

In [13]:
for text_col in ['pn_history']:
    pn_history_lengths = []
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        pn_history_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

for text_col in ['feature_text']:
    features_lengths = []
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/42146 [00:00<?, ?it/s]

pn_history max(lengths): 433


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 30
max_len: 466


In [14]:
def prepare_input(cfg, text, feature_text):
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=CFG.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    return torch.tensor(label, dtype=torch.float)


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.pn_historys[item], 
                               self.feature_texts[item])
        label = create_label(self.cfg, 
                             self.pn_historys[item], 
                             self.annotation_lengths[item], 
                             self.locations[item])
        return inputs, label

# Model

In [15]:
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# Helpler functions

In [16]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [17]:
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))
        
        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"model_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"model_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

# Main

In [18]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        
    if CFG.wandb:
        wandb.finish()



Downloading:   0%|          | 0.00/533M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/589] Elapsed 0m 2s (remain 23m 11s) Loss: 0.6061(0.6061) Grad: inf  LR: 0.00002000  
Epoch: [1][588/589] Elapsed 10m 8s (remain 0m 0s) Loss: 0.0059(0.0326) Grad: 1668.7924  LR: 0.00001809  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 32s) Loss: 0.0137(0.0137) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0108(0.0169) 


Epoch 1 - avg_train_loss: 0.0326  avg_val_loss: 0.0169  time: 710s
Epoch 1 - Score: 0.7820
Epoch 1 - Save Best Score: 0.7820 Model


Epoch: [2][0/589] Elapsed 0m 1s (remain 12m 34s) Loss: 0.0151(0.0151) Grad: 29721.5879  LR: 0.00001809  
Epoch: [2][588/589] Elapsed 10m 8s (remain 0m 0s) Loss: 0.0028(0.0133) Grad: 10961.4463  LR: 0.00001310  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 33s) Loss: 0.0110(0.0110) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0047(0.0142) 


Epoch 2 - avg_train_loss: 0.0133  avg_val_loss: 0.0142  time: 709s
Epoch 2 - Score: 0.8310
Epoch 2 - Save Best Score: 0.8310 Model


Epoch: [3][0/589] Elapsed 0m 1s (remain 12m 15s) Loss: 0.0250(0.0250) Grad: 50607.3711  LR: 0.00001309  
Epoch: [3][588/589] Elapsed 10m 8s (remain 0m 0s) Loss: 0.0219(0.0112) Grad: 53968.6836  LR: 0.00000692  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 37s) Loss: 0.0095(0.0095) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0039(0.0136) 


Epoch 3 - avg_train_loss: 0.0112  avg_val_loss: 0.0136  time: 709s
Epoch 3 - Score: 0.8376
Epoch 3 - Save Best Score: 0.8376 Model


Epoch: [4][0/589] Elapsed 0m 1s (remain 12m 9s) Loss: 0.0192(0.0192) Grad: 28579.1602  LR: 0.00000691  
Epoch: [4][588/589] Elapsed 10m 7s (remain 0m 0s) Loss: 0.0107(0.0101) Grad: 13561.7139  LR: 0.00000192  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 36s) Loss: 0.0094(0.0094) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0036(0.0137) 


Epoch 4 - avg_train_loss: 0.0101  avg_val_loss: 0.0137  time: 709s
Epoch 4 - Score: 0.8414
Epoch 4 - Save Best Score: 0.8414 Model


Epoch: [5][0/589] Elapsed 0m 1s (remain 12m 22s) Loss: 0.0057(0.0057) Grad: 16349.2373  LR: 0.00000191  
Epoch: [5][588/589] Elapsed 10m 8s (remain 0m 0s) Loss: 0.0193(0.0093) Grad: 49969.1797  LR: 0.00000000  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 28s) Loss: 0.0094(0.0094) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0037(0.0136) 


Epoch 5 - avg_train_loss: 0.0093  avg_val_loss: 0.0136  time: 709s
Epoch 5 - Score: 0.8419
Epoch 5 - Save Best Score: 0.8419 Model
Score: 0.8419
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/589] Elapsed 0m 1s (remain 11m 58s) Loss: 0.9019(0.9019) Grad: inf  LR: 0.00002000  
Epoch: [1][588/589] Elapsed 10m 7s (remain 0m 0s) Loss: 0.0137(0.0362) Grad: 1706.6517  LR: 0.00001809  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 33s) Loss: 0.0056(0.0056) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0074(0.0167) 


Epoch 1 - avg_train_loss: 0.0362  avg_val_loss: 0.0167  time: 709s
Epoch 1 - Score: 0.7930
Epoch 1 - Save Best Score: 0.7930 Model


Epoch: [2][0/589] Elapsed 0m 1s (remain 12m 16s) Loss: 0.0072(0.0072) Grad: 13162.1338  LR: 0.00001808  
Epoch: [2][588/589] Elapsed 10m 7s (remain 0m 0s) Loss: 0.0038(0.0124) Grad: 9128.4785  LR: 0.00001309  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 42s) Loss: 0.0039(0.0039) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0062(0.0156) 


Epoch 2 - avg_train_loss: 0.0124  avg_val_loss: 0.0156  time: 709s
Epoch 2 - Score: 0.8192
Epoch 2 - Save Best Score: 0.8192 Model


Epoch: [3][0/589] Elapsed 0m 1s (remain 12m 26s) Loss: 0.0105(0.0105) Grad: 20516.4766  LR: 0.00001308  
Epoch: [3][588/589] Elapsed 10m 8s (remain 0m 0s) Loss: 0.0077(0.0106) Grad: 41392.8398  LR: 0.00000691  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 37s) Loss: 0.0033(0.0033) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0062(0.0156) 


Epoch 3 - avg_train_loss: 0.0106  avg_val_loss: 0.0156  time: 710s
Epoch 3 - Score: 0.8271
Epoch 3 - Save Best Score: 0.8271 Model


Epoch: [4][0/589] Elapsed 0m 1s (remain 12m 21s) Loss: 0.0045(0.0045) Grad: 7891.3491  LR: 0.00000690  
Epoch: [4][588/589] Elapsed 10m 8s (remain 0m 0s) Loss: 0.0094(0.0098) Grad: 24012.6172  LR: 0.00000191  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 36s) Loss: 0.0034(0.0034) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0061(0.0154) 


Epoch 4 - avg_train_loss: 0.0098  avg_val_loss: 0.0154  time: 709s
Epoch 4 - Score: 0.8298
Epoch 4 - Save Best Score: 0.8298 Model


Epoch: [5][0/589] Elapsed 0m 1s (remain 12m 34s) Loss: 0.0148(0.0148) Grad: 28938.8770  LR: 0.00000190  
Epoch: [5][588/589] Elapsed 10m 7s (remain 0m 0s) Loss: 0.0080(0.0094) Grad: 19768.6953  LR: 0.00000000  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 34s) Loss: 0.0033(0.0033) 
EVAL: [294/295] Elapsed 1m 36s (remain 0m 0s) Loss: 0.0063(0.0155) 


Epoch 5 - avg_train_loss: 0.0094  avg_val_loss: 0.0155  time: 709s
Epoch 5 - Score: 0.8312
Epoch 5 - Save Best Score: 0.8312 Model
Score: 0.8312
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/588] Elapsed 0m 1s (remain 11m 49s) Loss: 1.0523(1.0523) Grad: inf  LR: 0.00002000  
Epoch: [1][587/588] Elapsed 10m 6s (remain 0m 0s) Loss: 0.0104(0.0390) Grad: 1014.2875  LR: 0.00001810  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 54s) Loss: 0.0096(0.0096) 
EVAL: [294/295] Elapsed 1m 37s (remain 0m 0s) Loss: 0.0253(0.0156) 


Epoch 1 - avg_train_loss: 0.0390  avg_val_loss: 0.0156  time: 708s
Epoch 1 - Score: 0.8000
Epoch 1 - Save Best Score: 0.8000 Model


Epoch: [2][0/588] Elapsed 0m 1s (remain 12m 24s) Loss: 0.0107(0.0107) Grad: 16937.1328  LR: 0.00001809  
Epoch: [2][587/588] Elapsed 10m 6s (remain 0m 0s) Loss: 0.0037(0.0130) Grad: 18334.5977  LR: 0.00001311  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 38s) Loss: 0.0111(0.0111) 
EVAL: [294/295] Elapsed 1m 37s (remain 0m 0s) Loss: 0.0239(0.0142) 


Epoch 2 - avg_train_loss: 0.0130  avg_val_loss: 0.0142  time: 708s
Epoch 2 - Score: 0.8254
Epoch 2 - Save Best Score: 0.8254 Model


Epoch: [3][0/588] Elapsed 0m 1s (remain 12m 36s) Loss: 0.0048(0.0048) Grad: 8745.3047  LR: 0.00001310  
Epoch: [3][587/588] Elapsed 10m 7s (remain 0m 0s) Loss: 0.0032(0.0115) Grad: 15237.8955  LR: 0.00000693  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 37s) Loss: 0.0063(0.0063) 
EVAL: [294/295] Elapsed 1m 37s (remain 0m 0s) Loss: 0.0236(0.0135) 


Epoch 3 - avg_train_loss: 0.0115  avg_val_loss: 0.0135  time: 709s
Epoch 3 - Score: 0.8369
Epoch 3 - Save Best Score: 0.8369 Model


Epoch: [4][0/588] Elapsed 0m 1s (remain 12m 25s) Loss: 0.0109(0.0109) Grad: 29215.4004  LR: 0.00000692  
Epoch: [4][587/588] Elapsed 10m 7s (remain 0m 0s) Loss: 0.0091(0.0105) Grad: 46593.1250  LR: 0.00000193  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 38s) Loss: 0.0065(0.0065) 
EVAL: [294/295] Elapsed 1m 37s (remain 0m 0s) Loss: 0.0246(0.0135) 


Epoch 4 - avg_train_loss: 0.0105  avg_val_loss: 0.0135  time: 709s
Epoch 4 - Score: 0.8391
Epoch 4 - Save Best Score: 0.8391 Model


Epoch: [5][0/588] Elapsed 0m 1s (remain 12m 25s) Loss: 0.0041(0.0041) Grad: 22838.3789  LR: 0.00000192  
Epoch: [5][587/588] Elapsed 10m 7s (remain 0m 0s) Loss: 0.0142(0.0101) Grad: 36214.4766  LR: 0.00000000  
EVAL: [0/295] Elapsed 0m 0s (remain 2m 38s) Loss: 0.0063(0.0063) 
EVAL: [294/295] Elapsed 1m 37s (remain 0m 0s) Loss: 0.0249(0.0135) 


Epoch 5 - avg_train_loss: 0.0101  avg_val_loss: 0.0135  time: 709s
Epoch 5 - Score: 0.8398
Epoch 5 - Save Best Score: 0.8398 Model
Score: 0.8398
Score: 0.8376


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
[fold0] avg_train_loss,█▂▂▁▁
[fold0] avg_val_loss,█▂▁▁▁
[fold0] epoch,▁▃▅▆█
[fold0] loss,█▇▃▃▂▃▁▁▂▁▂▂▂▂▂▁▂▂▁▂▁▁▂▁▁▂▅▂▁▁▂▁▁▁▁▁▁▂▁▁
[fold0] lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,▁▇▇██
[fold1] avg_train_loss,█▂▁▁▁
[fold1] avg_val_loss,█▂▂▁▂
[fold1] epoch,▁▃▅▆█
[fold1] loss,█▆█▃▃▅▃▇▃▁▁▅▁▂▂▁▁▃▁▂▂▁█▁▁▁▃▁▂▂▃▂▅▂▂▁▄▁▄▁

0,1
[fold0] avg_train_loss,0.00927
[fold0] avg_val_loss,0.01364
[fold0] epoch,5.0
[fold0] loss,0.01935
[fold0] lr,0.0
[fold0] score,0.84186
[fold1] avg_train_loss,0.00937
[fold1] avg_val_loss,0.01553
[fold1] epoch,5.0
[fold1] loss,0.00803


In [19]:
# Saving results
oof_df.to_pickle(OUTPUT_DIR+'output_df.pkl')
oof_df.to_csv(OUTPUT_DIR+'output_df.csv', index=False)