<a href="https://colab.research.google.com/github/QSLV/Kaggle/blob/main/Patent/Patent_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install kaggle
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json



In [3]:
! pip install wandb
! pip install datasets
! pip install transformers==4.16.2
! pip install tokenizers==0.11.0
! pip install sentencepiece



In [4]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [5]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='PPPM'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=32
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [6]:
if CFG.wandb:
    
    import wandb

    try:
        # from kaggle_secrets import UserSecretsClient
        # user_secrets = UserSecretsClient()
        # secret_value_0 = user_secrets.get_secret("wandb_api")
        secret_value_0 = "fed47a14994d9e07c0966667c6820bf1fc990103"
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='Patent', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

[34m[1mwandb[0m: Currently logged in as: [33mqslv[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [7]:
INPUT_DIR = r'/content/drive/MyDrive/Patent_Kaggle/data'
OUTPUT_DIR = r'/content/drive/MyDrive/Patent_Kaggle/experiments/{}/'.format(CFG.model)
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [8]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# os.system('pip uninstall -y transformers')
# os.system('pip uninstall -y tokenizers')
# os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels transformers')
# os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.__version__: 1.10.0+cu111
tokenizers.__version__: 0.11.0
transformers.__version__: 4.16.2
env: TOKENIZERS_PARALLELISM=true


In [9]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = sp.stats.pearsonr(y_true, y_pred)[0]
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=2022)

In [10]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission = pd.read_csv(os.path.join(INPUT_DIR, 'sample_submission.csv'))
print(f"train.shape: {train.shape}")
print(f"test.shape: {test.shape}")
print(f"submission.shape: {submission.shape}")
display(train.head())
display(test.head())
display(submission.head())


train.shape: (36473, 5)
test.shape: (36, 4)
submission.shape: (36, 2)


Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23
2,36baf228038e314b,lower trunnion,lower locating,B60
3,1f37ead645e7f0c8,cap component,upper portion,D06
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04


Unnamed: 0,id,score
0,4112d61851461f60,0
1,09e418c93a776564,0
2,36baf228038e314b,0
3,1f37ead645e7f0c8,0
4,71a5b6ad068d531f,0


In [11]:
# ====================================================
# CPC Data
# ====================================================
def get_cpc_texts():
    contexts = []
    pattern = '[A-Z]\d+'
    for file_name in os.listdir(os.path.join(INPUT_DIR, 'cpc', 'CPCSchemeXML202105')):
        result = re.findall(pattern, file_name)
        if result:
            contexts.append(result)
    contexts = sorted(set(sum(contexts, [])))
    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(os.path.join(INPUT_DIR, 'cpc', 'CPCTitleList202202', f"cpc-section-{cpc}_20220201.txt")) as f:
            s = f.read()
        pattern = f'{cpc}\t\t.+'
        result = re.findall(pattern, s)
        cpc_result = result[0].lstrip(pattern)
        for context in [c for c in contexts if c[0] == cpc]:
            pattern = f'{context}\t\t.+'
            result = re.findall(pattern, s)
            results[context] = cpc_result + ". " + result[0].lstrip(pattern)
    return results


cpc_texts = get_cpc_texts()
torch.save(cpc_texts, OUTPUT_DIR+"cpc_texts.pth")
train['context_text'] = train['context'].map(cpc_texts)
test['context_text'] = test['context'].map(cpc_texts)
display(train.head())
display(test.head())

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...


Unnamed: 0,id,anchor,target,context,context_text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,PHYSICS. OPTICS
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...
2,36baf228038e314b,lower trunnion,lower locating,B60,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...
3,1f37ead645e7f0c8,cap component,upper portion,D06,TEXTILES; PAPER. TREATMENT OF TEXTILES OR THE ...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRICITY. ELECTRIC COMMUNICATION TECHNIQUE


In [12]:
train['text'] = train['context_text'] + ' ' + train['anchor'] + '[SEP]' + train['target']
test['text'] = test['context_text'] + ' ' + test['anchor'] + '[SEP]' + test['target']
display(train.head())
display(test.head())

Unnamed: 0,id,anchor,target,context,score,context_text,text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...


Unnamed: 0,id,anchor,target,context,context_text,text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,PHYSICS. OPTICS,PHYSICS. OPTICS opc drum[SEP]inorganic photoco...
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...
2,36baf228038e314b,lower trunnion,lower locating,B60,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...
3,1f37ead645e7f0c8,cap component,upper portion,D06,TEXTILES; PAPER. TREATMENT OF TEXTILES OR THE ...,TEXTILES; PAPER. TREATMENT OF TEXTILES OR THE ...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRICITY. ELECTRIC COMMUNICATION TECHNIQUE,ELECTRICITY. ELECTRIC COMMUNICATION TECHNIQUE ...


In [13]:
# ====================================================
# CV split
# ====================================================
train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train['score_map'])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    7295
1    7295
2    7295
3    7294
4    7294
dtype: int64

In [14]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

In [15]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# ====================================================
# Define max_len
# ====================================================
lengths_dict = {}

lengths = []
tk0 = tqdm(cpc_texts.values(), total=len(cpc_texts))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
lengths_dict['context_text'] = lengths

for text_col in ['anchor', 'target']:
    lengths = []
    tk0 = tqdm(train[text_col].fillna("").values, total=len(train))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        lengths.append(length)
    lengths_dict[text_col] = lengths
    
CFG.max_len = max(lengths_dict['anchor']) + max(lengths_dict['target'])\
                + max(lengths_dict['context_text']) + 4 # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/36473 [00:00<?, ?it/s]

  0%|          | 0/36473 [00:00<?, ?it/s]

max_len: 133


In [17]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df['score'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

In [18]:

# train_dataset = TrainDataset(CFG, train)
# inputs, label = train_dataset[0]
# print(inputs)
# print(label)


In [19]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # feature = torch.mean(last_hidden_states, 1)
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [20]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [21]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['score'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [22]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['score'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/911] Elapsed 0m 0s (remain 14m 4s) Loss: 0.7295(0.7295) Grad: inf  LR: 0.00002000  
Epoch: [1][100/911] Elapsed 0m 58s (remain 7m 52s) Loss: 0.5726(0.6428) Grad: 46791.6211  LR: 0.00001996  
Epoch: [1][200/911] Elapsed 1m 57s (remain 6m 53s) Loss: 0.5814(0.6182) Grad: 34219.6094  LR: 0.00001985  
Epoch: [1][300/911] Elapsed 2m 55s (remain 5m 54s) Loss: 0.6292(0.6046) Grad: 74793.1250  LR: 0.00001967  
Epoch: [1][400/911] Elapsed 3m 53s (remain 4m 56s) Loss: 0.5713(0.5961) Grad: 78818.9062  LR: 0.00001941  
Epoch: [1][500/911] Elapsed 4m 50s (remain 3m 58s) Loss: 0.6074(0.5880) Grad: 40493.8867  LR: 0.00001908  
Epoch: [1][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.5951(0.5827) Grad: 57985.1562  LR: 0.00001869  
Epoch: [1][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.5587(0.5788) Grad: 31796.2129  LR: 0.00001823  
Epoch: [1][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.4587(0.5754) Grad: 17075.6641  LR: 0.00001771  
Epoch: [1][900/911] Elapsed 8m 42s (remain 0m 5s)

Epoch 1 - avg_train_loss: 0.5724  avg_val_loss: 0.5483  time: 570s
Epoch 1 - Score: 0.8294
Epoch 1 - Save Best Score: 0.8294 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4599(0.5483) 
Epoch: [2][0/911] Elapsed 0m 0s (remain 12m 55s) Loss: 0.4664(0.4664) Grad: 47156.6992  LR: 0.00001707  
Epoch: [2][100/911] Elapsed 0m 58s (remain 7m 51s) Loss: 0.4342(0.5348) Grad: 48724.8438  LR: 0.00001643  
Epoch: [2][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.5003(0.5315) Grad: 101625.0000  LR: 0.00001575  
Epoch: [2][300/911] Elapsed 2m 54s (remain 5m 54s) Loss: 0.5432(0.5337) Grad: 68281.1484  LR: 0.00001503  
Epoch: [2][400/911] Elapsed 3m 52s (remain 4m 55s) Loss: 0.5158(0.5332) Grad: 60840.9766  LR: 0.00001426  
Epoch: [2][500/911] Elapsed 4m 50s (remain 3m 57s) Loss: 0.6061(0.5316) Grad: 135618.2031  LR: 0.00001347  
Epoch: [2][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.5339(0.5329) Grad: 103512.6875  LR: 0.00001265  
Epoch: [2][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.5599(0.5334) Grad: 45088.8867  LR: 0.00001181  
Epoch: [2][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.5056(0.5341) Grad:

Epoch 2 - avg_train_loss: 0.5339  avg_val_loss: 0.5407  time: 570s
Epoch 2 - Score: 0.8444
Epoch 2 - Save Best Score: 0.8444 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4689(0.5407) 
Epoch: [3][0/911] Elapsed 0m 0s (remain 12m 36s) Loss: 0.4541(0.4541) Grad: 171682.2344  LR: 0.00001000  
Epoch: [3][100/911] Elapsed 0m 58s (remain 7m 52s) Loss: 0.6295(0.5274) Grad: 133280.8594  LR: 0.00000914  
Epoch: [3][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.5289(0.5269) Grad: 46499.8633  LR: 0.00000829  
Epoch: [3][300/911] Elapsed 2m 54s (remain 5m 54s) Loss: 0.5397(0.5245) Grad: 165400.8281  LR: 0.00000745  
Epoch: [3][400/911] Elapsed 3m 52s (remain 4m 55s) Loss: 0.6103(0.5260) Grad: 42804.9609  LR: 0.00000663  
Epoch: [3][500/911] Elapsed 4m 50s (remain 3m 57s) Loss: 0.5571(0.5250) Grad: 101269.2344  LR: 0.00000583  
Epoch: [3][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.5171(0.5252) Grad: 53226.9414  LR: 0.00000506  
Epoch: [3][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.5716(0.5245) Grad: 43349.0273  LR: 0.00000433  
Epoch: [3][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.5071(0.5248) Grad

Epoch 3 - avg_train_loss: 0.5250  avg_val_loss: 0.5440  time: 570s
Epoch 3 - Score: 0.8468
Epoch 3 - Save Best Score: 0.8468 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4527(0.5440) 
Epoch: [4][0/911] Elapsed 0m 0s (remain 12m 32s) Loss: 0.5553(0.5553) Grad: 71247.1953  LR: 0.00000294  
Epoch: [4][100/911] Elapsed 0m 58s (remain 7m 51s) Loss: 0.6007(0.5178) Grad: 40306.1992  LR: 0.00000235  
Epoch: [4][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.4603(0.5184) Grad: 45898.0000  LR: 0.00000183  
Epoch: [4][300/911] Elapsed 2m 54s (remain 5m 53s) Loss: 0.5064(0.5197) Grad: 56446.1016  LR: 0.00000136  
Epoch: [4][400/911] Elapsed 3m 52s (remain 4m 55s) Loss: 0.5206(0.5187) Grad: 91779.7344  LR: 0.00000096  
Epoch: [4][500/911] Elapsed 4m 50s (remain 3m 57s) Loss: 0.5462(0.5187) Grad: 71601.0859  LR: 0.00000063  
Epoch: [4][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.4949(0.5195) Grad: 35177.4609  LR: 0.00000036  
Epoch: [4][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.5968(0.5202) Grad: 85459.3359  LR: 0.00000017  
Epoch: [4][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.4315(0.5192) Grad: 57

Epoch 4 - avg_train_loss: 0.5189  avg_val_loss: 0.5399  time: 570s
Epoch 4 - Score: 0.8482
Epoch 4 - Save Best Score: 0.8482 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4518(0.5399) 


Score: 0.8482
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/911] Elapsed 0m 0s (remain 12m 33s) Loss: 0.7289(0.7289) Grad: inf  LR: 0.00002000  
Epoch: [1][100/911] Elapsed 0m 58s (remain 7m 52s) Loss: 0.6524(0.6524) Grad: 38033.7617  LR: 0.00001996  
Epoch: [1][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.5725(0.6225) Grad: 27722.0176  LR: 0.00001985  
Epoch: [1][300/911] Elapsed 2m 54s (remain 5m 53s) Loss: 0.6295(0.6089) Grad: 57209.5859  LR: 0.00001967  
Epoch: [1][400/911] Elapsed 3m 52s (remain 4m 55s) Loss: 0.5908(0.5961) Grad: 68724.3906  LR: 0.00001941  
Epoch: [1][500/911] Elapsed 4m 50s (remain 3m 57s) Loss: 0.6285(0.5896) Grad: 114765.8906  LR: 0.00001908  
Epoch: [1][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.5109(0.5838) Grad: 37179.7695  LR: 0.00001869  
Epoch: [1][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.5022(0.5791) Grad: 34600.7383  LR: 0.00001823  
Epoch: [1][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.5298(0.5756) Grad: 48636.4258  LR: 0.00001771  
Epoch: [1][900/911] Elapsed 8m 42s (remain 0m 5

Epoch 1 - avg_train_loss: 0.5722  avg_val_loss: 0.5420  time: 570s
Epoch 1 - Score: 0.8222
Epoch 1 - Save Best Score: 0.8222 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5387(0.5420) 
Epoch: [2][0/911] Elapsed 0m 0s (remain 13m 12s) Loss: 0.6057(0.6057) Grad: 75933.6094  LR: 0.00001707  
Epoch: [2][100/911] Elapsed 0m 58s (remain 7m 52s) Loss: 0.5421(0.5356) Grad: 92455.2500  LR: 0.00001643  
Epoch: [2][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.5528(0.5373) Grad: 64847.4922  LR: 0.00001575  
Epoch: [2][300/911] Elapsed 2m 54s (remain 5m 54s) Loss: 0.4792(0.5368) Grad: 60065.2539  LR: 0.00001503  
Epoch: [2][400/911] Elapsed 3m 52s (remain 4m 55s) Loss: 0.4646(0.5347) Grad: 106307.5938  LR: 0.00001426  
Epoch: [2][500/911] Elapsed 4m 50s (remain 3m 57s) Loss: 0.5851(0.5355) Grad: 104572.2188  LR: 0.00001347  
Epoch: [2][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.6442(0.5349) Grad: 75353.6172  LR: 0.00001265  
Epoch: [2][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.3932(0.5334) Grad: 54990.3281  LR: 0.00001181  
Epoch: [2][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.5647(0.5336) Grad: 

Epoch 2 - avg_train_loss: 0.5336  avg_val_loss: 0.5409  time: 570s
Epoch 2 - Score: 0.8360
Epoch 2 - Save Best Score: 0.8360 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5434(0.5409) 
Epoch: [3][0/911] Elapsed 0m 0s (remain 12m 26s) Loss: 0.5524(0.5524) Grad: 49343.8125  LR: 0.00001000  
Epoch: [3][100/911] Elapsed 0m 58s (remain 7m 51s) Loss: 0.5612(0.5265) Grad: 81251.1953  LR: 0.00000914  
Epoch: [3][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.5528(0.5209) Grad: 144824.9531  LR: 0.00000829  
Epoch: [3][300/911] Elapsed 2m 54s (remain 5m 53s) Loss: 0.5671(0.5217) Grad: 73728.8516  LR: 0.00000745  
Epoch: [3][400/911] Elapsed 3m 52s (remain 4m 55s) Loss: 0.5255(0.5229) Grad: 81439.0938  LR: 0.00000663  
Epoch: [3][500/911] Elapsed 4m 50s (remain 3m 57s) Loss: 0.5212(0.5235) Grad: 42760.1680  LR: 0.00000583  
Epoch: [3][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.4643(0.5237) Grad: 94947.5234  LR: 0.00000506  
Epoch: [3][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.4762(0.5237) Grad: 62906.6133  LR: 0.00000433  
Epoch: [3][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.4892(0.5238) Grad: 1

Epoch 3 - avg_train_loss: 0.5239  avg_val_loss: 0.5407  time: 570s
Epoch 3 - Score: 0.8390
Epoch 3 - Save Best Score: 0.8390 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5479(0.5407) 
Epoch: [4][0/911] Elapsed 0m 0s (remain 13m 15s) Loss: 0.4826(0.4826) Grad: 44215.7852  LR: 0.00000294  
Epoch: [4][100/911] Elapsed 0m 58s (remain 7m 52s) Loss: 0.4879(0.5163) Grad: 64445.6836  LR: 0.00000235  
Epoch: [4][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.4640(0.5194) Grad: 99989.1875  LR: 0.00000183  
Epoch: [4][300/911] Elapsed 2m 54s (remain 5m 54s) Loss: 0.4546(0.5193) Grad: 85062.0938  LR: 0.00000136  
Epoch: [4][400/911] Elapsed 3m 52s (remain 4m 55s) Loss: 0.5082(0.5211) Grad: 54445.0391  LR: 0.00000096  
Epoch: [4][500/911] Elapsed 4m 50s (remain 3m 57s) Loss: 0.5953(0.5203) Grad: 115141.5469  LR: 0.00000063  
Epoch: [4][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.4644(0.5205) Grad: 85133.1953  LR: 0.00000036  
Epoch: [4][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.4997(0.5199) Grad: 63587.5742  LR: 0.00000017  
Epoch: [4][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.5266(0.5198) Grad: 6

Epoch 4 - avg_train_loss: 0.5187  avg_val_loss: 0.5445  time: 570s
Epoch 4 - Score: 0.8397
Epoch 4 - Save Best Score: 0.8397 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5597(0.5445) 


Score: 0.8397
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/911] Elapsed 0m 0s (remain 12m 32s) Loss: 0.6693(0.6693) Grad: 104531.6016  LR: 0.00002000  
Epoch: [1][100/911] Elapsed 0m 58s (remain 7m 51s) Loss: 0.5933(0.6437) Grad: 31142.6250  LR: 0.00001996  
Epoch: [1][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.5504(0.6215) Grad: 81664.5000  LR: 0.00001985  
Epoch: [1][300/911] Elapsed 2m 54s (remain 5m 54s) Loss: 0.6005(0.6073) Grad: 49877.6289  LR: 0.00001967  
Epoch: [1][400/911] Elapsed 3m 52s (remain 4m 55s) Loss: 0.6080(0.5979) Grad: 61472.4609  LR: 0.00001941  
Epoch: [1][500/911] Elapsed 4m 50s (remain 3m 57s) Loss: 0.5561(0.5908) Grad: 44143.8125  LR: 0.00001908  
Epoch: [1][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.6024(0.5851) Grad: 49759.7773  LR: 0.00001869  
Epoch: [1][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.5392(0.5809) Grad: 30883.4492  LR: 0.00001823  
Epoch: [1][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.4448(0.5769) Grad: 34635.4102  LR: 0.00001771  
Epoch: [1][900/911] Elapsed 8m 42s (rema

Epoch 1 - avg_train_loss: 0.5740  avg_val_loss: 0.5461  time: 570s
Epoch 1 - Score: 0.8129
Epoch 1 - Save Best Score: 0.8129 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5200(0.5461) 
Epoch: [2][0/911] Elapsed 0m 0s (remain 13m 2s) Loss: 0.5004(0.5004) Grad: 86473.9531  LR: 0.00001707  
Epoch: [2][100/911] Elapsed 0m 58s (remain 7m 52s) Loss: 0.5653(0.5294) Grad: 120387.0078  LR: 0.00001643  
Epoch: [2][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.5316(0.5316) Grad: 51189.0469  LR: 0.00001575  
Epoch: [2][300/911] Elapsed 2m 54s (remain 5m 54s) Loss: 0.5880(0.5327) Grad: 149617.4219  LR: 0.00001503  
Epoch: [2][400/911] Elapsed 3m 52s (remain 4m 56s) Loss: 0.5458(0.5334) Grad: 54893.0039  LR: 0.00001426  
Epoch: [2][500/911] Elapsed 4m 50s (remain 3m 58s) Loss: 0.6081(0.5342) Grad: 101469.0312  LR: 0.00001347  
Epoch: [2][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.4918(0.5345) Grad: 66857.1328  LR: 0.00001265  
Epoch: [2][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.4988(0.5346) Grad: 54877.0000  LR: 0.00001181  
Epoch: [2][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.5011(0.5343) Grad: 

Epoch 2 - avg_train_loss: 0.5338  avg_val_loss: 0.5368  time: 571s
Epoch 2 - Score: 0.8361
Epoch 2 - Save Best Score: 0.8361 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5162(0.5368) 
Epoch: [3][0/911] Elapsed 0m 0s (remain 12m 54s) Loss: 0.5557(0.5557) Grad: 64664.8672  LR: 0.00001000  
Epoch: [3][100/911] Elapsed 0m 58s (remain 7m 52s) Loss: 0.3914(0.5253) Grad: 40836.3594  LR: 0.00000914  
Epoch: [3][200/911] Elapsed 1m 56s (remain 6m 52s) Loss: 0.4824(0.5261) Grad: 208641.0938  LR: 0.00000829  
Epoch: [3][300/911] Elapsed 2m 54s (remain 5m 54s) Loss: 0.5544(0.5261) Grad: 250018.3125  LR: 0.00000745  
Epoch: [3][400/911] Elapsed 3m 52s (remain 4m 56s) Loss: 0.5121(0.5256) Grad: 51512.7500  LR: 0.00000663  
Epoch: [3][500/911] Elapsed 4m 50s (remain 3m 58s) Loss: 0.5296(0.5239) Grad: 80255.9219  LR: 0.00000583  
Epoch: [3][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.5739(0.5249) Grad: 30532.8887  LR: 0.00000506  
Epoch: [3][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.6328(0.5259) Grad: 70951.8906  LR: 0.00000433  
Epoch: [3][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.4901(0.5246) Grad: 

Epoch 3 - avg_train_loss: 0.5252  avg_val_loss: 0.5403  time: 571s
Epoch 3 - Score: 0.8374
Epoch 3 - Save Best Score: 0.8374 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5208(0.5403) 
Epoch: [4][0/911] Elapsed 0m 0s (remain 13m 52s) Loss: 0.4758(0.4758) Grad: 65662.4141  LR: 0.00000294  
Epoch: [4][100/911] Elapsed 0m 58s (remain 7m 53s) Loss: 0.5304(0.5180) Grad: 92911.5547  LR: 0.00000235  
Epoch: [4][200/911] Elapsed 1m 56s (remain 6m 53s) Loss: 0.4657(0.5175) Grad: 31409.8457  LR: 0.00000183  
Epoch: [4][300/911] Elapsed 2m 54s (remain 5m 54s) Loss: 0.5726(0.5209) Grad: 178583.0469  LR: 0.00000136  
Epoch: [4][400/911] Elapsed 3m 52s (remain 4m 56s) Loss: 0.5399(0.5204) Grad: 215400.4531  LR: 0.00000096  
Epoch: [4][500/911] Elapsed 4m 50s (remain 3m 58s) Loss: 0.5276(0.5203) Grad: 59819.2383  LR: 0.00000063  
Epoch: [4][600/911] Elapsed 5m 48s (remain 2m 59s) Loss: 0.5703(0.5199) Grad: 73394.0391  LR: 0.00000036  
Epoch: [4][700/911] Elapsed 6m 46s (remain 2m 1s) Loss: 0.4748(0.5198) Grad: 57026.3242  LR: 0.00000017  
Epoch: [4][800/911] Elapsed 7m 44s (remain 1m 3s) Loss: 0.5567(0.5198) Grad: 

Epoch 4 - avg_train_loss: 0.5193  avg_val_loss: 0.5435  time: 571s
Epoch 4 - Score: 0.8369


EVAL: [227/228] Elapsed 0m 42s (remain 0m 0s) Loss: 0.5203(0.5435) 


Score: 0.8374
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/911] Elapsed 0m 0s (remain 13m 54s) Loss: 0.6877(0.6877) Grad: 64419.0469  LR: 0.00002000  
Epoch: [1][100/911] Elapsed 0m 59s (remain 7m 53s) Loss: 0.6341(0.6404) Grad: 74523.9141  LR: 0.00001996  
Epoch: [1][200/911] Elapsed 1m 57s (remain 6m 53s) Loss: 0.6027(0.6176) Grad: 121928.4766  LR: 0.00001985  
Epoch: [1][300/911] Elapsed 2m 55s (remain 5m 54s) Loss: 0.5498(0.6060) Grad: 85840.1016  LR: 0.00001967  
Epoch: [1][400/911] Elapsed 3m 53s (remain 4m 56s) Loss: 0.5591(0.5960) Grad: 100330.9531  LR: 0.00001941  
Epoch: [1][500/911] Elapsed 4m 51s (remain 3m 58s) Loss: 0.5558(0.5914) Grad: 67336.5938  LR: 0.00001908  
Epoch: [1][600/911] Elapsed 5m 49s (remain 3m 0s) Loss: 0.5046(0.5859) Grad: 65928.7031  LR: 0.00001869  
Epoch: [1][700/911] Elapsed 6m 47s (remain 2m 1s) Loss: 0.5126(0.5828) Grad: 94320.6484  LR: 0.00001823  
Epoch: [1][800/911] Elapsed 7m 45s (remain 1m 3s) Loss: 0.5664(0.5800) Grad: 57001.6719  LR: 0.00001771  
Epoch: [1][900/911] Elapsed 8m 43s (rema

Epoch 1 - avg_train_loss: 0.5763  avg_val_loss: 0.5527  time: 571s
Epoch 1 - Score: 0.8101
Epoch 1 - Save Best Score: 0.8101 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4688(0.5527) 
Epoch: [2][0/911] Elapsed 0m 0s (remain 13m 34s) Loss: 0.5412(0.5412) Grad: 118375.7734  LR: 0.00001707  
Epoch: [2][100/911] Elapsed 0m 59s (remain 7m 53s) Loss: 0.5430(0.5417) Grad: 116229.5703  LR: 0.00001643  
Epoch: [2][200/911] Elapsed 1m 57s (remain 6m 53s) Loss: 0.5284(0.5411) Grad: 123541.5625  LR: 0.00001575  
Epoch: [2][300/911] Elapsed 2m 55s (remain 5m 55s) Loss: 0.5671(0.5405) Grad: 86945.9922  LR: 0.00001503  
Epoch: [2][400/911] Elapsed 3m 53s (remain 4m 56s) Loss: 0.4972(0.5384) Grad: 123600.1406  LR: 0.00001426  
Epoch: [2][500/911] Elapsed 4m 51s (remain 3m 58s) Loss: 0.5679(0.5410) Grad: 49585.5938  LR: 0.00001347  
Epoch: [2][600/911] Elapsed 5m 49s (remain 3m 0s) Loss: 0.4745(0.5395) Grad: 172084.7031  LR: 0.00001265  
Epoch: [2][700/911] Elapsed 6m 47s (remain 2m 2s) Loss: 0.5785(0.5397) Grad: 111297.6406  LR: 0.00001181  
Epoch: [2][800/911] Elapsed 7m 45s (remain 1m 3s) Loss: 0.5429(0.5410) Gra

Epoch 2 - avg_train_loss: 0.5419  avg_val_loss: 0.5502  time: 571s
Epoch 2 - Score: 0.8238
Epoch 2 - Save Best Score: 0.8238 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4636(0.5502) 
Epoch: [3][0/911] Elapsed 0m 0s (remain 12m 51s) Loss: 0.5350(0.5350) Grad: 66663.0312  LR: 0.00001000  
Epoch: [3][100/911] Elapsed 0m 59s (remain 7m 53s) Loss: 0.5806(0.5338) Grad: 149403.2031  LR: 0.00000914  
Epoch: [3][200/911] Elapsed 1m 57s (remain 6m 53s) Loss: 0.4734(0.5289) Grad: 71732.5625  LR: 0.00000829  
Epoch: [3][300/911] Elapsed 2m 55s (remain 5m 54s) Loss: 0.4696(0.5269) Grad: 110780.6562  LR: 0.00000745  
Epoch: [3][400/911] Elapsed 3m 53s (remain 4m 56s) Loss: 0.5870(0.5266) Grad: 50274.2266  LR: 0.00000663  
Epoch: [3][500/911] Elapsed 4m 51s (remain 3m 58s) Loss: 0.5546(0.5257) Grad: 120000.3672  LR: 0.00000583  
Epoch: [3][600/911] Elapsed 5m 49s (remain 3m 0s) Loss: 0.6152(0.5261) Grad: 92971.2344  LR: 0.00000506  
Epoch: [3][700/911] Elapsed 6m 47s (remain 2m 1s) Loss: 0.5260(0.5263) Grad: 52287.1562  LR: 0.00000433  
Epoch: [3][800/911] Elapsed 7m 45s (remain 1m 3s) Loss: 0.5938(0.5260) Grad: 

Epoch 3 - avg_train_loss: 0.5262  avg_val_loss: 0.5485  time: 571s
Epoch 3 - Score: 0.8291
Epoch 3 - Save Best Score: 0.8291 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4557(0.5485) 
Epoch: [4][0/911] Elapsed 0m 0s (remain 12m 45s) Loss: 0.5652(0.5652) Grad: 60150.9336  LR: 0.00000294  
Epoch: [4][100/911] Elapsed 0m 58s (remain 7m 52s) Loss: 0.4321(0.5211) Grad: 87590.5156  LR: 0.00000235  
Epoch: [4][200/911] Elapsed 1m 57s (remain 6m 53s) Loss: 0.5658(0.5227) Grad: 62780.6133  LR: 0.00000183  
Epoch: [4][300/911] Elapsed 2m 55s (remain 5m 54s) Loss: 0.4648(0.5217) Grad: 86893.6953  LR: 0.00000136  
Epoch: [4][400/911] Elapsed 3m 53s (remain 4m 56s) Loss: 0.4332(0.5199) Grad: 69872.9141  LR: 0.00000096  
Epoch: [4][500/911] Elapsed 4m 51s (remain 3m 58s) Loss: 0.5522(0.5189) Grad: 190227.9219  LR: 0.00000063  
Epoch: [4][600/911] Elapsed 5m 49s (remain 3m 0s) Loss: 0.5084(0.5205) Grad: 88345.4453  LR: 0.00000036  
Epoch: [4][700/911] Elapsed 6m 47s (remain 2m 1s) Loss: 0.5124(0.5195) Grad: 24115.9160  LR: 0.00000017  
Epoch: [4][800/911] Elapsed 7m 45s (remain 1m 3s) Loss: 0.5647(0.5200) Grad: 60

Epoch 4 - avg_train_loss: 0.5208  avg_val_loss: 0.5465  time: 571s
Epoch 4 - Score: 0.8322
Epoch 4 - Save Best Score: 0.8322 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4591(0.5465) 


Score: 0.8322
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/911] Elapsed 0m 0s (remain 12m 35s) Loss: 0.8582(0.8582) Grad: inf  LR: 0.00002000  
Epoch: [1][100/911] Elapsed 0m 59s (remain 7m 53s) Loss: 0.6214(0.6547) Grad: 31853.8516  LR: 0.00001996  
Epoch: [1][200/911] Elapsed 1m 57s (remain 6m 53s) Loss: 0.6104(0.6292) Grad: 33875.5078  LR: 0.00001985  
Epoch: [1][300/911] Elapsed 2m 55s (remain 5m 54s) Loss: 0.5850(0.6119) Grad: 31045.1465  LR: 0.00001967  
Epoch: [1][400/911] Elapsed 3m 53s (remain 4m 56s) Loss: 0.5961(0.5974) Grad: 41362.2227  LR: 0.00001941  
Epoch: [1][500/911] Elapsed 4m 51s (remain 3m 58s) Loss: 0.5484(0.5902) Grad: 28434.7656  LR: 0.00001908  
Epoch: [1][600/911] Elapsed 5m 49s (remain 3m 0s) Loss: 0.5600(0.5847) Grad: 50223.1445  LR: 0.00001869  
Epoch: [1][700/911] Elapsed 6m 47s (remain 2m 1s) Loss: 0.5697(0.5794) Grad: 58321.8477  LR: 0.00001823  
Epoch: [1][800/911] Elapsed 7m 45s (remain 1m 3s) Loss: 0.5754(0.5770) Grad: 163611.7344  LR: 0.00001771  
Epoch: [1][900/911] Elapsed 8m 43s (remain 0m 5s

Epoch 1 - avg_train_loss: 0.5741  avg_val_loss: 0.5426  time: 571s
Epoch 1 - Score: 0.8266
Epoch 1 - Save Best Score: 0.8266 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5211(0.5426) 
Epoch: [2][0/911] Elapsed 0m 0s (remain 13m 25s) Loss: 0.6068(0.6068) Grad: 88796.1484  LR: 0.00001707  
Epoch: [2][100/911] Elapsed 0m 59s (remain 7m 53s) Loss: 0.5971(0.5420) Grad: 118448.1484  LR: 0.00001643  
Epoch: [2][200/911] Elapsed 1m 57s (remain 6m 53s) Loss: 0.5966(0.5397) Grad: 97354.5625  LR: 0.00001575  
Epoch: [2][300/911] Elapsed 2m 55s (remain 5m 54s) Loss: 0.4499(0.5363) Grad: 56371.1641  LR: 0.00001503  
Epoch: [2][400/911] Elapsed 3m 53s (remain 4m 56s) Loss: 0.4917(0.5360) Grad: 80472.7188  LR: 0.00001426  
Epoch: [2][500/911] Elapsed 4m 51s (remain 3m 58s) Loss: 0.5853(0.5344) Grad: 501957.1875  LR: 0.00001347  
Epoch: [2][600/911] Elapsed 5m 49s (remain 3m 0s) Loss: 0.5949(0.5350) Grad: 86027.5625  LR: 0.00001265  
Epoch: [2][700/911] Elapsed 6m 47s (remain 2m 1s) Loss: 0.4667(0.5345) Grad: 85434.0469  LR: 0.00001181  
Epoch: [2][800/911] Elapsed 7m 45s (remain 1m 3s) Loss: 0.4868(0.5340) Grad: 8

Epoch 2 - avg_train_loss: 0.5341  avg_val_loss: 0.5417  time: 571s
Epoch 2 - Score: 0.8361
Epoch 2 - Save Best Score: 0.8361 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.4942(0.5417) 
Epoch: [3][0/911] Elapsed 0m 0s (remain 13m 8s) Loss: 0.5391(0.5391) Grad: 97647.7969  LR: 0.00001000  
Epoch: [3][100/911] Elapsed 0m 59s (remain 7m 53s) Loss: 0.5922(0.5282) Grad: 106694.4531  LR: 0.00000914  
Epoch: [3][200/911] Elapsed 1m 57s (remain 6m 53s) Loss: 0.5420(0.5251) Grad: 206032.7031  LR: 0.00000829  
Epoch: [3][300/911] Elapsed 2m 55s (remain 5m 54s) Loss: 0.5299(0.5258) Grad: 86511.8281  LR: 0.00000745  
Epoch: [3][400/911] Elapsed 3m 53s (remain 4m 56s) Loss: 0.5306(0.5261) Grad: 140107.3750  LR: 0.00000663  
Epoch: [3][500/911] Elapsed 4m 51s (remain 3m 58s) Loss: 0.4532(0.5257) Grad: 74669.9219  LR: 0.00000583  
Epoch: [3][600/911] Elapsed 5m 49s (remain 3m 0s) Loss: 0.5071(0.5245) Grad: 74462.4219  LR: 0.00000506  
Epoch: [3][700/911] Elapsed 6m 47s (remain 2m 1s) Loss: 0.5407(0.5251) Grad: 55522.0078  LR: 0.00000433  
Epoch: [3][800/911] Elapsed 7m 45s (remain 1m 3s) Loss: 0.5074(0.5246) Grad: 6

Epoch 3 - avg_train_loss: 0.5254  avg_val_loss: 0.5390  time: 571s
Epoch 3 - Score: 0.8431
Epoch 3 - Save Best Score: 0.8431 Model


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5011(0.5390) 
Epoch: [4][0/911] Elapsed 0m 0s (remain 12m 44s) Loss: 0.5632(0.5632) Grad: 39382.4727  LR: 0.00000294  
Epoch: [4][100/911] Elapsed 0m 58s (remain 7m 52s) Loss: 0.5253(0.5181) Grad: 112935.1797  LR: 0.00000235  
Epoch: [4][200/911] Elapsed 1m 56s (remain 6m 53s) Loss: 0.5830(0.5207) Grad: 138610.9688  LR: 0.00000183  
Epoch: [4][300/911] Elapsed 2m 55s (remain 5m 54s) Loss: 0.5091(0.5210) Grad: 107867.3828  LR: 0.00000136  
Epoch: [4][400/911] Elapsed 3m 53s (remain 4m 56s) Loss: 0.4424(0.5208) Grad: 31656.5488  LR: 0.00000096  
Epoch: [4][500/911] Elapsed 4m 51s (remain 3m 58s) Loss: 0.5447(0.5205) Grad: 80022.5000  LR: 0.00000063  
Epoch: [4][600/911] Elapsed 5m 49s (remain 3m 0s) Loss: 0.5428(0.5209) Grad: 68245.1328  LR: 0.00000036  
Epoch: [4][700/911] Elapsed 6m 47s (remain 2m 1s) Loss: 0.4911(0.5202) Grad: 119936.0234  LR: 0.00000017  
Epoch: [4][800/911] Elapsed 7m 45s (remain 1m 3s) Loss: 0.4703(0.5196) Grad:

Epoch 4 - avg_train_loss: 0.5200  avg_val_loss: 0.5408  time: 571s
Epoch 4 - Score: 0.8418


EVAL: [227/228] Elapsed 0m 41s (remain 0m 0s) Loss: 0.5025(0.5408) 


Score: 0.8431
Score: 0.8396


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] avg_train_loss,█▃▂▁
[fold0] avg_val_loss,█▂▄▁
[fold0] epoch,▁▃▆█
[fold0] loss,▇██▇▆▂▆▂▃▂▆▄▅▆▃▂▆▃▃▅▁▃▄▄▅▄▅▅▃▆▂▄▃▂▁▄▆▃▃▆
[fold0] lr,███████▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,▁▇▇█
[fold1] avg_train_loss,█▃▂▁
[fold1] avg_val_loss,▃▁▁█
[fold1] epoch,▁▃▆█
[fold1] loss,██▆▆▇█▄▅▆▆▁▆▄▃▁▆▄▁▆▅▅▃▂▅▇▄▄▁▂▆▂▃▅▁▅▁▁▁▂▅

0,1
[fold0] avg_train_loss,0.51891
[fold0] avg_val_loss,0.53988
[fold0] epoch,4.0
[fold0] loss,0.55838
[fold0] lr,0.0
[fold0] score,0.84816
[fold1] avg_train_loss,0.51874
[fold1] avg_val_loss,0.54454
[fold1] epoch,4.0
[fold1] loss,0.51881
