1. データの読み込み

In [2]:
import math
import random
import time
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import transformers as T
from pathlib import Path
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [3]:
warnings.filterwarnings("ignore")

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [5]:
def seed_torch(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, the current CUDA device
    and all CUDA devices), sets the ``PYTHONHASHSEED`` environment variable and
    switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True


# Global seed; also used as the random state of the cross-validation split.
seed = 471
seed_torch(seed)

In [23]:
# Input data location and the directory that receives the log, checkpoints and CSVs.
DATA_DIR = './dataset/data1'
OUTPUT_DIR = './result/result3/'
# Create the output directory (including missing parents) unless it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [7]:
def init_logger(log_file=OUTPUT_DIR + "/train.log"):
    """Create the notebook logger that writes to the console and to a file.

    The logger is looked up by ``__name__``, so repeated calls return the same
    object.  Handlers left over from an earlier call are detached and closed
    first; without that, re-running this cell would stack additional handlers
    and every message would be emitted once per earlier call.

    Args:
        log_file: Path of the log file.  The default is computed when the
            function is defined, from the value ``OUTPUT_DIR`` has at that time.

    Returns:
        The configured ``logging.Logger`` at level ``INFO``.
    """
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Build the new handlers before touching the old ones, so a failure to open
    # the log file leaves the previous configuration intact.
    new_handlers = [StreamHandler(), FileHandler(filename=log_file)]
    for handler in new_handlers:
        handler.setFormatter(Formatter("%(message)s"))

    # Iterate over a copy: removeHandler mutates logger.handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    for handler in new_handlers:
        logger.addHandler(handler)
    return logger

LOGGER = init_logger()

In [8]:
# Load the labelled training set, the test set and the sample submission.
# The first CSV column becomes the index of train/test; the sample submission
# is read without a header row and gets its column names assigned here.
train = pd.read_csv(f"{DATA_DIR}/train.csv", index_col=0)
test = pd.read_csv(f"{DATA_DIR}/test.csv", index_col=0)
sub = pd.read_csv(f"{DATA_DIR}/sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]

In [9]:
# Decision threshold: model outputs below this value are mapped to 0, all others
# to 1.  It equals the share of positive (judgement == 1) rows in `train`.
border = int((train["judgement"] == 1).sum()) / len(train)
print(border)

0.023282372444280715


In [10]:
def get_train_data(train):
    """Attach a stratified 5-fold assignment to the training frame.

    Adds (or overwrites) a ``fold`` column of dtype ``uint8`` in which every row
    carries the number of the fold where it serves as validation data.  The split
    is stratified on ``judgement`` and uses the module-level ``seed``.

    Args:
        train: DataFrame with a ``judgement`` column.  It is modified in place.

    Returns:
        The same DataFrame object, now including the ``fold`` column.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    # ``split`` yields positional row numbers.  Writing them through ``.loc``
    # would interpret them as index labels, which is only correct when the index
    # happens to be 0..n-1.  Filling a plain array by position works for any index.
    fold_ids = np.zeros(len(train), dtype=np.uint8)
    for n, (_, val_index) in enumerate(splitter.split(train, train["judgement"])):
        fold_ids[val_index] = n
    train["fold"] = fold_ids

    return train

In [11]:
def get_test_data(test):
    """Return ``test`` unchanged; no test-side preprocessing is applied."""
    return test

In [12]:
# Add the cross-validation `fold` column to the training frame.
train = get_train_data(train)

In [13]:
class BaseDataset(Dataset):
    """Torch dataset that serves BERT-tokenized ``title`` texts.

    All titles are tokenized once at construction time, padded or truncated to
    72 tokens; the encodings are turned into tensors when an item is accessed.
    """

    def __init__(self, df, model_name, include_labels=True):
        """Tokenize ``df["title"]`` and optionally keep the targets.

        Args:
            df: DataFrame with a ``title`` column and, when ``include_labels``
                is true, a ``judgement`` column.
            model_name: Name or path handed to ``BertTokenizer.from_pretrained``.
            include_labels: Whether items should also carry the label.
        """
        bert_tokenizer = T.BertTokenizer.from_pretrained(model_name)

        self.df = df
        self.include_labels = include_labels
        self.title = df["title"]

        self.encoded = bert_tokenizer.batch_encode_plus(
            self.title,
            max_length=72,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        if self.include_labels:
            self.labels = df["judgement"].values

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for row ``idx``, plus the float label if enabled."""
        sample = (
            torch.tensor(self.encoded["input_ids"][idx]),
            torch.tensor(self.encoded["attention_mask"][idx]),
        )
        if not self.include_labels:
            return sample

        target = torch.tensor(self.labels[idx]).float()
        return sample + (target,)

In [14]:
class BaseModel(nn.Module):
    """BERT sequence classifier with a single sigmoid-activated output per sample."""

    def __init__(self, model_name):
        """Load ``BertForSequenceClassification`` with one output label.

        Args:
            model_name: Name or path handed to ``from_pretrained``.
        """
        super().__init__()

        self.model = T.BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid score per input sequence.

        Args:
            input_ids: Token id tensor, passed through to the BERT model.
            attention_mask: Attention mask tensor, passed through to the BERT model.

        Returns:
            The sigmoid of the logits with the trailing size-1 label dimension
            removed, i.e. shape ``(batch,)`` for logits of shape ``(batch, 1)``.
        """
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Drop only the trailing label dimension.  A bare ``squeeze()`` would also
        # remove the batch dimension of a single-sample batch, producing a 0-d
        # tensor that no longer matches the label shape and cannot be concatenated.
        out = self.sigmoid(out.logits).squeeze(-1)

        return out

In [15]:
class AverageMeter:
    """Running weighted mean that also remembers the most recent value."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as ``"<minutes>m <seconds>s"``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe the elapsed time and the projected time still remaining.

    Args:
        since: Start timestamp, comparable with ``time.time()``.
        percent: Fraction of the work already completed; must be non-zero.

    Returns:
        A string of the form ``"<elapsed> (remain <remaining>)"``.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

In [16]:
def train_fn(train_loader, model, criterion, optimizer, epoch, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(predictions, labels)``.
        optimizer: Optimizer whose gradients are reset and stepped once per batch.
        epoch: Zero-based epoch number, used only in the progress output.
        device: Device the batch tensors are moved to.

    Returns:
        Mean training loss over the epoch, weighted by batch size.
    """
    started_at = time.time()
    loss_meter = AverageMeter()

    # Put the model into training mode.
    model.train()

    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        loss = criterion(model(input_ids, attention_mask), labels)

        # Track the loss weighted by the number of samples in this batch.
        loss_meter.update(loss.item(), labels.size(0))

        loss.backward()
        optimizer.step()

        # Report progress every 100 steps and on the final step.
        is_last_step = step == (len(train_loader) - 1)
        if step % 100 == 0 or is_last_step:
            progress = float(step + 1) / len(train_loader)
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(started_at, progress):s} "
                f"Loss: {loss_meter.avg:.4f} "
            )

    return loss_meter.avg

In [17]:
def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader`` without computing gradients.

    Args:
        valid_loader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        model: Module called as ``model(input_ids, attention_mask)``.
        criterion: Loss called as ``criterion(predictions, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, predictions)``: the batch-size-weighted mean loss and
        a NumPy array with the model outputs of all batches concatenated in
        loader order.
    """
    start = time.time()
    losses = AverageMeter()

    # switch to evaluation mode
    model.eval()
    preds = []

    for step, (input_ids, attention_mask, labels) in enumerate(valid_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        # Neither the forward pass nor the loss needs gradient tracking here.
        with torch.no_grad():
            y_preds = model(input_ids, attention_mask)
            # A model that squeezes its output returns a 0-d tensor for a
            # single-sample batch; restore the batch dimension so the loss sees
            # matching shapes and ``np.concatenate`` below accepts the array.
            if y_preds.dim() == 0:
                y_preds = y_preds.unsqueeze(0)
            loss = criterion(y_preds, labels)
        losses.update(loss.item(), batch_size)

        # record score
        preds.append(y_preds.to("cpu").numpy())

        if step % 100 == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )

    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [18]:
def train_loop(train, fold):
    """Fine-tune BERT on every fold except ``fold`` and validate on ``fold``.

    Trains for three epochs, scores each epoch with F-beta (beta = 7) on the
    validation predictions thresholded at ``border``, and saves the weights and
    predictions of the best-scoring epoch under ``OUTPUT_DIR``.

    Args:
        train: DataFrame with ``title``, ``judgement`` and ``fold`` columns.
        fold: Number of the fold held out for validation.

    Returns:
        The validation rows (index reset) with an added ``preds`` column holding
        the predictions of the best-scoring epoch.
    """
    model_name = "bert-base-uncased"
    checkpoint_path = OUTPUT_DIR + f"{model_name}_fold{fold}_best.pth"

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    trn_idx = train[train["fold"] != fold].index
    val_idx = train[train["fold"] == fold].index

    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)

    train_dataset = BaseDataset(train_folds, model_name)
    valid_dataset = BaseDataset(valid_folds, model_name)

    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=16,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=False,
    )

    # ====================================================
    # Model
    # ====================================================
    model = BaseModel(model_name)
    model.to(device)

    # NOTE(review): transformers' own AdamW is reportedly deprecated upstream in
    # favour of torch.optim.AdamW; the two differ in defaults, so switching would
    # change training results and is left for a deliberate decision.
    optimizer = T.AdamW(model.parameters(), lr=2e-5)

    criterion = nn.BCELoss()

    # ====================================================
    # Loop
    # ====================================================
    best_score = -1
    best_preds = None

    for epoch in range(3):
        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)

        # eval
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        valid_labels = valid_folds["judgement"].values

        # scoring
        score = fbeta_score(valid_labels, np.where(preds < border, 0, 1), beta=7.0)

        elapsed = time.time() - start_time
        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score}")

        if score > best_score:
            best_score = score
            best_preds = preds
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save({"model": model.state_dict(), "preds": preds}, checkpoint_path)

    # The best epoch's predictions are still in memory, so the checkpoint (which
    # also contains the full model weights) does not need to be read back from
    # disk just to recover them.
    valid_folds["preds"] = best_preds

    return valid_folds

In [19]:
def inference():
    """Average the test-set predictions of the five per-fold checkpoints.

    Builds a loader over the global ``test`` frame, restores each fold's saved
    weights from ``OUTPUT_DIR`` into a fresh ``BaseModel`` and predicts the whole
    test set with it.

    Returns:
        NumPy array with the element-wise mean of the five folds' predictions.
    """
    predictions = []

    test_dataset = BaseDataset(test, "bert-base-uncased", include_labels=False)
    test_loader = DataLoader(
        test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True
    )

    for fold in range(5):
        LOGGER.info(f"========== model: bert-base-uncased fold: {fold} inference ==========")
        model = BaseModel("bert-base-uncased")
        model.to(device)
        # map_location lets a checkpoint saved on one device be restored on
        # another (e.g. GPU-trained weights on a CPU-only machine).
        # NOTE(review): the checkpoint also stores a NumPy array under "preds";
        # recent PyTorch releases default torch.load to weights_only=True, which
        # may reject it -- confirm against the installed version.
        check_point = torch.load(
            OUTPUT_DIR + f"bert-base-uncased_fold{fold}_best.pth", map_location=device
        )
        model.load_state_dict(check_point["model"])
        model.eval()
        preds = []
        for input_ids, attention_mask in tqdm(test_loader, total=len(test_loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            with torch.no_grad():
                y_preds = model(input_ids, attention_mask)
            # atleast_1d keeps a single-sample batch concatenable even if the
            # model returned a 0-d tensor for it.
            preds.append(np.atleast_1d(y_preds.to("cpu").numpy()))
        preds = np.concatenate(preds)
        predictions.append(preds)
    predictions = np.mean(predictions, axis=0)

    return predictions

In [20]:
def get_result(result_df):
    """Log the F-beta (beta = 7) score of thresholded predictions.

    Args:
        result_df: DataFrame with ``preds`` (model outputs) and ``judgement``
            (true labels) columns.  Outputs below ``border`` count as class 0,
            all others as class 1.
    """
    model_outputs = result_df["preds"].values
    true_labels = result_df["judgement"].values
    predicted_labels = np.where(model_outputs < border, 0, 1)
    score = fbeta_score(true_labels, predicted_labels, beta=7.0)
    LOGGER.info(f"Score: {score:<.5f}")

In [24]:
def main():
    """Run 5-fold training, report the scores and write the result files.

    Trains every fold, logs the per-fold and overall out-of-fold score, saves
    the out-of-fold frame to ``oof_df.csv``, then predicts the test set and
    writes the thresholded predictions to ``submission.csv`` (both under
    ``OUTPUT_DIR``).  The global ``sub`` frame's ``judgement`` column is
    overwritten in the process.
    """
    # Training: collect each fold's out-of-fold frame and concatenate once at the
    # end instead of growing a DataFrame (starting from an empty one) in the loop.
    fold_frames = []
    for fold in range(5):
        _oof_df = train_loop(train, fold)
        fold_frames.append(_oof_df)
        LOGGER.info(f"========== fold: {fold} result ==========")
        get_result(_oof_df)
    oof_df = pd.concat(fold_frames)

    # CV result
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)

    # Save OOF result
    oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    # Inference: averaged scores below `border` become 0, all others 1.
    predictions = inference()
    predictions = np.where(predictions < border, 0, 1)

    # submission
    sub["judgement"] = predictions
    sub.to_csv(OUTPUT_DIR + "submission.csv", index=False, header=False)

In [25]:
# Run the whole pipeline (training, CV report, inference, submission) when this
# code is executed directly rather than imported.
if __name__ == "__main__":
    main()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch: [1][0/1357] Elapsed 0m 0s (remain 5m 4s) Loss: 0.9828 
Epoch: [1][100/1357] Elapsed 0m 10s (remain 2m 9s) Loss: 0.1852 
Epoch: [1][200/1357] Elapsed 0m 20s (remain 1m 58s) Loss: 0.1437 
Epoch: [1][300/1357] Elapsed 0m 31s (remain 1m 48s) Loss: 0.1317 
Epoch: [1][400/1357] Elapsed 0m 41s (remain 1m 38s) Loss: 0.1188 
Epoch: [1][500/1357] Elapsed 0m 51s (remain 1m 28s) Loss: 0.1123 
Epoch: [1][600/1357] Elapsed 1m 2s (remain 1m 18s) Loss: 0.1099 
Epoch: [1][700/1357] Elapsed 1m 12s (remain 1m 7s) Loss: 0.1083 
Epoch: [1][800/1357] Elapsed 1m 22s (remain 0m 57s) Loss: 0.1043 
Epoch: [1][900/1357] Elapsed 1m 33s (remain 0m 47s) Loss: 0.0992 
Epoch: [1][1000/1357] Elapsed 1m 44s (remain 0m 37s) Loss: 0.0952 
Epoch: [1][1100/1357] Elapsed 1m 54s (remain 0m 26s) Loss: 0.0926 
Epoch: [1][1200/1357] Elapsed 2m 4s (remain 0m 16s) Loss: 0.0905 
Epoch: [1][1300/1357] Elapsed 2m 14s (remain 0m 5s) Loss: 0.0889 




Epoch: [1][1356/1357] Elapsed 2m 20s (remain 0m 0s) Loss: 0.0866 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 48s) Loss: 0.0015 




EVAL: [100/340] Elapsed 0m 3s (remain 0m 7s) Loss: 0.0544 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0598 
EVAL: [300/340] Elapsed 0m 9s (remain 0m 1s) Loss: 0.0596 


Epoch 1 - avg_train_loss: 0.0866  avg_val_loss: 0.0601  time: 151s
Epoch 1 - Score: 0.8168927250308262
Epoch 1 - Save Best Score: 0.8169 Model


EVAL: [339/340] Elapsed 0m 10s (remain 0m 0s) Loss: 0.0601 




Epoch: [2][0/1357] Elapsed 0m 0s (remain 5m 1s) Loss: 0.0036 
Epoch: [2][100/1357] Elapsed 0m 10s (remain 2m 12s) Loss: 0.0443 
Epoch: [2][200/1357] Elapsed 0m 20s (remain 1m 58s) Loss: 0.0492 
Epoch: [2][300/1357] Elapsed 0m 30s (remain 1m 48s) Loss: 0.0538 
Epoch: [2][400/1357] Elapsed 0m 41s (remain 1m 38s) Loss: 0.0519 
Epoch: [2][500/1357] Elapsed 0m 52s (remain 1m 29s) Loss: 0.0506 
Epoch: [2][600/1357] Elapsed 1m 3s (remain 1m 19s) Loss: 0.0503 
Epoch: [2][700/1357] Elapsed 1m 14s (remain 1m 9s) Loss: 0.0477 
Epoch: [2][800/1357] Elapsed 1m 26s (remain 1m 0s) Loss: 0.0473 
Epoch: [2][900/1357] Elapsed 1m 37s (remain 0m 49s) Loss: 0.0482 
Epoch: [2][1000/1357] Elapsed 1m 49s (remain 0m 38s) Loss: 0.0483 
Epoch: [2][1100/1357] Elapsed 2m 0s (remain 0m 27s) Loss: 0.0470 
Epoch: [2][1200/1357] Elapsed 2m 11s (remain 0m 17s) Loss: 0.0478 
Epoch: [2][1300/1357] Elapsed 2m 22s (remain 0m 6s) Loss: 0.0480 




Epoch: [2][1356/1357] Elapsed 2m 28s (remain 0m 0s) Loss: 0.0475 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 49s) Loss: 0.0017 




EVAL: [100/340] Elapsed 0m 3s (remain 0m 7s) Loss: 0.0521 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0621 
EVAL: [300/340] Elapsed 0m 9s (remain 0m 1s) Loss: 0.0624 


Epoch 2 - avg_train_loss: 0.0475  avg_val_loss: 0.0633  time: 160s
Epoch 2 - Score: 0.8340649692712907
Epoch 2 - Save Best Score: 0.8341 Model


EVAL: [339/340] Elapsed 0m 10s (remain 0m 0s) Loss: 0.0633 




Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 3s) Loss: 0.0852 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 22s) Loss: 0.0483 
Epoch: [3][200/1357] Elapsed 0m 22s (remain 2m 10s) Loss: 0.0345 
Epoch: [3][300/1357] Elapsed 0m 33s (remain 1m 59s) Loss: 0.0288 
Epoch: [3][400/1357] Elapsed 0m 45s (remain 1m 47s) Loss: 0.0306 
Epoch: [3][500/1357] Elapsed 0m 56s (remain 1m 36s) Loss: 0.0312 
Epoch: [3][600/1357] Elapsed 1m 7s (remain 1m 25s) Loss: 0.0322 
Epoch: [3][700/1357] Elapsed 1m 18s (remain 1m 13s) Loss: 0.0307 
Epoch: [3][800/1357] Elapsed 1m 29s (remain 1m 2s) Loss: 0.0314 
Epoch: [3][900/1357] Elapsed 1m 41s (remain 0m 51s) Loss: 0.0320 
Epoch: [3][1000/1357] Elapsed 1m 52s (remain 0m 39s) Loss: 0.0312 
Epoch: [3][1100/1357] Elapsed 2m 3s (remain 0m 28s) Loss: 0.0313 
Epoch: [3][1200/1357] Elapsed 2m 14s (remain 0m 17s) Loss: 0.0316 
Epoch: [3][1300/1357] Elapsed 2m 25s (remain 0m 6s) Loss: 0.0311 




Epoch: [3][1356/1357] Elapsed 2m 32s (remain 0m 0s) Loss: 0.0311 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 53s) Loss: 0.0011 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0543 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0602 
EVAL: [300/340] Elapsed 0m 10s (remain 0m 1s) Loss: 0.0590 


Epoch 3 - avg_train_loss: 0.0311  avg_val_loss: 0.0574  time: 164s
Epoch 3 - Score: 0.8250572956455308


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0574 


Score: 0.83406
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the mode

Epoch: [1][0/1357] Elapsed 0m 0s (remain 5m 9s) Loss: 0.9735 
Epoch: [1][100/1357] Elapsed 0m 10s (remain 2m 12s) Loss: 0.1537 
Epoch: [1][200/1357] Elapsed 0m 21s (remain 2m 1s) Loss: 0.1419 
Epoch: [1][300/1357] Elapsed 0m 32s (remain 1m 52s) Loss: 0.1178 
Epoch: [1][400/1357] Elapsed 0m 43s (remain 1m 43s) Loss: 0.1078 
Epoch: [1][500/1357] Elapsed 0m 54s (remain 1m 33s) Loss: 0.1000 
Epoch: [1][600/1357] Elapsed 1m 5s (remain 1m 22s) Loss: 0.0962 
Epoch: [1][700/1357] Elapsed 1m 17s (remain 1m 12s) Loss: 0.0929 
Epoch: [1][800/1357] Elapsed 1m 28s (remain 1m 1s) Loss: 0.0906 
Epoch: [1][900/1357] Elapsed 1m 39s (remain 0m 50s) Loss: 0.0867 
Epoch: [1][1000/1357] Elapsed 1m 50s (remain 0m 39s) Loss: 0.0853 
Epoch: [1][1100/1357] Elapsed 2m 3s (remain 0m 28s) Loss: 0.0828 
Epoch: [1][1200/1357] Elapsed 2m 15s (remain 0m 17s) Loss: 0.0816 
Epoch: [1][1300/1357] Elapsed 2m 26s (remain 0m 6s) Loss: 0.0806 
Epoch: [1][1356/1357] Elapsed 2m 32s (remain 0m 0s) Loss: 0.0792 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 56s) Loss: 0.0861 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0760 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0612 
EVAL: [300/340] Elapsed 0m 9s (remain 0m 1s) Loss: 0.0579 


Epoch 1 - avg_train_loss: 0.0792  avg_val_loss: 0.0577  time: 164s
Epoch 1 - Score: 0.7750759878419452
Epoch 1 - Save Best Score: 0.7751 Model


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0577 




Epoch: [2][0/1357] Elapsed 0m 0s (remain 5m 9s) Loss: 0.0739 
Epoch: [2][100/1357] Elapsed 0m 12s (remain 2m 33s) Loss: 0.0460 
Epoch: [2][200/1357] Elapsed 0m 24s (remain 2m 18s) Loss: 0.0473 
Epoch: [2][300/1357] Elapsed 0m 34s (remain 2m 2s) Loss: 0.0475 
Epoch: [2][400/1357] Elapsed 0m 45s (remain 1m 49s) Loss: 0.0477 
Epoch: [2][500/1357] Elapsed 0m 57s (remain 1m 37s) Loss: 0.0454 
Epoch: [2][600/1357] Elapsed 1m 8s (remain 1m 26s) Loss: 0.0435 
Epoch: [2][700/1357] Elapsed 1m 19s (remain 1m 14s) Loss: 0.0440 
Epoch: [2][800/1357] Elapsed 1m 30s (remain 1m 3s) Loss: 0.0451 
Epoch: [2][900/1357] Elapsed 1m 42s (remain 0m 51s) Loss: 0.0455 
Epoch: [2][1000/1357] Elapsed 1m 53s (remain 0m 40s) Loss: 0.0454 
Epoch: [2][1100/1357] Elapsed 2m 4s (remain 0m 28s) Loss: 0.0456 
Epoch: [2][1200/1357] Elapsed 2m 15s (remain 0m 17s) Loss: 0.0465 
Epoch: [2][1300/1357] Elapsed 2m 26s (remain 0m 6s) Loss: 0.0461 
Epoch: [2][1356/1357] Elapsed 2m 33s (remain 0m 0s) Loss: 0.0462 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 54s) Loss: 0.3545 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 7s) Loss: 0.0705 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0574 
EVAL: [300/340] Elapsed 0m 10s (remain 0m 1s) Loss: 0.0534 


Epoch 2 - avg_train_loss: 0.0462  avg_val_loss: 0.0539  time: 165s
Epoch 2 - Score: 0.7874015748031495
Epoch 2 - Save Best Score: 0.7874 Model


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0539 




Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 7s) Loss: 0.0023 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 20s) Loss: 0.0273 
Epoch: [3][200/1357] Elapsed 0m 22s (remain 2m 9s) Loss: 0.0209 
Epoch: [3][300/1357] Elapsed 0m 33s (remain 1m 57s) Loss: 0.0272 
Epoch: [3][400/1357] Elapsed 0m 44s (remain 1m 46s) Loss: 0.0287 
Epoch: [3][500/1357] Elapsed 0m 55s (remain 1m 34s) Loss: 0.0295 
Epoch: [3][600/1357] Elapsed 1m 6s (remain 1m 23s) Loss: 0.0310 
Epoch: [3][700/1357] Elapsed 1m 17s (remain 1m 12s) Loss: 0.0305 
Epoch: [3][800/1357] Elapsed 1m 28s (remain 1m 1s) Loss: 0.0326 
Epoch: [3][900/1357] Elapsed 1m 38s (remain 0m 50s) Loss: 0.0319 
Epoch: [3][1000/1357] Elapsed 1m 49s (remain 0m 38s) Loss: 0.0325 
Epoch: [3][1100/1357] Elapsed 2m 0s (remain 0m 27s) Loss: 0.0320 
Epoch: [3][1200/1357] Elapsed 2m 11s (remain 0m 17s) Loss: 0.0315 
Epoch: [3][1300/1357] Elapsed 2m 22s (remain 0m 6s) Loss: 0.0319 




Epoch: [3][1356/1357] Elapsed 2m 28s (remain 0m 0s) Loss: 0.0316 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 52s) Loss: 0.1557 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 7s) Loss: 0.0764 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0614 
EVAL: [300/340] Elapsed 0m 9s (remain 0m 1s) Loss: 0.0544 


Epoch 3 - avg_train_loss: 0.0316  avg_val_loss: 0.0532  time: 160s
Epoch 3 - Score: 0.8575803981623278
Epoch 3 - Save Best Score: 0.8576 Model


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0532 


Score: 0.85758
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the mode

Epoch: [1][0/1357] Elapsed 0m 0s (remain 5m 20s) Loss: 0.6798 
Epoch: [1][100/1357] Elapsed 0m 10s (remain 2m 14s) Loss: 0.1414 
Epoch: [1][200/1357] Elapsed 0m 21s (remain 2m 3s) Loss: 0.1199 
Epoch: [1][300/1357] Elapsed 0m 32s (remain 1m 53s) Loss: 0.1146 
Epoch: [1][400/1357] Elapsed 0m 43s (remain 1m 43s) Loss: 0.1110 
Epoch: [1][500/1357] Elapsed 0m 54s (remain 1m 33s) Loss: 0.1103 
Epoch: [1][600/1357] Elapsed 1m 5s (remain 1m 22s) Loss: 0.1035 
Epoch: [1][700/1357] Elapsed 1m 16s (remain 1m 12s) Loss: 0.0996 
Epoch: [1][800/1357] Elapsed 1m 28s (remain 1m 1s) Loss: 0.1001 
Epoch: [1][900/1357] Elapsed 1m 39s (remain 0m 50s) Loss: 0.0974 
Epoch: [1][1000/1357] Elapsed 1m 50s (remain 0m 39s) Loss: 0.0955 
Epoch: [1][1100/1357] Elapsed 2m 2s (remain 0m 28s) Loss: 0.0949 
Epoch: [1][1200/1357] Elapsed 2m 13s (remain 0m 17s) Loss: 0.0932 
Epoch: [1][1300/1357] Elapsed 2m 24s (remain 0m 6s) Loss: 0.0905 




Epoch: [1][1356/1357] Elapsed 2m 30s (remain 0m 0s) Loss: 0.0909 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 52s) Loss: 0.0161 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 7s) Loss: 0.0462 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0549 
EVAL: [300/340] Elapsed 0m 9s (remain 0m 1s) Loss: 0.0575 


Epoch 1 - avg_train_loss: 0.0909  avg_val_loss: 0.0571  time: 161s
Epoch 1 - Score: 0.8086253369272236
Epoch 1 - Save Best Score: 0.8086 Model


EVAL: [339/340] Elapsed 0m 10s (remain 0m 0s) Loss: 0.0571 




Epoch: [2][0/1357] Elapsed 0m 0s (remain 4m 57s) Loss: 0.0238 
Epoch: [2][100/1357] Elapsed 0m 10s (remain 2m 16s) Loss: 0.0663 
Epoch: [2][200/1357] Elapsed 0m 22s (remain 2m 7s) Loss: 0.0643 
Epoch: [2][300/1357] Elapsed 0m 33s (remain 1m 57s) Loss: 0.0616 
Epoch: [2][400/1357] Elapsed 0m 44s (remain 1m 46s) Loss: 0.0624 
Epoch: [2][500/1357] Elapsed 0m 55s (remain 1m 35s) Loss: 0.0660 
Epoch: [2][600/1357] Elapsed 1m 6s (remain 1m 23s) Loss: 0.0643 
Epoch: [2][700/1357] Elapsed 1m 17s (remain 1m 12s) Loss: 0.0633 
Epoch: [2][800/1357] Elapsed 1m 28s (remain 1m 1s) Loss: 0.0615 
Epoch: [2][900/1357] Elapsed 1m 39s (remain 0m 50s) Loss: 0.0608 
Epoch: [2][1000/1357] Elapsed 1m 49s (remain 0m 38s) Loss: 0.0584 
Epoch: [2][1100/1357] Elapsed 2m 0s (remain 0m 27s) Loss: 0.0571 
Epoch: [2][1200/1357] Elapsed 2m 11s (remain 0m 17s) Loss: 0.0554 
Epoch: [2][1300/1357] Elapsed 2m 22s (remain 0m 6s) Loss: 0.0540 




Epoch: [2][1356/1357] Elapsed 2m 29s (remain 0m 0s) Loss: 0.0545 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 53s) Loss: 0.0218 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0419 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0536 
EVAL: [300/340] Elapsed 0m 9s (remain 0m 1s) Loss: 0.0561 


Epoch 2 - avg_train_loss: 0.0545  avg_val_loss: 0.0564  time: 160s
Epoch 2 - Score: 0.7627118644067797


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0564 




Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 9s) Loss: 0.2975 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 23s) Loss: 0.0377 
Epoch: [3][200/1357] Elapsed 0m 22s (remain 2m 11s) Loss: 0.0413 
Epoch: [3][300/1357] Elapsed 0m 34s (remain 1m 59s) Loss: 0.0389 
Epoch: [3][400/1357] Elapsed 0m 45s (remain 1m 48s) Loss: 0.0374 
Epoch: [3][500/1357] Elapsed 0m 56s (remain 1m 36s) Loss: 0.0365 
Epoch: [3][600/1357] Elapsed 1m 8s (remain 1m 25s) Loss: 0.0370 
Epoch: [3][700/1357] Elapsed 1m 19s (remain 1m 14s) Loss: 0.0381 
Epoch: [3][800/1357] Elapsed 1m 31s (remain 1m 3s) Loss: 0.0370 
Epoch: [3][900/1357] Elapsed 1m 42s (remain 0m 52s) Loss: 0.0374 
Epoch: [3][1000/1357] Elapsed 1m 54s (remain 0m 40s) Loss: 0.0383 
Epoch: [3][1100/1357] Elapsed 2m 5s (remain 0m 29s) Loss: 0.0381 
Epoch: [3][1200/1357] Elapsed 2m 16s (remain 0m 17s) Loss: 0.0395 
Epoch: [3][1300/1357] Elapsed 2m 27s (remain 0m 6s) Loss: 0.0394 




Epoch: [3][1356/1357] Elapsed 2m 34s (remain 0m 0s) Loss: 0.0386 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 53s) Loss: 0.0639 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0450 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0562 
EVAL: [300/340] Elapsed 0m 10s (remain 0m 1s) Loss: 0.0543 


Epoch 3 - avg_train_loss: 0.0386  avg_val_loss: 0.0542  time: 166s
Epoch 3 - Score: 0.8223429014740108
Epoch 3 - Save Best Score: 0.8223 Model


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0542 


Score: 0.82234
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the mode

Epoch: [1][0/1357] Elapsed 0m 0s (remain 4m 58s) Loss: 0.5854 
Epoch: [1][100/1357] Elapsed 0m 10s (remain 2m 13s) Loss: 0.1364 
Epoch: [1][200/1357] Elapsed 0m 21s (remain 2m 0s) Loss: 0.1248 
Epoch: [1][300/1357] Elapsed 0m 31s (remain 1m 50s) Loss: 0.1142 
Epoch: [1][400/1357] Elapsed 0m 42s (remain 1m 40s) Loss: 0.1072 
Epoch: [1][500/1357] Elapsed 0m 53s (remain 1m 30s) Loss: 0.1045 
Epoch: [1][600/1357] Elapsed 1m 4s (remain 1m 21s) Loss: 0.1025 
Epoch: [1][700/1357] Elapsed 1m 15s (remain 1m 10s) Loss: 0.1000 
Epoch: [1][800/1357] Elapsed 1m 27s (remain 1m 0s) Loss: 0.0963 
Epoch: [1][900/1357] Elapsed 1m 39s (remain 0m 50s) Loss: 0.0937 
Epoch: [1][1000/1357] Elapsed 1m 51s (remain 0m 39s) Loss: 0.0894 
Epoch: [1][1100/1357] Elapsed 2m 3s (remain 0m 28s) Loss: 0.0868 
Epoch: [1][1200/1357] Elapsed 2m 14s (remain 0m 17s) Loss: 0.0854 
Epoch: [1][1300/1357] Elapsed 2m 25s (remain 0m 6s) Loss: 0.0830 




Epoch: [1][1356/1357] Elapsed 2m 32s (remain 0m 0s) Loss: 0.0818 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 55s) Loss: 0.0204 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0742 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0688 
EVAL: [300/340] Elapsed 0m 10s (remain 0m 1s) Loss: 0.0677 


Epoch 1 - avg_train_loss: 0.0818  avg_val_loss: 0.0677  time: 163s
Epoch 1 - Score: 0.7098625585259024
Epoch 1 - Save Best Score: 0.7099 Model


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0677 




Epoch: [2][0/1357] Elapsed 0m 0s (remain 5m 26s) Loss: 0.0047 
Epoch: [2][100/1357] Elapsed 0m 11s (remain 2m 22s) Loss: 0.0431 
Epoch: [2][200/1357] Elapsed 0m 22s (remain 2m 10s) Loss: 0.0415 
Epoch: [2][300/1357] Elapsed 0m 33s (remain 1m 58s) Loss: 0.0387 
Epoch: [2][400/1357] Elapsed 0m 45s (remain 1m 47s) Loss: 0.0390 
Epoch: [2][500/1357] Elapsed 0m 56s (remain 1m 36s) Loss: 0.0404 
Epoch: [2][600/1357] Elapsed 1m 7s (remain 1m 25s) Loss: 0.0431 
Epoch: [2][700/1357] Elapsed 1m 18s (remain 1m 13s) Loss: 0.0425 
Epoch: [2][800/1357] Elapsed 1m 30s (remain 1m 2s) Loss: 0.0419 
Epoch: [2][900/1357] Elapsed 1m 43s (remain 0m 52s) Loss: 0.0424 
Epoch: [2][1000/1357] Elapsed 1m 56s (remain 0m 41s) Loss: 0.0433 
Epoch: [2][1100/1357] Elapsed 2m 9s (remain 0m 30s) Loss: 0.0441 
Epoch: [2][1200/1357] Elapsed 2m 21s (remain 0m 18s) Loss: 0.0454 
Epoch: [2][1300/1357] Elapsed 2m 32s (remain 0m 6s) Loss: 0.0449 




Epoch: [2][1356/1357] Elapsed 2m 39s (remain 0m 0s) Loss: 0.0458 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 54s) Loss: 0.0123 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0581 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0517 
EVAL: [300/340] Elapsed 0m 10s (remain 0m 1s) Loss: 0.0514 


Epoch 2 - avg_train_loss: 0.0458  avg_val_loss: 0.0517  time: 171s
Epoch 2 - Score: 0.826943316794467
Epoch 2 - Save Best Score: 0.8269 Model


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0517 




Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 13s) Loss: 0.0024 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 24s) Loss: 0.0257 
Epoch: [3][200/1357] Elapsed 0m 22s (remain 2m 11s) Loss: 0.0240 
Epoch: [3][300/1357] Elapsed 0m 34s (remain 1m 59s) Loss: 0.0252 
Epoch: [3][400/1357] Elapsed 0m 45s (remain 1m 48s) Loss: 0.0244 
Epoch: [3][500/1357] Elapsed 0m 56s (remain 1m 37s) Loss: 0.0235 
Epoch: [3][600/1357] Elapsed 1m 7s (remain 1m 25s) Loss: 0.0266 
Epoch: [3][700/1357] Elapsed 1m 19s (remain 1m 14s) Loss: 0.0270 
Epoch: [3][800/1357] Elapsed 1m 31s (remain 1m 3s) Loss: 0.0261 
Epoch: [3][900/1357] Elapsed 1m 43s (remain 0m 52s) Loss: 0.0259 
Epoch: [3][1000/1357] Elapsed 1m 54s (remain 0m 40s) Loss: 0.0258 
Epoch: [3][1100/1357] Elapsed 2m 6s (remain 0m 29s) Loss: 0.0263 
Epoch: [3][1200/1357] Elapsed 2m 18s (remain 0m 17s) Loss: 0.0271 
Epoch: [3][1300/1357] Elapsed 2m 30s (remain 0m 6s) Loss: 0.0272 




Epoch: [3][1356/1357] Elapsed 2m 36s (remain 0m 0s) Loss: 0.0270 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 55s) Loss: 0.0061 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0824 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0703 
EVAL: [300/340] Elapsed 0m 10s (remain 0m 1s) Loss: 0.0680 


Epoch 3 - avg_train_loss: 0.0270  avg_val_loss: 0.0683  time: 168s
Epoch 3 - Score: 0.7226322263222632


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0683 


Score: 0.82694
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the mode

Epoch: [1][0/1357] Elapsed 0m 0s (remain 6m 56s) Loss: 0.7124 
Epoch: [1][100/1357] Elapsed 0m 11s (remain 2m 16s) Loss: 0.1473 
Epoch: [1][200/1357] Elapsed 0m 21s (remain 2m 5s) Loss: 0.1275 
Epoch: [1][300/1357] Elapsed 0m 32s (remain 1m 55s) Loss: 0.1139 
Epoch: [1][400/1357] Elapsed 0m 44s (remain 1m 45s) Loss: 0.1060 
Epoch: [1][500/1357] Elapsed 0m 55s (remain 1m 34s) Loss: 0.1007 
Epoch: [1][600/1357] Elapsed 1m 6s (remain 1m 23s) Loss: 0.0964 
Epoch: [1][700/1357] Elapsed 1m 18s (remain 1m 13s) Loss: 0.0940 
Epoch: [1][800/1357] Elapsed 1m 30s (remain 1m 2s) Loss: 0.0915 
Epoch: [1][900/1357] Elapsed 1m 41s (remain 0m 51s) Loss: 0.0889 
Epoch: [1][1000/1357] Elapsed 1m 53s (remain 0m 40s) Loss: 0.0859 
Epoch: [1][1100/1357] Elapsed 2m 4s (remain 0m 28s) Loss: 0.0851 
Epoch: [1][1200/1357] Elapsed 2m 16s (remain 0m 17s) Loss: 0.0836 
Epoch: [1][1300/1357] Elapsed 2m 27s (remain 0m 6s) Loss: 0.0827 




Epoch: [1][1356/1357] Elapsed 2m 33s (remain 0m 0s) Loss: 0.0824 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 58s) Loss: 0.0178 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0423 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0455 
EVAL: [300/340] Elapsed 0m 10s (remain 0m 1s) Loss: 0.0483 


Epoch 1 - avg_train_loss: 0.0824  avg_val_loss: 0.0499  time: 165s
Epoch 1 - Score: 0.8587097705296579
Epoch 1 - Save Best Score: 0.8587 Model


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0499 




Epoch: [2][0/1357] Elapsed 0m 0s (remain 5m 36s) Loss: 0.1120 
Epoch: [2][100/1357] Elapsed 0m 11s (remain 2m 25s) Loss: 0.0549 
Epoch: [2][200/1357] Elapsed 0m 23s (remain 2m 13s) Loss: 0.0534 
Epoch: [2][300/1357] Elapsed 0m 34s (remain 2m 1s) Loss: 0.0553 
Epoch: [2][400/1357] Elapsed 0m 46s (remain 1m 49s) Loss: 0.0576 
Epoch: [2][500/1357] Elapsed 0m 57s (remain 1m 38s) Loss: 0.0560 
Epoch: [2][600/1357] Elapsed 1m 8s (remain 1m 26s) Loss: 0.0534 
Epoch: [2][700/1357] Elapsed 1m 20s (remain 1m 15s) Loss: 0.0511 
Epoch: [2][800/1357] Elapsed 1m 31s (remain 1m 3s) Loss: 0.0511 
Epoch: [2][900/1357] Elapsed 1m 43s (remain 0m 52s) Loss: 0.0506 
Epoch: [2][1000/1357] Elapsed 1m 54s (remain 0m 40s) Loss: 0.0499 
Epoch: [2][1100/1357] Elapsed 2m 5s (remain 0m 29s) Loss: 0.0492 
Epoch: [2][1200/1357] Elapsed 2m 17s (remain 0m 17s) Loss: 0.0489 
Epoch: [2][1300/1357] Elapsed 2m 28s (remain 0m 6s) Loss: 0.0481 




Epoch: [2][1356/1357] Elapsed 2m 34s (remain 0m 0s) Loss: 0.0482 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 56s) Loss: 0.0036 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0468 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0441 
EVAL: [300/340] Elapsed 0m 10s (remain 0m 1s) Loss: 0.0463 


Epoch 2 - avg_train_loss: 0.0482  avg_val_loss: 0.0474  time: 166s
Epoch 2 - Score: 0.8806986382474836
Epoch 2 - Save Best Score: 0.8807 Model


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0474 




Epoch: [3][0/1357] Elapsed 0m 0s (remain 5m 19s) Loss: 0.0879 
Epoch: [3][100/1357] Elapsed 0m 11s (remain 2m 25s) Loss: 0.0382 
Epoch: [3][200/1357] Elapsed 0m 23s (remain 2m 12s) Loss: 0.0317 
Epoch: [3][300/1357] Elapsed 0m 34s (remain 2m 0s) Loss: 0.0293 
Epoch: [3][400/1357] Elapsed 0m 45s (remain 1m 49s) Loss: 0.0296 
Epoch: [3][500/1357] Elapsed 0m 57s (remain 1m 37s) Loss: 0.0312 
Epoch: [3][600/1357] Elapsed 1m 8s (remain 1m 26s) Loss: 0.0332 
Epoch: [3][700/1357] Elapsed 1m 19s (remain 1m 14s) Loss: 0.0322 
Epoch: [3][800/1357] Elapsed 1m 31s (remain 1m 3s) Loss: 0.0315 
Epoch: [3][900/1357] Elapsed 1m 42s (remain 0m 51s) Loss: 0.0301 
Epoch: [3][1000/1357] Elapsed 1m 53s (remain 0m 40s) Loss: 0.0297 
Epoch: [3][1100/1357] Elapsed 2m 5s (remain 0m 29s) Loss: 0.0301 
Epoch: [3][1200/1357] Elapsed 2m 16s (remain 0m 17s) Loss: 0.0301 
Epoch: [3][1300/1357] Elapsed 2m 27s (remain 0m 6s) Loss: 0.0294 




Epoch: [3][1356/1357] Elapsed 2m 33s (remain 0m 0s) Loss: 0.0295 




EVAL: [0/340] Elapsed 0m 0s (remain 0m 56s) Loss: 0.0005 
EVAL: [100/340] Elapsed 0m 3s (remain 0m 8s) Loss: 0.0628 
EVAL: [200/340] Elapsed 0m 6s (remain 0m 4s) Loss: 0.0653 
EVAL: [300/340] Elapsed 0m 10s (remain 0m 1s) Loss: 0.0652 


Epoch 3 - avg_train_loss: 0.0295  avg_val_loss: 0.0687  time: 165s
Epoch 3 - Score: 0.6593406593406593


EVAL: [339/340] Elapsed 0m 11s (remain 0m 0s) Loss: 0.0687 


Score: 0.88070
Score: 0.84447
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/2553 [00:00<?, ?it/s]