# 🏋️ Model Training - Refit

## ⚙️ Setup 

### 📚 Importing Libraries

Importing from packages

In [1]:
import os
import gc
import time
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import torch
from torch import nn
import wandb
from pprint import pprint
import warnings
from transformers import AutoTokenizer
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Move the working directory one level up (presumably to the project root so
# that the `lib` imports and relative paths below resolve — confirm).
# NOTE: the path is relative, so re-running this cell moves up one more level
# each time; restart the kernel before re-executing it.
os.chdir("../")
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Load environment variables from a .env file; the returned flag is shown as
# the cell output.
load_dotenv()

True

Importing user-defined packages

In [3]:
from lib.config import config
from lib.paths import Paths
from lib.model.epoch_functions import valid_epoch
from lib.model.utils import get_score, get_model_optimizer_and_scheduler
from lib.utils.utils import seed_everything
from lib.utils.average_meter import AverageMeter
from lib.data import (
    clean_text,
    sliding_window,
    negative_sample_df,
    get_data_loaders,
    collate,
)
from lib.criterion.metrics import log_metrics
from lib.model.inference import ensemble_inference

In [4]:
# Pretty-print the experiment configuration so the run's settings are
# recorded alongside the outputs.
pprint(config)

namespace(apex=True,
          batch_scheduler=True,
          batch_size_train=32,
          batch_size_valid=32,
          betas=[0.9, 0.999],
          data_version=4,
          debug=False,
          decoder_lr=2e-05,
          encoder_lr=2e-05,
          epochs=2,
          eps=1e-06,
          gradient_accumulation_steps=1,
          gradient_checkpointing=True,
          max_grad_norm=1000,
          max_length=512,
          min_lr=1e-06,
          model='microsoft/deberta-v3-xsmall',
          n_folds=7,
          negative_sample=True,
          negative_sample_partitions=3,
          oversample=False,
          num_classes=6,
          num_cycles=0.5,
          num_warmup_steps=0,
          num_workers=6,
          print_freq=6,
          random_seed=20,
          scheduler='cosine',
          stride=192,
          tokenizer_version=2,
          train=True,
          train_folds=[0, 1, 2, 3, 4, 5, 6, 7],
          weight_decay=0.01)


In [5]:
# Call the project's seeding helper with its default arguments
# (config.random_seed is not passed explicitly here).
# NOTE(review): presumably this seeds the Python/NumPy/torch RNGs — confirm in
# lib.utils.utils.
seed_everything()

In [6]:
# Set the TOKENIZERS_PARALLELISM environment variable for this kernel.
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


## 📖 Definitions

### 🌎 Global Variables

In [7]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

The competition data has a class imbalance problem, which is why classes with fewer samples can be assigned higher weights. The weighted variant is currently commented out, so every class uses a weight of 1.0.

In [8]:
# Per-class weights passed to the cross-entropy loss in `train_model`.
# All classes are currently weighted equally; the commented-out vector below
# is the alternative that weights the classes unevenly.
# class_weights = torch.tensor([1.0, 0.25, 0.25, 0.5, 1.0, 2.0]).to(device)
# The length comes from config.num_classes instead of a hard-coded 6 so the
# weight vector always matches the configured number of classes.
class_weights = torch.ones(config.num_classes, device=device)

In [9]:
# Load the tokenizer saved at Paths.TOKENIZER_PATH.
tokenizer = AutoTokenizer.from_pretrained(Paths.TOKENIZER_PATH)
# Token -> id mapping as returned by the tokenizer.
vocabulary = tokenizer.get_vocab()
# Number of entries in that mapping.
total_tokens = len(vocabulary)
print("Total number of tokens in the tokenizer:", total_tokens)
# Print the tokenizer's repr (special tokens, padding/truncation side, ...).
print(tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Total number of tokens in the tokenizer: 128003
DebertaV2TokenizerFast(name_or_path='output/microsoft/deberta-v3-xsmall/tokenizer_v2', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized

## Data Preparation

In [10]:
# df = pd.read_csv(Paths.TRAIN_CSV_PATH)
# df.shape

### Cleaning

In [11]:
# df["full_text"] = df["full_text"].map(lambda x: clean_text(x))

In [12]:
# df["score"] = df["score"].map(lambda x: x - 1)

### Train-validation split

In [13]:
# skf = StratifiedShuffleSplit(
#     n_splits=1,
#     test_size=1000,
#     random_state=config.random_seed,
# )

# train_idx, valid_idx = next(skf.split(df["full_text"], df["score"]))
# train_idx.shape, valid_idx.shape

In [14]:
# train_df = df.iloc[train_idx]
# valid_df = df.iloc[valid_idx]

# train_df.shape, valid_df.shape

### Sliding Window

In [15]:
# train_df = sliding_window(train_df, tokenizer)
# valid_df = sliding_window(valid_df, tokenizer)
# train_df.shape, valid_df.shape

### Negative Sampling

In [16]:
# if not os.path.exists(Paths.REFIT_DATA_LOADER_PATH):
#     os.makedirs(Paths.REFIT_DATA_LOADER_PATH)

In [17]:
# for i, sampled_df in negative_sample_df(train_df):
#     train_loader, valid_loader = get_data_loaders(sampled_df, valid_df, tokenizer)

#     train_dataloader_name = f"train_{i}.pth"
#     train_dataloader_path = os.path.join(Paths.REFIT_DATA_LOADER_PATH, train_dataloader_name)
#     torch.save(train_loader, train_dataloader_path)
#     print(f"Saved {train_dataloader_path} with {len(sampled_df)} samples ")

#     valid_dataloader_path = os.path.join(Paths.REFIT_DATA_LOADER_PATH, "valid.pth")
#     torch.save(valid_loader, valid_dataloader_path)
#     print(f"Saved {valid_dataloader_path} with {len(valid_df)} samples ")

#     valid_csv_path = os.path.join(Paths.REFIT_DATA_LOADER_PATH, "valid.csv")
#     valid_df.to_csv(valid_csv_path, index=False)
#     print(f"Saved {valid_csv_path}")

### 🛠️ Functions

In [18]:
def train_step(
    step,
    inputs,
    labels,
    criterion,
    model,
    scaler,
    optimizer,
    scheduler,
    loss_tracker,
    score_tracker,
    softmax,
):
    """Run one training step: forward, backward and (maybe) an optimizer update.

    Args:
        step: Zero-based index of the batch within the current epoch; used to
            decide when an accumulated gradient is applied.
        inputs: Mapping of model inputs; every value is moved to the global
            `device` in place.
        labels: Tensor of target class indices for the batch.
        criterion: Loss callable invoked as `criterion(y_preds, labels)`.
        model: Model called as `model(inputs)`.
        scaler: `torch.cuda.amp.GradScaler` used for the backward pass and the
            optimizer step.
        optimizer: Optimizer stepped through `scaler`.
        scheduler: Stepped after each optimizer update when
            `config.batch_scheduler` is true.
        loss_tracker: Meter updated with the batch loss.
        score_tracker: Meter updated with the batch score from `get_score`.
        softmax: Callable applied to the logits before taking the arg-max.

    Returns:
        None. Results are reported through the two trackers.
    """
    model.train()

    for k, v in inputs.items():
        inputs[k] = v.to(device)

    labels = labels.to(device)

    batch_size = labels.size(0)
    with torch.cuda.amp.autocast(enabled=config.apex):
        y_preds = model(inputs)
        loss = criterion(y_preds, labels)

    # The metric is only for logging, so compute it outside autocast and
    # without tracking gradients. Logits are cast to float32 before the CPU
    # softmax because autocast may have produced half-precision outputs.
    with torch.no_grad():
        logits = y_preds.detach().float().cpu()
        _, predictions = torch.max(softmax(logits), dim=1)
        score = get_score(labels.detach().cpu().numpy(), predictions)

    # Record the true per-batch loss *before* it is scaled for gradient
    # accumulation; otherwise the logged loss is under-reported by a factor of
    # `gradient_accumulation_steps`.
    score_tracker.update(score, batch_size)
    loss_tracker.update(loss.item(), batch_size)

    if config.gradient_accumulation_steps > 1:
        loss = loss / config.gradient_accumulation_steps

    scaler.scale(loss).backward()

    # Apply the accumulated gradients once every `gradient_accumulation_steps`
    # batches.
    if (step + 1) % config.gradient_accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        if config.batch_scheduler:
            scheduler.step()

In [19]:
def valid_epoch(valid_loader, model, criterion, device):
    """Evaluate `model` on every batch of `valid_loader`.

    NOTE: this notebook-level definition replaces the `valid_epoch` imported
    from `lib.model.epoch_functions` in the imports cell.

    Args:
        valid_loader: Iterable of batches; each batch must support
            `.pop("inputs")` and `.pop("labels")`.
        model: Model called as `model(inputs)`; switched to eval mode here.
        criterion: Loss callable invoked as `criterion(y_preds, labels)`.
        device: Device the inputs and labels are moved to.

    Returns:
        Tuple `(avg_loss, avg_score)`: the averages reported by the two
        `AverageMeter` instances after all batches have been processed.
    """
    model.eval()
    loss_tracker = AverageMeter()
    score_tracker = AverageMeter()
    softmax = nn.Softmax(dim=1)

    with tqdm(
        valid_loader, unit="valid_batch", desc="Validating"
    ) as tqdm_valid_loader:
        for batch in tqdm_valid_loader:
            inputs = collate(batch.pop("inputs"))
            labels = batch.pop("labels")

            for k, v in inputs.items():
                inputs[k] = v.to(device)

            labels = labels.to(device)

            batch_size = labels.size(0)
            with torch.no_grad():
                y_preds = model(inputs)
                loss = criterion(y_preds, labels)

                # Arg-max of the softmax over the class dimension, on the CPU.
                logits = y_preds.detach().float().cpu()
                _, predictions = torch.max(softmax(logits), dim=1)
                score = get_score(labels.detach().cpu().numpy(), predictions)

            # The loss is deliberately NOT divided by
            # `gradient_accumulation_steps`: that scaling only exists to
            # average gradients during training and would distort the
            # reported validation loss.
            loss_tracker.update(loss.item(), batch_size)
            score_tracker.update(score, batch_size)

    return loss_tracker.avg, score_tracker.avg

In [20]:
def train_model(
    train_loader,
    valid_loader,
    group,
    max_epochs=20,
    log_every=25,
    valid_every=100,
    patience=5,
):
    """Train one model on `train_loader`, checkpointing on the best validation score.

    Args:
        train_loader: Iterable of training batches; each batch must support
            `.pop("inputs")` and `.pop("labels")`, and the loader must
            support `len()`.
        valid_loader: Loader passed to `valid_epoch` for periodic validation.
        group: Identifier of this model; used in the log banner and in the
            checkpoint file name.
        max_epochs: Maximum number of passes over `train_loader`.
        log_every: Print (and reset) the running train loss/score every this
            many steps within an epoch.
        valid_every: Run validation every this many steps within an epoch.
        patience: Training stops once more than `patience` consecutive
            validations fail to improve the best score.

    Returns:
        None. The best model's `state_dict` is written to
        `Paths.MODEL_OUTPUT_PATH`.

    NOTE(review): the scheduler is built from `train_loader` alone by
    `get_model_optimizer_and_scheduler`; confirm its schedule length matches
    `max_epochs`.
    """
    model, optimizer, scheduler = get_model_optimizer_and_scheduler(
        train_loader, device
    )
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    scaler = torch.cuda.amp.GradScaler(enabled=config.apex)
    loss_tracker = AverageMeter()
    score_tracker = AverageMeter()
    best_score = -1e8
    # Initialised up front: previously this counter was only created when a
    # validation improved, so a first non-improving score (e.g. NaN) raised
    # UnboundLocalError.
    early_stopping_hook = 0

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(Paths.MODEL_OUTPUT_PATH, exist_ok=True)
    model_save_path = os.path.join(
        Paths.MODEL_OUTPUT_PATH,
        f"{config.model.replace('/', '_')}_{group}.pth",
    )
    softmax = nn.Softmax(dim=1)
    steps_per_epoch = len(train_loader)

    print("=" * 100)
    print(f"Training model no {group}")
    print("=" * 100)

    for epoch in range(max_epochs):
        for step, batch in enumerate(train_loader):
            inputs, labels = collate(batch.pop("inputs")), batch.pop("labels")

            train_step(
                step,
                inputs,
                labels,
                criterion,
                model,
                scaler,
                optimizer,
                scheduler,
                loss_tracker,
                score_tracker,
                softmax,
            )

            # Step counter across epochs, used only for logging.
            global_step = epoch * steps_per_epoch + step + 1

            if (step + 1) % log_every == 0:
                print(
                    f"Step: {global_step} "
                    + f"Train loss: {loss_tracker.avg:<8.6f} "
                    + f"Train Score: {score_tracker.avg:<8.7f}"
                )
                loss_tracker.reset()
                score_tracker.reset()

            if (step + 1) % valid_every == 0:
                avg_valid_loss, avg_valid_score = valid_epoch(
                    valid_loader,
                    model,
                    criterion,
                    device,
                )

                print(
                    "\t" * 4
                    + f"Step: {global_step} "
                    + f"Valid loss: {avg_valid_loss:<8.6f} "
                    + f"Valid Score: {avg_valid_score:<8.7f}"
                )

                if avg_valid_score > best_score:
                    # New best: checkpoint and reset the patience counter.
                    best_score = avg_valid_score
                    print("\t" * 4 + f"Save Best Score: {best_score:.4f} Model")
                    torch.save(model.state_dict(), model_save_path)
                    early_stopping_hook = 0
                else:
                    early_stopping_hook += 1

                    if early_stopping_hook > patience:
                        return

In [21]:
# Load the serialized validation loader once; it is shared by every model.
# NOTE: torch.load unpickles arbitrary Python objects — only load files that
# were produced by this project.
valid_loader = torch.load(os.path.join(Paths.REFIT_DATA_LOADER_PATH, "valid.pth"))

# Train one model per negative-sampling partition; the partition index is
# passed as the `group` id, which `train_model` uses in the checkpoint name.
for i in range(config.negative_sample_partitions):
    train_loader = torch.load(
        os.path.join(Paths.REFIT_DATA_LOADER_PATH, f"train_{i}.pth")
    )

    train_model(train_loader, valid_loader, i)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Training model no 0
Step: 25 Train loss: 1.668365 Train Score: 0.3136072
Step: 50 Train loss: 1.595789 Train Score: 0.2681167
Step: 75 Train loss: 1.483813 Train Score: 0.2517583
Step: 100 Train loss: 1.405148 Train Score: 0.5399034


Validating: 100%|██████████| 43/43 [00:07<00:00,  6.08valid_batch/s]


				Step: 100 Valid loss: 1.631641 Valid Score: 0.4545698
				Save Best Score: 0.4546 Model
Step: 125 Train loss: 1.313972 Train Score: 0.5979896
Step: 150 Train loss: 1.231612 Train Score: 0.6472341
Step: 175 Train loss: 1.210390 Train Score: 0.6522348
Step: 200 Train loss: 1.099617 Train Score: 0.7209933


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.68valid_batch/s]


				Step: 200 Valid loss: 1.549360 Valid Score: 0.5598667
				Save Best Score: 0.5599 Model
Step: 225 Train loss: 1.094108 Train Score: 0.7290337
Step: 250 Train loss: 1.081026 Train Score: 0.7378409
Step: 275 Train loss: 1.033602 Train Score: 0.7540578
Step: 300 Train loss: 1.045356 Train Score: 0.7605054


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.63valid_batch/s]


				Step: 300 Valid loss: 1.127172 Valid Score: 0.7061128
				Save Best Score: 0.7061 Model
Step: 347 Train loss: 1.032314 Train Score: 0.7645337
Step: 372 Train loss: 0.964545 Train Score: 0.7869788
Step: 397 Train loss: 0.926067 Train Score: 0.8211608
Step: 422 Train loss: 0.945478 Train Score: 0.8142749


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.61valid_batch/s]


				Step: 422 Valid loss: 1.098279 Valid Score: 0.7181316
				Save Best Score: 0.7181 Model
Step: 447 Train loss: 0.916817 Train Score: 0.8058800
Step: 472 Train loss: 0.953790 Train Score: 0.8068172
Step: 497 Train loss: 0.905455 Train Score: 0.8105239
Step: 522 Train loss: 0.937706 Train Score: 0.8085786


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.61valid_batch/s]


				Step: 522 Valid loss: 1.026089 Valid Score: 0.7315657
				Save Best Score: 0.7316 Model
Step: 547 Train loss: 0.843696 Train Score: 0.8268022
Step: 572 Train loss: 0.896103 Train Score: 0.8124611
Step: 597 Train loss: 0.889271 Train Score: 0.8109945
Step: 622 Train loss: 0.845814 Train Score: 0.8370697


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.62valid_batch/s]


				Step: 622 Valid loss: 1.114254 Valid Score: 0.7163236
Step: 669 Train loss: 0.826415 Train Score: 0.8448669
Step: 694 Train loss: 0.823945 Train Score: 0.8549754
Step: 719 Train loss: 0.884044 Train Score: 0.8299577
Step: 744 Train loss: 0.830496 Train Score: 0.8329360


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.80valid_batch/s]


				Step: 744 Valid loss: 1.183954 Valid Score: 0.7031672
Step: 769 Train loss: 0.782057 Train Score: 0.8570753
Step: 794 Train loss: 0.787777 Train Score: 0.8507694
Step: 819 Train loss: 0.867009 Train Score: 0.8296344
Step: 844 Train loss: 0.777243 Train Score: 0.8503658


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.59valid_batch/s]


				Step: 844 Valid loss: 1.001806 Valid Score: 0.7606428
				Save Best Score: 0.7606 Model
Step: 869 Train loss: 0.766312 Train Score: 0.8559080
Step: 894 Train loss: 0.841452 Train Score: 0.8399428
Step: 919 Train loss: 0.793294 Train Score: 0.8520326
Step: 944 Train loss: 0.688452 Train Score: 0.8847934


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.68valid_batch/s]


				Step: 944 Valid loss: 0.982088 Valid Score: 0.7657854
				Save Best Score: 0.7658 Model
Step: 991 Train loss: 0.727722 Train Score: 0.8712275
Step: 1016 Train loss: 0.730045 Train Score: 0.8649657
Step: 1041 Train loss: 0.653481 Train Score: 0.8791255
Step: 1066 Train loss: 0.632664 Train Score: 0.8766852


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.57valid_batch/s]


				Step: 1066 Valid loss: 1.024375 Valid Score: 0.7599330
Step: 1091 Train loss: 0.709419 Train Score: 0.8696680
Step: 1116 Train loss: 0.714261 Train Score: 0.8622958
Step: 1141 Train loss: 0.724121 Train Score: 0.8620969
Step: 1166 Train loss: 0.718790 Train Score: 0.8508093


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.66valid_batch/s]


				Step: 1166 Valid loss: 1.322416 Valid Score: 0.6960582
Step: 1191 Train loss: 0.756524 Train Score: 0.8399147
Step: 1216 Train loss: 0.640135 Train Score: 0.8890805
Step: 1241 Train loss: 0.663968 Train Score: 0.8822136
Step: 1266 Train loss: 0.664545 Train Score: 0.8817361


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.70valid_batch/s]


				Step: 1266 Valid loss: 0.992712 Valid Score: 0.7584647
Step: 1313 Train loss: 0.635364 Train Score: 0.8815894
Step: 1338 Train loss: 0.589857 Train Score: 0.8921267
Step: 1363 Train loss: 0.560935 Train Score: 0.8932134
Step: 1388 Train loss: 0.605351 Train Score: 0.8770950


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.69valid_batch/s]


				Step: 1388 Valid loss: 1.101825 Valid Score: 0.7500285
Step: 1413 Train loss: 0.598124 Train Score: 0.8804385
Step: 1438 Train loss: 0.577070 Train Score: 0.8821380
Step: 1463 Train loss: 0.600212 Train Score: 0.8838637
Step: 1488 Train loss: 0.616666 Train Score: 0.8765288


Validating: 100%|██████████| 43/43 [00:07<00:00,  5.95valid_batch/s]


				Step: 1488 Valid loss: 1.104084 Valid Score: 0.7523852
Step: 1513 Train loss: 0.589711 Train Score: 0.8981576
Step: 1538 Train loss: 0.601114 Train Score: 0.8771168
