In [None]:
!pip install --upgrade wandb
!pip install --upgrade transformers
!pip install -q bitsandbytes-cuda110
!pip install sentencepiece
!pip install bitsandbytes

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.0.1-py2.py3-none-any.whl (266 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.8/266.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb

In [None]:
# import manipulation
import numpy as np
import pandas as pd

# import Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.utils.checkpoint import checkpoint
from torch.autograd import Variable

# import wandb
import wandb

import tokenizers

# import Transformer model
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW
from transformers import DataCollatorWithPadding
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout, ContextPooler

# import SKLearn
from sklearn.model_selection import  KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss


# import ...
import string
import random
import os
import joblib
import gc
import copy
import time


# other
from tqdm import tqdm
from collections import defaultdict

#8-bits optimizer
import bitsandbytes as bnb

# from codecarbon import track_emissions

# Disable the tokenizers library's internal parallelism for this process.
# Presumably this avoids its warning when DataLoader worker processes are forked — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [None]:
# Report the versions of the core libraries, one per line.
version_report = [
    f"torch.__version__: {torch.__version__}",
    f"tokenizers.__version__: {tokenizers.__version__}",
    f"transformers.__version__: {transformers.__version__}",
]
print("\n".join(version_report))

torch.__version__: 2.2.1+cu121
tokenizers.__version__: 0.19.1
transformers.__version__: 4.40.1


In [None]:
from google.colab import userdata

# Read the Weights & Biases API key from the Colab secret store.
# Create a Colab secret named `wandb_api` whose value is the token shown at https://wandb.ai/authorize
api_key = userdata.get("wandb_api")

# Authenticate this session with W&B
wandb.login(key=api_key)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
class CFG:
    # Central experiment configuration; the rest of the notebook reads these as `CFG.<name>`.
    seed = 2022  # passed to seed_everything() and to the fold splitter
    max_length = 512  # max_length handed to the tokenizer (inputs are truncated to it)
    epoch = 4  # number of epochs run_training() performs per fold
    train_batch_size = 16
    valid_batch_size = 32  # also used for the test loader

    model_name = "microsoft/deberta-v3-base"  # checkpoint loaded by AutoModel / AutoConfig
    token_name = "microsoft/deberta-v3-base"  # checkpoint loaded by AutoTokenizer

    scheduler = "CosineAnnealingLR"  # selector read by fetch_scheduler()
    learning_rate = 1e-5
    min_lr = 1e-6  # eta_min of the cosine schedules
    T_max = 500  # T_max of CosineAnnealingLR, counted in scheduler.step() calls
    # NOTE(review): fetch_scheduler() reads CFG.T_0 for 'CosineAnnealingWarmRestarts', which is not defined here.
    weight_decay = 0.005
    dropout = 0.1  # start_prob of the MultiSampleDropout head

    num_classes = 2  # output size of the classification layer
    n_fold = 3  # number of cross-validation folds
    n_accumulate = 2  # gradient-accumulation steps (the loss is divided by this in train_one_epoch)
    freezing = True  # freeze the embeddings and the first 6 encoder layers
    gradient_checkpoint = True  # enable gradient checkpointing on the backbone
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    wandb_id = f"PL{round(time.time())}" # ID on WandB
    group = f'{wandb_id}-Baseline'  # W&B group shared by all folds of this run
    competition = "Multi-author"
    _wandb_kernel = "deb"

# The tokenizer is attached to CFG so datasets and collators can share one instance.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.token_name, use_fast=False)
CFG.tokenizer.model_max_length = CFG.max_length
# Displayed as the cell output (False here, because use_fast=False was requested).
CFG.tokenizer.is_fast

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

False

In [None]:
# Display the backbone configuration (hidden size, number of layers, dropout rates, ...) for reference.
AutoConfig.from_pretrained(CFG.model_name)


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.40.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

In [None]:
def seed_everything(seed=42):
    """
    Seed every random number generator the notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices), exports
    PYTHONHASHSEED, and configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Seed every visible GPU explicitly, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN auto-tuning may select different kernels from run to run; keep it off.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=CFG.seed)


In [None]:
def criterion(outputs, labels):
    """
    Cross-entropy loss between the model outputs and the integer class labels.
    """
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, labels)

def get_score(outputs, labels):
    """
    Log loss of the softmax probabilities derived from raw model outputs.

    Args:
        outputs: array-like or tensor of class scores; the softmax is taken over the
            last dimension (the class dimension for (n_samples, n_classes) input).
        labels: ground-truth class labels, passed to sklearn's `log_loss`.

    Returns:
        The log loss as computed by `sklearn.metrics.log_loss`.
    """
    # as_tensor avoids copy-constructing when `outputs` already is a tensor;
    # detach/cpu make the result convertible to NumPy regardless of its origin.
    logits = torch.as_tensor(outputs).detach().cpu()
    # An explicit `dim` replaces the deprecated implicit-dimension softmax.
    probabilities = F.softmax(logits, dim=-1).numpy()
    return log_loss(labels, probabilities)

def freeze(module):
    """
    Turn off gradient tracking for every parameter of `module`.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)

def get_freezed_parameters(module):
    """
    List the names of the parameters of `module` that do not require gradients.
    """
    return [
        param_name
        for param_name, tensor in module.named_parameters()
        if not tensor.requires_grad
    ]

# 8-bits optimizer
def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    Register a bitsandbytes optimizer override on the `weight` of each embedding table
    (`word_embeddings`, `position_embeddings`, `token_type_embeddings`) present on
    `embeddings_path`; missing attributes are skipped.

    Reference: https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    for kind in ("word", "position", "token_type"):
        table_name = f"{kind}_embeddings"
        if not hasattr(embeddings_path, table_name):
            continue

        manager = bnb.optim.GlobalOptimManager.get_instance()
        manager.register_module_override(
            getattr(embeddings_path, table_name), 'weight', {'optim_bits': optim_bits}
        )

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

# Directory that holds the train/eval CSV tables.
# NOTE(review): '/data' is not under the '/content/drive' mount point — confirm this is the intended location.
INPUT_DIR = '/data'

TRAIN_CSV = os.path.join(INPUT_DIR, "train-table-mediumparser.csv")

TEST_CSV = os.path.join(INPUT_DIR, "eval-table-mediumparser.csv")


Mounted at /content/drive


In [None]:
# Load the training table. As displayed below, it has the columns Paragraphs1, Paragraphs2,
# Truth_changes and file_number, plus an 'Unnamed: 0' column
# (presumably a row index written by an earlier to_csv — TODO confirm).
df = pd.read_csv(TRAIN_CSV)

df

Unnamed: 0,Paragraphs1,Paragraphs2,Truth_changes,file_number
0,"In general, be courteous to others. Debate/dis...",My goodness. The poor woman can’t even get her...,1,1
1,My goodness. The poor woman can’t even get her...,"Exactly. If Sicknik had ""recovered"" that day a...",1,1
2,"Exactly. If Sicknik had ""recovered"" that day a...",r/politics is currently accepting new moderato...,1,1
3,r/politics is currently accepting new moderato...,"Essentially, if someone commits a tort, like a...",1,1
4,"Essentially, if someone commits a tort, like a...","I am a bot, and this action was performed auto...",1,1
...,...,...,...,...
51988,"This is genocide, the destruction of a people ...","Either way, we agree that what Russia is doing...",1,999
51989,"Either way, we agree that what Russia is doing...","Apologies for being pedantic here, I agree wit...",0,999
51990,"Apologies for being pedantic here, I agree wit...","Exactly! This is no place for Euphemisms, Russ...",0,999
51991,"Exactly! This is no place for Euphemisms, Russ...",The boy children will be trained for war and t...,1,999


In [None]:
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the offending slice as UTF-8 bytes and resume after it."""
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 and resume after them."""
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end

# Register both handlers under the names that resolve_encodings_and_normalize passes as `errors=`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 encoding artefacts, then transliterate with `unidecode`."""
    escaped = text.encode("raw_unicode_escape")
    as_text = escaped.decode("utf-8", errors="replace_decoding_with_cp1252")
    as_cp1252 = as_text.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = as_cp1252.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)


In [None]:
# Normalise both paragraph columns of the training frame.
for text_column in ('Paragraphs1', 'Paragraphs2'):
    df[text_column] = df[text_column].apply(resolve_encodings_and_normalize)


In [None]:
# Consecutive rows of one file share a paragraph (Paragraphs2 of row i is Paragraphs1 of row i+1),
# so a whole file has to stay inside a single fold. StratifiedKFold ignores the `groups`
# argument; StratifiedGroupKFold keeps each file_number together while balancing Truth_changes.
# Alternatives: KFold (no stratification, no grouping), StratifiedKFold (no grouping).
gkf = StratifiedGroupKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)

# Create the column up front so it stays integer-typed; -1 means "not assigned".
df["kfold"] = -1
for fold, (train_id, val_id) in enumerate(gkf.split(X=df, y=df.Truth_changes, groups=df.file_number)):
    # Every row of the validation split of this iteration belongs to fold `fold`.
    df.loc[val_id, "kfold"] = fold

In [None]:
# Count of each Truth_changes value inside every fold.
df.groupby('kfold')['Truth_changes'].value_counts()

kfold  Truth_changes
0.0    1                10503
       0                 6828
1.0    1                10503
       0                 6828
2.0    1                10502
       0                 6829
Name: count, dtype: int64

In [None]:
class RedditDataset(Dataset):
    """
    Paragraph-pair dataset.

    Each item is a dict with the `input_ids` and `attention_mask` the tokenizer produces for
    (Paragraphs1, Paragraphs2); when `training` is true the row's `Truth_changes` value is
    added under `target`. No padding is applied here.
    """

    def __init__(self, df, max_length, tokenizer, training=True):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.training = training
        self.paragraphs1 = df['Paragraphs1'].values
        self.paragraphs2 = df['Paragraphs2'].values

        # Labels are only looked up when the frame is expected to carry them.
        if training:
            self.targets = df['Truth_changes'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        encoded = self.tokenizer.encode_plus(
            self.paragraphs1[index],
            self.paragraphs2[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
        )

        item = {field: encoded[field] for field in ('input_ids', 'attention_mask')}
        if self.training:
            item['target'] = self.targets[index]
        return item


In [None]:
# Sanity check: encode a toy sentence pair to inspect the fields the tokenizer returns.
CFG.tokenizer.encode_plus(
            "Hello world",
            "Hello world",
            truncation = True,
            add_special_tokens = True,
            max_length = 200
        )

{'input_ids': [1, 5365, 447, 2, 5365, 447, 2], 'token_type_ids': [0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Dynamic Padding (Collate)
class Collate:
    """
    Dynamic-padding collator.

    Pads `input_ids` (with the tokenizer's pad token id) and `attention_mask` (with 0) to the
    longest sequence of the batch, on the side given by `tokenizer.padding_side`, and returns
    long tensors. When `isTrain` is true the per-sample `target` values are stacked as well.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        id_rows = [sample["input_ids"] for sample in batch]
        mask_rows = [sample["attention_mask"] for sample in batch]

        # Longest token sequence in this batch decides the padded width.
        longest = max(len(row) for row in id_rows)
        pad_on_right = self.tokenizer.padding_side == "right"

        def _pad(row, fill):
            filler = (longest - len(row)) * [fill]
            return row + filler if pad_on_right else filler + row

        output = dict()
        output["input_ids"] = torch.tensor(
            [_pad(row, self.tokenizer.pad_token_id) for row in id_rows], dtype=torch.long
        )
        output["attention_mask"] = torch.tensor(
            [_pad(row, 0) for row in mask_rows], dtype=torch.long
        )
        if self.isTrain:
            output["target"] = torch.tensor(
                [sample["target"] for sample in batch], dtype=torch.long
            )

        return output

# collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
# Module-level collator; prepare_loaders() picks it up by this name.
collate_fn = Collate(CFG.tokenizer)


In [None]:
class MeanPooling(nn.Module):
    """Average the token embeddings over dim 1, counting only positions where the mask is set."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask to the shape of the hidden states so it can weight every feature.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_total = (last_hidden_state * weights).sum(1)
        # The clamp keeps the division defined when a row has no unmasked position.
        token_count = weights.sum(1).clamp(min=1e-9)
        return token_total / token_count

class MeanMaxPooling(nn.Module):
    """
    Concatenate mean pooling and max pooling over the token dimension (dim 1).

    The output has twice the feature size of the input: `[mean, max]` along dim 1.
    Positions whose `attention_mask` entry is 0 are excluded from both statistics;
    pass `attention_mask=None` to pool over every position.
    """

    def __init__(self):
        super(MeanMaxPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        if attention_mask is None:
            mean_pooling_embeddings = torch.mean(last_hidden_state, 1)
            # torch.max(..., dim) returns (values, indices); the values are the pooled features.
            max_pooling_embeddings = torch.max(last_hidden_state, 1).values
        else:
            keep = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).bool()
            weights = keep.to(last_hidden_state.dtype)
            token_count = torch.clamp(weights.sum(1), min=1e-9)
            mean_pooling_embeddings = (last_hidden_state * weights).sum(1) / token_count
            # Masked positions are pushed to the lowest representable value so they never win the max.
            floor = torch.finfo(last_hidden_state.dtype).min
            max_pooling_embeddings = last_hidden_state.masked_fill(~keep, floor).max(1).values

        mean_max_embeddings = torch.cat((mean_pooling_embeddings, max_pooling_embeddings), 1)
        return mean_max_embeddings


class LSTMPooling(nn.Module):
    """
    Run an LSTM over the first-token embedding of each hidden layer and return the last output.

    Args:
        num_layers: number of hidden layers to read (layers 1..num_layers of `all_hidden_states`).
        hidden_size: feature size of each hidden state (the LSTM input size).
        hiddendim_lstm: hidden size of the LSTM, i.e. the feature size of the result.
    """

    def __init__(self, num_layers, hidden_size, hiddendim_lstm):
        super(LSTMPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm
        self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, all_hidden_states):
        # Token 0 of every layer, skipping index 0 of `all_hidden_states`.
        first_tokens = [
            all_hidden_states[layer_i][:, 0]
            for layer_i in range(1, self.num_hidden_layers + 1)
        ]
        # Stack on dim 1 so the layers form the sequence axis of the batch_first LSTM:
        # (batch, num_layers, hidden). Stacking on the last dim followed by `view` would
        # reinterpret memory and mix the layer and feature axes; no squeeze is applied so a
        # batch of one keeps its batch dimension.
        hidden_states = torch.stack(first_tokens, dim=1)
        out, _ = self.lstm(hidden_states, None)
        out = self.dropout(out[:, -1, :])
        return out

class WeightedLayerPooling(nn.Module):
    """
    Weighted average of the hidden states from `layer_start` onwards.

    When `layer_weights` is not supplied, one learnable weight per pooled layer is created
    and initialised to 1.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            layer_weights = nn.Parameter(torch.ones(num_hidden_layers + 1 - layer_start))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        # Stack the per-layer states on a new leading axis and drop the layers before layer_start.
        stacked = torch.stack(list(all_hidden_states), dim=0)[self.layer_start:, :, :, :]
        scale = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(stacked.size())
        return (scale * stacked).sum(dim=0) / self.layer_weights.sum()


In [None]:
class MultiSampleDropout(nn.Module):
    """
    Multi-sample dropout head (https://arxiv.org/abs/1905.09788).

    Applies `num_samples` dropout layers with rates `start_prob + increment * i` to the same
    input, runs the shared `classifier` on each result and averages the outputs.
    """

    def __init__(self, classifier, start_prob=0.2, num_samples=8, increment=0.01):
        super(MultiSampleDropout, self).__init__()
        # nn.ModuleList registers the dropouts as sub-modules, so model.train() / model.eval()
        # reach them. In a plain Python list they would keep their training flag and continue
        # to drop activations during validation and inference.
        self.dropouts = nn.ModuleList(
            [StableDropout(start_prob + (increment * i)) for i in range(num_samples)]
        )
        self.classifier = classifier

    def forward(self, out):
        sampled_outputs = [self.classifier(dropout(out)) for dropout in self.dropouts]
        return torch.mean(torch.stack(sampled_outputs, dim=0), dim=0)


In [None]:
class MultiAuthorModel(nn.Module):
    """
    Paragraph-pair classifier: transformer backbone -> ContextPooler -> multi-sample dropout head.

    The model also owns its optimizer and scheduler: call `set_optimizer_scheduler()` once
    after construction, then read them back with `get_optimizer_scheduler()`.
    """

    def __init__(self, model_name):
        super(MultiAuthorModel, self).__init__()

        # DeBERTa
        self.model = AutoModel.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)

        # gradient checkpointing
        if CFG.gradient_checkpoint:
            self.model.gradient_checkpointing_enable()
            print(f"Gradient Checkpointing: {self.model.is_gradient_checkpointing}")

        # freezing embeddings and first 6 layers of encoder
        if CFG.freezing:
            freeze(self.model.embeddings)
            freeze(self.model.encoder.layer[:6])

        # Pooling
        self.context_pooler = ContextPooler(self.config)

        # Multi Sample Dropout
        self.fc = nn.Linear(self.config.hidden_size, CFG.num_classes)
        self.multi_sample_dropout = MultiSampleDropout(self.fc, start_prob=CFG.dropout, num_samples=8, increment=0.01)

    def forward(self, ids, mask):
        """Return the class scores for a batch of token ids and attention masks."""
        out = self.model(input_ids=ids,attention_mask=mask,
                        output_hidden_states=True)

        # Element 0 of the backbone output is the last hidden state.
        out = self.context_pooler(out[0]) # For ContextPooler

        outputs = self.multi_sample_dropout(out)

        return outputs

    def set_optimizer_scheduler(self, option="Adam8bit"):
        """
        Build the optimizer and scheduler and store them on the model.

        Args:
            option: "AdamW" for a single AdamW over all trainable parameters,
                "Adam8bit" for bitsandbytes 8-bit Adam with weight decay disabled on biases
                and LayerNorm weights; any other value builds two AdamW optimizers
                (backbone and classification layer) with separate learning rates.

        Returns:
            True once `self.optimizer` and `self.scheduler` are set.
        """
        if option == "AdamW":
            trainable_parameters = [p for p in self.parameters() if p.requires_grad]

            # Optimizer and scheduler
            optimizer = AdamW(trainable_parameters, lr=CFG.learning_rate, weight_decay = CFG.weight_decay)
            scheduler = fetch_scheduler(optimizer)
        elif option == "Adam8bit":
            # Adam 8-bits optimizer
            no_decay = ["bias", "LayerNorm.weight"]
            decay_parameters, no_decay_parameters = [], []
            for name, parameter in self.named_parameters():
                # Read the flag on the parameter itself; indexing the tensor (`p[1]`) would
                # raise for parameters whose first dimension is smaller than 2.
                if not parameter.requires_grad:
                    continue
                if any(nd in name for nd in no_decay):
                    no_decay_parameters.append(parameter)
                else:
                    decay_parameters.append(parameter)

            optimizer_grouped_parameters = [
                {"params": decay_parameters, "weight_decay": CFG.weight_decay},
                {"params": no_decay_parameters, "weight_decay": 0.0},
            ]

            # initializing optimizer
            optimizer = bnb.optim.Adam8bit(optimizer_grouped_parameters, lr=CFG.learning_rate)
            print(f"8-bit Optimizer:\n\n{optimizer}")

            # setting embeddings parameters
            # set_embedding_parameters_bits(embeddings_path=self.model.embeddings)

            scheduler = fetch_scheduler(optimizer)
        else:
            # NOTE(review): this branch stores lists of optimizers/schedulers, while
            # train_one_epoch() calls `.step()` on a single object; the context_pooler
            # parameters are also not covered by either optimizer.
            backbone_parameters = filter(lambda parameter: parameter.requires_grad, self.model.parameters())

            optimizer_model = AdamW(backbone_parameters, lr=5e-6, weight_decay = CFG.weight_decay)
            # Use this instance's classifier rather than whatever global `model` happens to exist.
            optimizer_linear = AdamW(self.fc.parameters(), lr=1e-4, weight_decay = CFG.weight_decay)

            scheduler_model = fetch_scheduler(optimizer_model)
            scheduler_linear = fetch_scheduler(optimizer_linear)

            optimizer = [optimizer_model, optimizer_linear]
            scheduler = [scheduler_model, scheduler_linear]

        self.optimizer = optimizer
        self.scheduler = scheduler
        return True

    def get_optimizer_scheduler(self):
        """Return the `(optimizer, scheduler)` pair created by `set_optimizer_scheduler()`."""
        return self.optimizer, self.scheduler


In [None]:
class LabelSmoothing(nn.Module):
    """
    Label-smoothing loss built on a summed KL divergence.

    The target distribution puts `1 - smoothing` on the true class and spreads `smoothing`
    over `size - 2` entries; the `padding_idx` column and every row whose target equals
    `padding_idx` are zeroed.

    NOTE(review): nn.KLDivLoss presumably expects `x` to hold log-probabilities — confirm at
    the call site. The `size - 2` divisor is zero when `size == 2`.
    This class is not used by the training loop in this notebook (its only use is commented out).
    """
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        # Last target distribution built by forward(); kept for inspection.
        self.true_dist = None

    def forward(self, x, target):
        # x must have one column per class.
        assert x.size(1) == self.size
        # Start from a tensor shaped like x and fill it with the smoothed mass per entry.
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        # Put the confidence mass on the true class of every row.
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        # The padding class never receives probability mass.
        true_dist[:, self.padding_idx] = 0
        # Rows whose target is the padding index contribute nothing to the loss.
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))

# criterion = LabelSmoothing(size=3, padding_idx=CFG.tokenizer.pad_token_id, smoothing=0.1)


In [None]:
def criterion(outputs, labels):
    """Cross-entropy loss between the model outputs and the integer class labels."""
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, labels)


In [None]:
def train_one_epoch(model, dataloader, device, epoch):
    """
    Train `model` for one pass over `dataloader` with gradient accumulation.

    The optimizer and scheduler are taken from `model.get_optimizer_scheduler()`. The optimizer
    steps every `CFG.n_accumulate` batches and once more on the final batch so that no
    accumulated gradient is left behind. Running losses are logged to W&B once per batch.

    Args:
        model: model exposing `get_optimizer_scheduler()`.
        dataloader: yields dicts with `input_ids`, `attention_mask` and `target` tensors.
        device: device the batch tensors are moved to.
        epoch: epoch number, shown in the progress bar only.

    Returns:
        The sample-weighted mean training loss of the epoch (0 for an empty loader).
    """
    model.train()
    dataset_size = 0
    running_loss = 0
    # Placeholder that is never accumulated; kept so the 'Train Type Loss' series stays in W&B.
    type_running_loss = 0
    effect_running_loss = 0
    epoch_loss = 0

    optimizer, scheduler = model.get_optimizer_scheduler()
    # Start from clean gradients in case a previous epoch or fold left some behind.
    optimizer.zero_grad()

    num_batches = len(dataloader)
    bar = tqdm(enumerate(dataloader), total=num_batches)
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)

        targets = data['target'].to(device, dtype = torch.long)

        batch_size = ids.size(0)
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        # Scale so the accumulated gradient matches the mean over the accumulation window.
        loss = loss/CFG.n_accumulate
        loss.backward()

        is_last_batch = (step + 1) == num_batches
        if (step + 1) % CFG.n_accumulate == 0 or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()
            # fetch_scheduler() may return None when no scheduler is configured.
            if scheduler is not None:
                scheduler.step()

        # Undo the accumulation scaling for the reported loss.
        running_loss += (loss.item()*batch_size) * CFG.n_accumulate
        effect_running_loss += (loss.item()*batch_size)

        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size
        type_epoch_loss = type_running_loss / dataset_size
        effect_epoch_loss = effect_running_loss / dataset_size

        # A single call per batch keeps the W&B step in sync with the batch counter.
        wandb.log({
            'Train Combine Loss': epoch_loss,
            'Train Type Loss': type_epoch_loss,
            'Train Effect Loss': effect_epoch_loss,
            'Train Effect Loss1': loss.item(),
        })

        bar.set_postfix(Epoch = epoch, Train_loss = epoch_loss, Effect_loss = loss.item(), LR=optimizer.param_groups[0]['lr'])
    gc.collect()
    return epoch_loss


In [None]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """
    Evaluate `model` on `dataloader` without tracking gradients.

    Args:
        model: model called as `model(ids, mask)`.
        dataloader: yields dicts with `input_ids`, `attention_mask` and `target` tensors.
        device: device the batch tensors are moved to.
        epoch: epoch number, shown in the progress bar only.

    Returns:
        The sample-weighted mean validation loss (0 for an empty loader).
    """
    model.eval()

    dataset_size = 0
    running_loss = 0
    # Defined up front so an empty loader returns 0 instead of raising, as in train_one_epoch.
    epoch_loss = 0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype = torch.long)

        batch_size = ids.size(0)
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)

        running_loss += (loss.item()*batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size
        bar.set_postfix(Epoch = epoch, Valid_loss = epoch_loss)
    gc.collect()
    return epoch_loss


In [None]:
def prepare_loaders(fold):
    """
    Build the training and validation DataLoaders for one fold.

    Rows whose `kfold` equals `fold` form the validation set; all other rows are used for
    training. Reads the module-level `df` and `collate_fn`.
    """
    df_train = df.loc[df['kfold'] != fold].reset_index(drop=True)
    df_valid = df.loc[df['kfold'] == fold].reset_index(drop=True)

    dataset_options = dict(tokenizer=CFG.tokenizer, max_length=CFG.max_length)
    train_dataset = RedditDataset(df_train, **dataset_options)
    valid_dataset = RedditDataset(df_valid, **dataset_options)

    # Settings shared by both loaders.
    shared_options = dict(collate_fn=collate_fn, num_workers=2, pin_memory=True)
    train_loader = DataLoader(train_dataset, batch_size=CFG.train_batch_size,
                              shuffle=True, drop_last=True, **shared_options)
    valid_loader = DataLoader(valid_dataset, batch_size=CFG.valid_batch_size,
                              shuffle=False, **shared_options)

    return train_loader, valid_loader


In [None]:
def fetch_scheduler(optimizer):
    """
    Create the learning-rate scheduler selected by `CFG.scheduler`.

    Args:
        optimizer: optimizer the scheduler is attached to.

    Returns:
        A `CosineAnnealingLR` or `CosineAnnealingWarmRestarts` instance, or None when
        `CFG.scheduler` is None.

    Raises:
        ValueError: if `CFG.scheduler` names an unsupported scheduler.
    """
    name = CFG.scheduler
    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr)
    if name == 'CosineAnnealingWarmRestarts':
        # CFG may not define T_0; fall back to T_max as the restart period in that case.
        restart_period = getattr(CFG, 'T_0', CFG.T_max)
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=restart_period, eta_min=CFG.min_lr)
    raise ValueError(f"Unsupported CFG.scheduler: {name!r}")


In [None]:
def run_training(model, device, num_epochs, fold, train_loader, valid_loader):
    """
    Train and validate `model` for `num_epochs`, keeping the weights with the lowest validation loss.

    Each time the validation loss improves (or ties), the state dict is saved to
    `weights/DeBERTaLossFold-{fold}.bin` and recorded as 'Best Loss' on the active W&B run.

    Args:
        model: model exposing `get_optimizer_scheduler()`.
        device: device used for training and validation.
        num_epochs: number of epochs to run.
        fold: fold index, used in the checkpoint file name.
        train_loader: training DataLoader.
        valid_loader: validation DataLoader.

    Returns:
        `(model, history)`: the model with the best weights loaded, and a dict with the
        per-epoch 'Train Loss' and 'Eval Loss' lists.
    """
    wandb.watch(model, log_freq = 100)

    # torch.save does not create missing directories.
    weights_dir = 'weights'
    os.makedirs(weights_dir, exist_ok=True)

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)
    for epoch in range(1, num_epochs + 1):
        gc.collect()
        train_epoch_loss = train_one_epoch(model, train_loader, device, epoch)
        valid_epoch_loss = valid_one_epoch(model, valid_loader, device, epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Eval Loss'].append(valid_epoch_loss)

        # One call so both values share the same W&B step.
        wandb.log({'Train Loss': train_epoch_loss, 'Eval Loss': valid_epoch_loss})

        if valid_epoch_loss <= best_epoch_loss:
            print(f"Valid Loss Improved: {best_epoch_loss} -------> {valid_epoch_loss}")
            best_epoch_loss = valid_epoch_loss
            # Use the active W&B run instead of relying on a global `run` variable.
            if wandb.run is not None:
                wandb.run.summary['Best Loss'] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            path = os.path.join(weights_dir, 'DeBERTa' + f'LossFold-{fold}.bin')
            torch.save(model.state_dict(), path)
            print('Model Saved')

    end = time.time()
    time_eclipsed = end-start
    print(f'Time complete in: {time_eclipsed//3600}h:{(time_eclipsed%3600)//60}m:{time_eclipsed%60}s')
    print(f'Best Loss: {best_epoch_loss}')

    # Hand back the best weights rather than those of the last epoch.
    model.load_state_dict(best_model_wts)

    return model, history


In [None]:
transformers.logging.set_verbosity_error()
for fold in range(CFG.n_fold):
    print(f'================ Fold: {fold} =================')

    # Log only the plain hyper-parameters: skip the class dunder entries and the tokenizer object.
    cfg = {
        key: value
        for key, value in vars(CFG).items()
        if not key.startswith('__') and key != 'tokenizer'
    }
    run = wandb.init(
        project = 'Multi-author',
        config = cfg,
        job_type = 'Train',
        group = CFG.group,
        tags = [CFG.model_name, CFG.wandb_id],
        name = f'{CFG.wandb_id}-Fold-{fold}',
        anonymous='must'
    )

    try:
        train_loader, valid_loader = prepare_loaders(fold)
        model = MultiAuthorModel(CFG.model_name)
        model.to(CFG.device)

        if torch.cuda.is_available():
            print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

        model.set_optimizer_scheduler("Adam8bit")

        model, history = run_training(model, CFG.device, CFG.epoch, fold, train_loader, valid_loader)
    finally:
        # Close the W&B run even when the fold is interrupted or fails.
        run.finish()
    gc.collect()


[34m[1mwandb[0m: Currently logged in as: [33malessandro-corona-m[0m ([33malessandro-corona[0m). Use [1m`wandb login --relogin`[0m to force relogin




pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Gradient Checkpointing: True
[INFO] Using GPU: Tesla T4

8-bit Optimizer:

Adam8bit (
Parameter Group 0
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 1e-05
    weight_decay: 0.005

Parameter Group 1
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 1e-05
    weight_decay: 0.0
)


  self.pid = os.fork()
  self.pid = os.fork()
100%|██████████| 2166/2166 [39:47<00:00,  1.10s/it, Effect_loss=0.305, Epoch=1, LR=9.4e-6, Train_loss=0.507]
100%|██████████| 542/542 [05:58<00:00,  1.51it/s, Epoch=1, Valid_loss=0.451]


Valid Loss Improved: inf -------> 0.45101842585303875
Model Saved


100%|██████████| 2166/2166 [39:50<00:00,  1.10s/it, Effect_loss=0.214, Epoch=2, LR=7.77e-6, Train_loss=0.443]
100%|██████████| 542/542 [05:58<00:00,  1.51it/s, Epoch=2, Valid_loss=0.424]


Valid Loss Improved: 0.45101842585303875 -------> 0.4240165781998179
Model Saved


100%|██████████| 2166/2166 [39:52<00:00,  1.10s/it, Effect_loss=0.0727, Epoch=3, LR=5.53e-6, Train_loss=0.41]
100%|██████████| 542/542 [05:59<00:00,  1.51it/s, Epoch=3, Valid_loss=0.425]
100%|██████████| 2166/2166 [39:49<00:00,  1.10s/it, Effect_loss=0.109, Epoch=4, LR=3.28e-6, Train_loss=0.381]
100%|██████████| 542/542 [05:59<00:00,  1.51it/s, Epoch=4, Valid_loss=0.415]


Valid Loss Improved: 0.4240165781998179 -------> 0.41476610681408244
Model Saved
Time complete in: 3.0h:3.0m:41.671417474746704s
Best Loss: 0.41476610681408244


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Eval Loss,█▃▃▁
Train Combine Loss,█▇▆▅▅▅▅▅▄▄▁▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
Train Effect Loss,█▇▆▅▅▅▅▅▄▄▁▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
Train Effect Loss1,█▆▄▆▄▄▂▅▅▄▂▆▅▄▃▃▅▆▅▅▂▃▃▃▄▅▅▃▅▄▃▁▅▆▂▂▃▃▆▃
Train Loss,█▄▃▁
Train Type Loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Best Loss,0.41477
Eval Loss,0.41477
Train Combine Loss,0.38094
Train Effect Loss,0.19047
Train Effect Loss1,0.10861
Train Loss,0.38094
Train Type Loss,0.0




Gradient Checkpointing: True
[INFO] Using GPU: Tesla T4

8-bit Optimizer:

Adam8bit (
Parameter Group 0
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 1e-05
    weight_decay: 0.005

Parameter Group 1
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 1e-05
    weight_decay: 0.0
)


 97%|█████████▋| 2102/2166 [38:59<01:11,  1.12s/it, Effect_loss=0.212, Epoch=1, LR=9.77e-6, Train_loss=0.514]

In [None]:
import warnings,transformers,logging,torch

# From here on, hide Python warnings and every log record at WARNING level or below.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)


In [None]:
test_df = pd.read_csv(TEST_CSV)

# Apply the same text normalisation as the training frame, so the model sees text that was
# preprocessed the same way at training and at inference time.
for text_column in ('Paragraphs1', 'Paragraphs2'):
    test_df[text_column] = test_df[text_column].apply(resolve_encodings_and_normalize)

test_df.head()


In [None]:
class RedditTestDataset(Dataset):
    """
    Unlabelled paragraph-pair dataset for inference.

    Each item is a dict with the `input_ids` and `attention_mask` the tokenizer produces for
    (Paragraphs1, Paragraphs2). No padding is applied here.
    """

    def __init__(self,df, max_length, tokenizer):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.paragraphs1 = self.df['Paragraphs1'].values
        self.paragraphs2 = self.df['Paragraphs2'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Read the arrays that __init__ actually creates.
        paragraphs1 = self.paragraphs1[index]
        paragraphs2 = self.paragraphs2[index]

        inputs = self.tokenizer.encode_plus(
            paragraphs1,
            paragraphs2,
            truncation = True,
            add_special_tokens = True,
            max_length = self.max_len
        )

        samples = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }

        return samples


In [None]:
# Rebinds the module-level `collate_fn` (previously a Collate instance) to the Hugging Face padding collator.
collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)

# Softmax over dim 1.
softmax = nn.Softmax(dim=1)
# Fresh model instance; the per-fold weights are loaded into it further down.
model = MultiAuthorModel(CFG.model_name)

In [None]:
def prepare_test_loader(test_df):
    """
    Wrap `test_df` in an unlabelled RedditDataset and return a DataLoader that keeps the row
    order. Reads the module-level `collate_fn`.
    """
    test_dataset = RedditDataset(
        test_df,
        max_length=CFG.max_length,
        tokenizer=CFG.tokenizer,
        training=False,
    )

    loader_options = dict(
        batch_size=CFG.valid_batch_size,
        collate_fn=collate_fn,
        num_workers=2,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    return DataLoader(test_dataset, **loader_options)

test_loader = prepare_test_loader(test_df)


In [None]:
@torch.no_grad()
def inference(test_loader, model, device):
    """
    Run `model` over `test_loader` and return the class probabilities.

    Args:
        test_loader: yields batches with `input_ids` and `attention_mask`.
        model: model called as `model(ids, mask)`; it is put in eval mode and moved to `device`.
        device: device used for the forward passes.

    Returns:
        A NumPy array with one row of softmax probabilities (over dim 1) per sample,
        in loader order.
    """
    preds = []
    model.eval()
    model.to(device)

    bar = tqdm(enumerate(test_loader), total=len(test_loader))

    for step, data in bar:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)

        output = model(ids, mask)
        # Softmax is computed here directly instead of through a notebook-level `softmax`
        # object, and the tensor is moved with detach/cpu rather than re-wrapped in torch.tensor.
        y_preds = torch.softmax(output.detach().cpu(), dim=1).numpy()

        preds.append(y_preds)

    predictions = np.concatenate(preds)
    return predictions

In [None]:
deberta_predictions = []

for fold in range(CFG.n_fold):
    print("Fold {}".format(fold))
    # NOTE(review): run_training() saves checkpoints as 'weights/DeBERTaLossFold-{fold}.bin';
    # confirm that these 'Hard-LossFold' files are the checkpoints intended for evaluation.
    path = 'data/weights/' + f'Hard-LossFold-{fold}.bin'
    # map_location lets checkpoints written on a GPU load on a CPU-only runtime as well;
    # inference() moves the model to CFG.device afterwards.
    state = torch.load(path, map_location='cpu')
    model.load_state_dict(state)

    prediction = inference(test_loader, model, CFG.device)
    deberta_predictions.append(prediction)
    del state, prediction; gc.collect()
    torch.cuda.empty_cache()

del model


In [None]:
# Inspect the raw per-fold probability arrays (one array per fold).
deberta_predictions

In [None]:
# Ensemble: average the class probabilities over the folds (axis 0 indexes the folds).
predictions = np.mean(deberta_predictions, axis=0)
predictions.shape

In [None]:
# Store the two class probabilities and derive the hard label (ties go to class 1).
for class_index in (0, 1):
    test_df[f'pred_{class_index}'] = predictions[:, class_index]
test_df['prediction'] = (test_df['pred_0'] <= test_df['pred_1']).astype('int64')

In [None]:
# Show the test table together with the pred_0 / pred_1 / prediction columns added above.
test_df

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ground-truth and predicted labels of the evaluation set
y_true = test_df['Truth_changes'].values
y_pred = test_df['prediction'].values

# Headline classification metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

metric_report = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)


In [None]:
# Split the evaluation rows into wrong and correct predictions.
filtered_df = test_df[test_df['Truth_changes'] != test_df['prediction']]
filtered_df_goods = test_df[test_df['Truth_changes'] == test_df['prediction']]

# For every outcome, collect the probability the model assigned to the class it predicted.
fn = filtered_df.loc[filtered_df['prediction'] == 0, 'pred_0']
fp = filtered_df.loc[filtered_df['prediction'] == 1, 'pred_1']
tn = filtered_df_goods.loc[filtered_df_goods['prediction'] == 0, 'pred_0']
tp = filtered_df_goods.loc[filtered_df_goods['prediction'] == 1, 'pred_1']

vecs = {"False negatives": fn, "False positives": fp, "True negatives": tn, "True positives": tp}

print("Analysis")
print()

# Print summary statistics of the predicted-class probability for each outcome.
for key, vec in vecs.items():
    print(key)
    print()
    print("Qty:", len(vec))
    if len(vec) == 0:
        # np.min / np.max raise on an empty selection; there is nothing to summarise.
        print()
        continue
    mean = np.mean(vec)
    median = np.median(vec)
    std_dev = np.std(vec)
    variance = np.var(vec)
    minimum = np.min(vec)
    maximum = np.max(vec)
    print("Mean:", mean)
    print("Median:", median)
    print("Standard Deviation:", std_dev)
    print("Variance:", variance)
    print("Minimum:", minimum)
    print("Maximum:", maximum)
    print()