# 0. Imports and predefines

In [1]:
!pip install transformers
!pip install seaborn
!pip install wandb -qU
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1
Looking in in

In [20]:
from pathlib import Path
from dataclasses import dataclass, asdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from tqdm.notebook import tqdm

import torch

import transformers

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Log in to your W&B account
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# 1. Prepare data

In [9]:
def read_tsv(path):
    return pd.read_csv(path, delimiter="\t", index_col=0)


final_folder = Path("/content/drive/MyDrive/NLP_project_2023/data/final")

final_train = read_tsv(final_folder / "train.tsv")
final_val = read_tsv(final_folder / "dev.tsv")
final_test = read_tsv(final_folder / "test.tsv")


swap_train = read_tsv("/content/drive/MyDrive/NLP_project_2023/data/swap/train.tsv")


unlabeled_train = read_tsv("/content/drive/MyDrive/NLP_project_2023/data/unlabeled/final/train.tsv")
unlabeled_val = read_tsv("/content/drive/MyDrive/NLP_project_2023/data/unlabeled/final/dev.tsv")

In [10]:
# print some info

for df in [final_train, final_val, final_test, swap_train, unlabeled_train, unlabeled_val]:
    print("+" * 30)

    print(df.info())

++++++++++++++++++++++++++++++
<class 'pandas.core.frame.DataFrame'>
Int64Index: 49401 entries, 1 to 49401
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence1  49401 non-null  object
 1   sentence2  49401 non-null  object
 2   label      49401 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.5+ MB
None
++++++++++++++++++++++++++++++
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 1 to 8000
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence1  8000 non-null   object
 1   sentence2  8000 non-null   object
 2   label      8000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 250.0+ KB
None
++++++++++++++++++++++++++++++
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 1 to 8000
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   se

In [11]:
# merge into train, val and test

train_df = pd.concat([final_train, swap_train])

val_df = pd.concat([final_val])

test_df = pd.concat([final_test])


print("Train:", train_df.shape)

print("Val:", val_df.shape)

print("Test:", test_df.shape)

Train: (79798, 3)
Val: (8000, 3)
Test: (8000, 3)


In [12]:
train_df.head(5)

Unnamed: 0_level_0,sentence1,sentence2,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"In Paris , in October 1560 , he secretly met t...","In October 1560 , he secretly met with the Eng...",0
2,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1
3,"There are also specific discussions , public p...","There are also public discussions , profile sp...",0
4,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1
5,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1


In [13]:
class PairedSentenceDataset(torch.utils.data.Dataset):
    def __init__(self, table: pd.DataFrame, tokenizer: transformers.PreTrainedTokenizer,
                 max_length: int):

        super().__init__()

        self.first_sentences = table["sentence1"].values

        self.second_sentences = table["sentence2"].values

        self.labels = table["label"].values


        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.first_sentences)

    def __getitem__(self, index: int):
        first_sentence = self.first_sentences[index]

        second_sentence = self.second_sentences[index]

        label = self.labels[index]

        tokenizer_output = self.tokenizer(first_sentence, second_sentence,
                                return_tensors="pt",
                                return_token_type_ids=True,
                                max_length=self.max_length,
                                padding="max_length",
                                truncation=True)

        return {
            "labels": torch.LongTensor([label]),
            **tokenizer_output
        }


def build_tokenizer(model: str):
    return transformers.AutoTokenizer.from_pretrained(model)

In [14]:
tokenizer = build_tokenizer("microsoft/deberta-v3-large")

dataset = PairedSentenceDataset(train_df, tokenizer, 128)

assert dataset[0]["labels"].shape == (1, )
assert dataset[0]["input_ids"].shape == (1, 128)
assert dataset[0]

dataset = PairedSentenceDataset(train_df, tokenizer, 20)

assert dataset[0]["input_ids"].shape == (1, 20)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 2. Model side

In [27]:
@dataclass
class TrainConfig:
    model: str
    checkpoints_folder: str
    device: torch.device

    batch_size: int
    epochs: int
    max_length: int

    lr: float

In [24]:
def train_model(model: transformers.DebertaModel, optimizer: torch.optim.Optimizer,
                train_dataloader: torch.utils.data.DataLoader, val_dataloader: torch.utils.data.DataLoader,
                config: TrainConfig, wandb_project: str):

    wandb.init(
        project=wandb_project,
        config=config
    )


    for epoch in range(config["epochs"]):

        # train
        model.train()

        for batch in tqdm(train_dataloader):
            optimizer.zero_grad()

            batch = {key: batch[key].squeeze().to(device) for key in batch}
            outputs = model(**batch)

            outputs["loss"].backward()

            wandb.log({"train_loss": outputs["loss"].detach().cpu().numpy()})

            optimizer.step()

        # val

        model.eval()

        predicts_batches = []
        ground_truths_batches = []

        for batch in tqdm(val_dataloader):
            ground_truths_batches.append(batch["labels"])

            batch = {key: batch[key].squeeze().to(device) for key in batch}
            outputs = model(batch)

            predicts_batches.append(torch.argmax(outputs.cpu(), dim=-1).numpy())


        predicts = np.concatenate(predicts_batches)
        ground_truths = np.concatenate(ground_truths_batches)

        wandb.log({
            "accuracy": accuracy_score(ground_truths, predicts),
            "f1": f1_score(ground_truths, predicts),
            "recall": recall_score(ground_truths, predicts),
            "precision": precision_score(ground_truths, predicts)
        })


    wandb.finish()


class Trainer:
    checkpoint_field_model: str = "model"
    checkpoint_field_optimizer: str = "optimizer"
    checkpoint_field_epoch: int = "epoch"

    checkpoint_last_name: str = "last.tar"

    # TODO add logging of the best
    checkpoint_best_name: str = "best.tar"

    def __init__(self, model: transformers.DebertaModel, optimizer: torch.optim.Optimizer,
                 device: torch.device) -> None:
        self.model = model
        self.optimizer = optimizer

    def train(self, train_dataloader: torch.utils.data.DataLoader,
              val_dataloader: torch.utils.data.DataLoader,
              config: TrainConfig, wandb_project: str) -> None:

        wandb.init(
            project=wandb_project,
            config=asdict(config)
        )

        model.to(config.device)

        self.load_checkpoint(config.checkpoints_folder)

        for epoch in range(config.epochs):
            self.train_step(train_dataloader)

            self.evaluation_step(val_dataloader)

            self.save_checkpoint(config.checkpoints_folder)

        wandb.finish()

    def make_inferece(self, dataloader: torch.utils.data.DataLoader,
                return_labels: bool = True):
        self.model.eval()

        with torch.no_grad():
            predicts = []
            labels = []

            for batch in dataloader:
                labels.append(batch["labels"].numpy())
                
                batch = Trainer._move_dict_items_to_device(batch, model.device)

                outputs = self.model(batch).cpu().numpy()

                predicts.append(outputs["logits"])


        return np.concatenate(logits), np.concatenate(labels)

    @classmethod
    def _move_dict_items_to_device(target_dict: dict, device: str)
        return {key: target_dict[key].squeeze().to(device) for key in target_dict}

    def evaluation_step(self, dataloader: torch.utils.data.DataLoader,
                        return_labels: bool = True):
        
        
        

    def load_checkpoint(self, folder: str) -> int:
        checkpoint = torch.load(Path(folder) / Trainer.checkpoint_last_name)

        self.model.load_state_dict(checkpoint[Trainer.checkpoint_field_model])
        self.optimizer.load_state_dict(checkpoint[Trainer.checkpoint_field_optimizer])

        return checkpoint[Trainer.checkpoint_field_epoch]

    def save_checkpoint(self, folder: str, epoch: int) -> None:
        checkpoint_name = Path(folder) / Trainer.checkpoint_last_name

        torch.save(
            {
                Trainer.checkpoint_field_model: self.model.state_dict(),
                Trainer.checkpoint_field_optimizer: self.optimizer.state_dict(),
                Trainer.checkpoint_field_epoch: epoch
            }
        )





# def save_checkpoint(model: torch.nn.Module, optimizer: torch.nn.optimizer)

SyntaxError: ignored

# 3. Experiment

In [17]:
config = {
    # "model": "microsoft/deberta-v3-large",
    "model": "microsoft/deberta-v3-small",
    "weights_folder": "/content/drive/MyDrive/NLP_project_2023/artifacts/test_exp",

    "batch_size": 8,
    "epochs": 2,
    "max_length": 128,

    "lr": 1e-5,
}

In [18]:
tokenizer = build_tokenizer(config["model"])

train_loader = torch.utils.data.DataLoader(PairedSentenceDataset(train_df[:100], tokenizer, config["max_length"]),
                                           batch_size=config["batch_size"], shuffle=True)

val_loader = torch.utils.data.DataLoader(PairedSentenceDataset(val_df[:100], tokenizer, config["max_length"]),
                                         batch_size=config["batch_size"], shuffle=False)

test_loader = torch.utils.data.DataLoader(PairedSentenceDataset(test_df[:100], tokenizer, config["max_length"]),
                                          batch_size=config["batch_size"], shuffle=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
model = transformers.DebertaV2ForSequenceClassification.from_pretrained(config["model"], num_labels=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])


train_model(model, optimizer, train_loader, val_loader, config, "nlp_project_2023")

Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

  0%|          | 0/9975 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# in case of some problems
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.132349…

0,1
train_loss,▇▅▅▇▅▇▆▅█▅▅▅██▇▆▅▇▄▅▃▅▂▁▂▂▅▁▂▃▂▁▃▁▁▅▆▃▁▁

0,1
train_loss,0.07791


In [None]:
for batch in train_loader:
    model(batch["input_ids"].squeeze())

    break

In [None]:
batch["input_ids"].shape