# 0. Imports and setup

In [None]:
!pip install transformers
!pip install seaborn
!pip install wandb -qU
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from tqdm.notebook import tqdm

import torch

import transformers

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Log in to your W&B account (the training loop below reports its metrics there).
# NOTE: wandb is imported in this cell rather than in the imports cell above.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msmt[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# 1. Prepare data

In [None]:
def read_tsv(path):
    """Load a tab-separated file into a DataFrame, using its first column as the index."""
    table = pd.read_csv(path, sep="\t", index_col=0)
    return table


# Labeled "final" splits share one folder and differ only in file name.
final_folder = Path("/content/drive/MyDrive/NLP_project_2023/data/final")

final_train, final_val, final_test = [
    read_tsv(final_folder / file_name)
    for file_name in ("train.tsv", "dev.tsv", "test.tsv")
]

# Additional "swap" training pairs.
swap_train = read_tsv("/content/drive/MyDrive/NLP_project_2023/data/swap/train.tsv")

# Tables from the "unlabeled" folder (train and dev).
unlabeled_train, unlabeled_val = [
    read_tsv(tsv_path)
    for tsv_path in (
        "/content/drive/MyDrive/NLP_project_2023/data/unlabeled/final/train.tsv",
        "/content/drive/MyDrive/NLP_project_2023/data/unlabeled/final/dev.tsv",
    )
]

In [None]:
# Print a schema / null-count summary for every loaded table.

for df in [final_train, final_val, final_test, swap_train, unlabeled_train, unlabeled_val]:
    print("+" * 30)

    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line per table.
    df.info()

++++++++++++++++++++++++++++++
<class 'pandas.core.frame.DataFrame'>
Int64Index: 49401 entries, 1 to 49401
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence1  49401 non-null  object
 1   sentence2  49401 non-null  object
 2   label      49401 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.5+ MB
None
++++++++++++++++++++++++++++++
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 1 to 8000
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence1  8000 non-null   object
 1   sentence2  8000 non-null   object
 2   label      8000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 250.0+ KB
None
++++++++++++++++++++++++++++++
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 1 to 8000
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   se

In [None]:
# Assemble the final train / val / test tables.
# Only the training split is augmented (with the swap pairs); val and test are
# copies of the labeled "final" dev and test tables.

train_df = pd.concat([final_train, swap_train])
val_df = pd.concat([final_val])
test_df = pd.concat([final_test])

for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_name}:", split_df.shape)

Train: (79798, 3)
Val: (8000, 3)
Test: (8000, 3)


In [None]:
# Peek at the first rows of the merged training table (head() defaults to 5 rows).
train_df.head()

Unnamed: 0_level_0,sentence1,sentence2,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"In Paris , in October 1560 , he secretly met t...","In October 1560 , he secretly met with the Eng...",0
2,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1
3,"There are also specific discussions , public p...","There are also public discussions , profile sp...",0
4,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1
5,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1


In [None]:
class PairedSentenceDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sentence pair per item.

    The table must provide the columns ``sentence1``, ``sentence2`` and
    ``label``. Tokenization happens lazily in ``__getitem__``; every sequence is
    truncated or padded to exactly ``max_length`` tokens.
    """

    def __init__(self, table: pd.DataFrame, tokenizer: transformers.PreTrainedTokenizer,
                 max_length: int):
        super().__init__()

        # Keep plain arrays so items are addressed by position rather than by
        # the frame's own index.
        self.first_sentences, self.second_sentences, self.labels = (
            table[column].values for column in ("sentence1", "sentence2", "label")
        )

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs in the dataset."""
        return len(self.first_sentences)

    def __getitem__(self, index: int):
        """Return the tokenized pair at ``index`` together with its label.

        The result is a dict holding ``labels`` (a LongTensor with one element)
        plus every field produced by the tokenizer. Because a single pair is
        encoded with ``return_tensors="pt"``, each tokenizer tensor keeps a
        leading axis of size 1, e.g. ``input_ids`` has shape (1, max_length).
        """
        sentence_pair = (self.first_sentences[index], self.second_sentences[index])
        target = torch.LongTensor([self.labels[index]])

        encoded = self.tokenizer(
            *sentence_pair,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_token_type_ids=True,
            return_tensors="pt",
        )

        item = {"labels": target}
        item.update(encoded)
        return item


def build_tokenizer(model: str):
    """Load the pretrained tokenizer that matches the given model name or path."""
    tokenizer = transformers.AutoTokenizer.from_pretrained(model)
    return tokenizer

In [None]:
# Sanity-check the dataset output shapes for two different max_length values.
tokenizer = build_tokenizer("microsoft/deberta-v3-large")

dataset = PairedSentenceDataset(train_df, tokenizer, 128)
sample = dataset[0]

assert sample["labels"].shape == (1, )
# The previous bare `assert dataset[0]` could never fail (a non-empty dict is
# always truthy); check every tensor the model consumes instead.
for key in ("input_ids", "token_type_ids", "attention_mask"):
    assert sample[key].shape == (1, 128), key

dataset = PairedSentenceDataset(train_df, tokenizer, 20)

assert dataset[0]["input_ids"].shape == (1, 20)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 2. Training loop

In [None]:
def train_model(model: transformers.PreTrainedModel, optimizer: torch.optim.Optimizer,
                train_dataloader: torch.utils.data.DataLoader, val_dataloader: torch.utils.data.DataLoader,
                config: dict, wandb_project: str):
    """Fine-tune a sequence-classification model and log metrics to W&B.

    For each of ``config["epochs"]`` epochs the model is trained on
    ``train_dataloader`` (logging ``train_loss`` after every step) and then
    evaluated on ``val_dataloader`` (logging accuracy, F1, recall and precision).

    Args:
        model: Model whose forward pass accepts the batch fields as keyword
            arguments and returns an output exposing ``"loss"`` and ``"logits"``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_dataloader: Yields dict batches of tensors with a singleton axis
            at dim 1 (as produced by ``PairedSentenceDataset``).
        val_dataloader: Same batch format as ``train_dataloader``.
        config: Experiment settings; only ``config["epochs"]`` is read here, the
            whole dict is stored as the W&B run config.
        wandb_project: Name of the W&B project to log into.

    Relies on the module-level ``device``. The W&B run is always closed, even
    when training is interrupted or raises.
    """

    def to_device(batch):
        # Drop only the per-sample singleton axis (dim 1). A bare squeeze() would
        # also collapse the batch axis whenever a batch holds a single sample.
        return {key: value.squeeze(1).to(device) for key, value in batch.items()}

    wandb.init(
        project=wandb_project,
        config=config
    )

    try:
        for _ in range(config["epochs"]):
            # train
            model.train()

            for batch in tqdm(train_dataloader):
                optimizer.zero_grad()

                outputs = model(**to_device(batch))
                loss = outputs["loss"]

                loss.backward()
                optimizer.step()

                wandb.log({"train_loss": loss.item()})

            # val
            model.eval()

            predicts_batches = []
            ground_truths_batches = []

            # No gradients are needed for evaluation; skipping them saves memory.
            with torch.no_grad():
                for batch in tqdm(val_dataloader):
                    # Labels arrive as (batch, 1); flatten so they line up with
                    # the 1-D predictions.
                    ground_truths_batches.append(batch["labels"].reshape(-1).numpy())

                    outputs = model(**to_device(batch))

                    predicts_batches.append(
                        torch.argmax(outputs["logits"], dim=-1).cpu().numpy()
                    )

            predicts = np.concatenate(predicts_batches)
            ground_truths = np.concatenate(ground_truths_batches)

            wandb.log({
                "accuracy": accuracy_score(ground_truths, predicts),
                "f1": f1_score(ground_truths, predicts),
                "recall": recall_score(ground_truths, predicts),
                "precision": precision_score(ground_truths, predicts)
            })
    finally:
        # Close the run even on KeyboardInterrupt / errors so no dangling run
        # is left behind for a manual wandb.finish().
        wandb.finish()

# 3. Experiment

In [None]:
# Experiment settings; the whole dict is also stored as the W&B run config.
config = dict(
    # Pretrained checkpoint and where artifacts of this experiment belong.
    model="microsoft/deberta-v3-large",
    weights_folder="/content/drive/MyDrive/NLP_project_2023/artifacts/test_exp",
    # Data / schedule.
    batch_size=8,
    epochs=2,
    max_length=128,
    # Optimizer.
    lr=1e-5,
)

In [None]:
tokenizer = build_tokenizer(config["model"])


def _make_loader(table, shuffle):
    """Wrap a table in a PairedSentenceDataset and batch it per the config."""
    paired_dataset = PairedSentenceDataset(table, tokenizer, config["max_length"])
    return torch.utils.data.DataLoader(paired_dataset,
                                       batch_size=config["batch_size"], shuffle=shuffle)


# Only the training data is shuffled; val and test keep their original order.
train_loader = _make_loader(train_df, shuffle=True)
val_loader = _make_loader(val_df, shuffle=False)
test_loader = _make_loader(test_df, shuffle=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Binary classifier head on top of the pretrained encoder, moved to the target device.
model = transformers.DebertaV2ForSequenceClassification.from_pretrained(config["model"], num_labels=2)
model = model.to(device)

# The optimizer is created after the move so it references the on-device parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])

train_model(
    model=model,
    optimizer=optimizer,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    config=config,
    wandb_project="nlp_project_2023",
)

Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

  0%|          | 0/9975 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Safety net: close a W&B run that was left open because training was
# interrupted or crashed.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.132349…

0,1
train_loss,▇▅▅▇▅▇▆▅█▅▅▅██▇▆▅▇▄▅▃▅▂▁▂▂▅▁▂▃▂▁▃▁▁▅▆▃▁▁

0,1
train_loss,0.07791


In [None]:
# Smoke test: push the input ids of one training batch through the model.
batch = next(iter(train_loader))

# Inference only, so skip gradient tracking.
with torch.no_grad():
    # squeeze(1) drops just the per-sample singleton axis, and the ids must
    # live on the same device as the model or the forward pass fails.
    model(batch["input_ids"].squeeze(1).to(device))

In [None]:
# Inspect the collated shape of the batch fetched in the previous cell.
# Each dataset item has input_ids of shape (1, max_length), so this is
# presumably (batch_size, 1, max_length) — confirm against the output.
batch["input_ids"].shape