# Let's build a cross encoder - Step by step guide

We'll use it for the STS task. We'll use the pretrained BERT model for transfer learning on this new semantic sim task.

In [1]:
!pip install -q transformers pytorch_lightning
!wget -q https://raw.githubusercontent.com/mircea007/petidate/main/test_data.csv
!wget -q https://raw.githubusercontent.com/mircea007/petidate/main/train_data.csv
!wget -q https://raw.githubusercontent.com/dumitrescustefan/RO-STS/master/dataset/text-similarity/RO-STS.dev.tsv
!wget -q https://raw.githubusercontent.com/dumitrescustefan/RO-STS/master/dataset/text-similarity/RO-STS.test.tsv

import logging, os, sys, json, torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
import pytorch_lightning as pl
from transformers import AutoTokenizer, AutoModel, AutoConfig, Trainer, TrainingArguments
from pytorch_lightning.callbacks import EarlyStopping
import numpy as np

# we'll define or model name here
transformer_model_name = "dumitrescustefan/bert-base-romanian-cased-v1"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.2/812.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import math

import subprocess
import os
import sys

raw_train_data = pd.read_csv( 'train_data.csv' )

good_bad = {
    'offensive': 0, # bad
    'non-offensive': 1, # good
    'direct': 2, # bad
    'descriptive': 3, # bad
    'reporting': 4, # ignore for now
}

sanitized_verdicts = list( raw_train_data['Final Labels'].map(
    lambda label: good_bad[label]
) )

# https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
import re

emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
def sanitize_text( text ):
    word_list = re.split( r'[\ .,;!?]', emoj.sub( r'', text ) )

    #return list( map( lambda word: light_pipeline.fullAnnotate( word.lower() ).result, filter( lambda word: not (len(word) == 0 or word[0] == '@'), word_list ) ) )
    return ' '.join(list( map( lambda word: word.lower(), filter( lambda word: not (len(word) == 0 or word[0] == '@'), word_list ) ) ))
sanitized_text = list( raw_train_data['Text'].map( sanitize_text ) )

ID_CELL = 0
TEXT_CELL = 1
LABEL_CELL = 2


#out = list( zip( test_data['Id'], list( map( lambda text: 'non-offensive' if test_score( text ) else 'offensive', test_data['Text'] ) ) ) )

# out_df = pd.DataFrame( out )
# out_df.to_csv( '/kaggle/working/submission_2.csv', index=False, header=['Id', 'Label'] )


## Data loading

In [8]:
# before writting any code we're going to need our tokenizer:
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name, strip_accents=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/397k [00:00<?, ?B/s]

In [4]:
class MyDataset(Dataset):
    def __init__(self, tokenizer, file):
        self.tokenizer = tokenizer  # we'll need this in the __getitem__ function
        self.instances = []

        raw_data = pd.read_csv( file )
        for i in range( len( raw_data ) ):
            raw_text = raw_data['Text'][i]
            label = raw_data['Final Labels'][i]
            sanitized_text = sanitize_text( raw_text.strip() )
            self.instances.append({
                "sim": good_bad[label],
                "sent": f"[CLS]{sanitized_text}[SEP]"
            })
        '''
        with open(file, "r", encoding="utf8") as f:
            lines = f.readlines()
            for i in range(1, len(lines)):
              line = lines[i]
              parts = line.split(",")
                parts[0] = sanitize_text( parts[0].strip() )
              print(parts)
              self.instances.append({
                  "sim": good_bad[parts[1].strip()],
                  "sent": f"[CLS]{parts[0]}[SEP]{parts[1].strip()}[SEP]"
                  })
        '''

    def __len__(self):
        return len(self.instances)  # return how many instances we have. It's a list after all

    def __getitem__(self, index):
        return self.instances[index]

Let's test it's working. Load a dataset and print the first example.

In [9]:
# create the MyDataset object with the test_data
test_dataset = MyDataset(tokenizer, "train_data.csv")
instance = len(test_dataset)  # this calls our __getitem__(0) method

# now let's print what it contains

Now, we need to collate the instances in a batch.

In [10]:
class MyCollator(object):
    def __init__(self, tokenizer, max_seq_len):
        self.max_seq_len = max_seq_len  # this will be our model's maximum sequence length
        self.tokenizer = tokenizer   # we still need our tokenizer to know that the pad token's id is


    def __call__(self, input_batch):
        sims = []
        sents = []

        for instance in input_batch:
          sims.append(instance['sim'])
          sents.append(instance['sent'])

        tokenized_batch = self.tokenizer(sents, padding=True, max_length = self.max_seq_len, truncation=True, return_tensors="pt")
        sims = torch.tensor(sims, dtype=torch.float)

        return {
            "tokenized_batch": tokenized_batch,
            "sim": sims
        }

In [11]:
# let's test our collator
test_dataset = MyDataset(tokenizer, "train_data.csv")
my_collator = MyCollator(tokenizer=tokenizer, max_seq_len=64)

# crete a dataloader and get first batch of 3
test_dataloader = DataLoader(test_dataset, batch_size=3, collate_fn=my_collator)

iterable_data = iter(test_dataloader)
first_batch = next(iterable_data) # this is the output_batch from above
for key in first_batch:
  print(f"{key} is a {first_batch[key]}")

tokenized_batch is a {'input_ids': tensor([[    2,     2,   990,   411,   592,   474,  1488,   552,  2488,   393,
          2524,  9143,     3,     3],
        [    2,     2,   509, 21862,   416,  1062,     3,     3,     0,     0,
             0,     0,     0,     0],
        [    2,     2,   398, 10973,  1616, 14638,   761,   734,  1384, 38884,
           209,     3,     3,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}
sim is a tensor([1., 1., 1.])


## Model preparation

We're finally here :)

As we're using Pytorch Lightning to do the behind-the-scenes training, we do need to define a few functions:

* ``__init__``, ``forward``
* ``training_step``
* ``validation_step``
* ``configure_optimizers``

As this is a single block of code, comments will be inline:


In [17]:
class TransformerModel(pl.LightningModule):
    def __init__(self, model_name, lr=2e-05, model_max_length=512):
        super().__init__()

        print("Loading AutoModel [{}] ...".format(model_name))

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, strip_accents=False)
        self.model = AutoModel.from_pretrained(model_name)
        self.output_layer = torch.nn.Linear(768,1)

        self.loss_fct = torch.nn.MSELoss()

        self.lr = lr

    def forward(self, tokenized_batch):
        # we're just wrapping the code on the AutoModelForTokenClassification
        # it needs the input_ids, attention_mask and labels

        output = self.model(
            input_ids=tokenized_batch['input_ids'],
            attention_mask=tokenized_batch['attention_mask'],
            return_dict=True
        )
        pooler_output = output['pooler_output']  # [batch_size, 768]
        prediction = self.output_layer(pooler_output)  # [batch_size, 1]

        return prediction.flatten()


    def training_step(self, batch, batch_idx):
        tokenized_batch = batch['tokenized_batch']
        sims = batch['sim']

        prediction = self.forward(tokenized_batch)  # [batch_size, 1]

        loss = self.loss_fct(prediction, sims)

        self.log("train_loss", loss.detach().cpu().item(), on_step=True, on_epoch=True, prog_bar=True,)
        return {"loss": loss}


    def validation_step(self, batch, batch_idx):
        tokenized_batch = batch['tokenized_batch']
        sims = batch['sim']

        prediction = self.forward(tokenized_batch)  # [batch_size, seq_len, 768]

        loss = self.loss_fct(prediction, sims)

        self.log("val_loss", loss.detach().cpu().item(), on_step=True, on_epoch=True, prog_bar=True,)
        return {"loss": loss}

    def configure_optimizers(self):
        # we'll use AdamW optimizer
        print('apeleaza')
        return torch.optim.AdamW([p for p in self.parameters() if p.requires_grad], lr=self.lr, eps=1e-08)

## Training phase

At this point we're ready to start training. When the code is ready, switch your colab to GPU, and run every cell up to this point, to have the training run on the GPU. Notice that Pytorch Lightning abstracts all the hassle of training on different devices.

So, what do we need?

We need the model itself (the ``TransformerModel`` object), and the trainer object which receives a few parameters detailed below. The trainer will move the data on GPU automatically, call ``train_step`` and ``train_epoch_end``, then do the same for validation, and then do backprop (internally calls Pytorch's ``.backward()``, ``optimizer_step`` and ``zero_grad`` to update the model weights. It also handles all the gritty stuff like early stopping, logging, model saving, distributed training (if you have more than 1 GPU), etc.


In [19]:
model = TransformerModel(
    model_name=transformer_model_name,
    lr=2e-5,
    model_max_length=512
)

trainer = pl.Trainer(
    devices=-1,  # uncomment this when training on gpus
    accelerator="gpu",  # uncomment this when training on gpus
    max_epochs=-1,  # set this to -1 when training fully
    #limit_train_batches=10,  # comment this out when training fully
    #limit_val_batches=5,  # comment this out when training fully
    gradient_clip_val=1.0,
    enable_checkpointing=True  # this disables saving the model each epoch
)

# instantiate dataloaders
# a batch_size of 8 should work fine on 16GB GPUs
train_data = MyDataset(tokenizer, "train_data.csv")
train_dataloader = DataLoader(train_data[0:30000], batch_size=8, collate_fn=my_collator, shuffle=True, pin_memory=True)
validation_dataloader = DataLoader(train_data[30000:35000], batch_size=8, collate_fn=my_collator, shuffle=False, pin_memory=True)

# call this to start training
trainer.fit(model, train_dataloader, validation_dataloader)

Loading AutoModel [dumitrescustefan/bert-base-romanian-cased-v1] ...


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name         | Type      | Params | Mode 
---------------------------------------------------
0 | model        | BertModel | 124 M  | eval 
1 | output_layer | Linear    | 769    | train
2 | loss_fct     | MSELoss   | 0      | train
---------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.768   Total estimated model params size (MB)


apeleaza


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


## Let's use our model

In [None]:
def predict (model, sent1):
    concatenated_sentences = f"[CLS]{sent1.strip()}"

    tokenized_batch = model.tokenizer([concatenated_sentences], padding=True, max_length = 512, truncation=True, return_tensors="pt")

    predictions = model.forward(tokenized_batch)  # returns a [batch_size, ]

    return predictions[0].item()

### Solution (hidden)

In [None]:
def predict (model, sent1, sent2):
    concatenated_sentences = f"[CLS]{sent1.strip()}[SEP]{sent2.strip()}[SEP]"

    tokenized_batch = model.tokenizer([concatenated_sentences], padding=True, max_length = 512, truncation=True, return_tensors="pt")

    predictions = model.forward(tokenized_batch)  # returns a [batch_size, ]

    return predictions[0].item()*5.  # select the first item and multiply by 5

## Evaluation

In [None]:
# let's test our code
model.eval()

test_data = pd.read_csv( 'test_data.csv' )
train_data = pd.read_csv( 'train_data.csv' )


def lmao (s):
  return int(max(0, min(4, round(predict(model, s), 0))))
answer = ['offensive', 'non-offensive', 'direct', 'descriptive', 'reporting']

out = list( zip( test_data['Id'], list( map( lambda text: answer[lmao(text)], test_data['Text'] ) ) ) )

out_df = pd.DataFrame( out )
out_df.to_csv( 'submission_3.csv', index=False, header=['Id', 'Label'] )