# Let's build a cross encoder - Step by step guide

We'll use it for the STS task. We'll use the pretrained BERT model for transfer learning on this new semantic sim task.

In [None]:
!pip install -q transformers pytorch_lightning
!wget -q https://raw.githubusercontent.com/dumitrescustefan/RO-STS/master/dataset/text-similarity/RO-STS.train.tsv
!wget -q https://raw.githubusercontent.com/dumitrescustefan/RO-STS/master/dataset/text-similarity/RO-STS.dev.tsv
!wget -q https://raw.githubusercontent.com/dumitrescustefan/RO-STS/master/dataset/text-similarity/RO-STS.test.tsv



# we'll define or model name here
transformer_model_name = "dumitrescustefan/bert-base-romanian-cased-v1"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.6/801.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m968.6 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

## Data loading

In [None]:
# Before writing any other code we need the tokenizer that matches our pretrained model.
# strip_accents=False asks the tokenizer not to strip accents from the input text.
tokenizer = AutoTokenizer.from_pretrained(
    transformer_model_name,
    strip_accents=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/397k [00:00<?, ?B/s]

In [None]:
class MyDataset(Dataset):
    """In-memory dataset of sentence pairs read from a tab-separated file.

    Every non-blank line is expected to hold at least three tab-separated
    fields: a similarity score, the first sentence and the second sentence.
    The score is divided by 5 and the two (stripped) sentences are joined
    into a single ``[CLS]s1[SEP]s2[SEP]`` string.
    """

    def __init__(self, tokenizer, file):
        """Read ``file`` and build the list of instances.

        :param tokenizer: stored on the instance; it is not used while loading.
        :param file: path to a UTF-8 encoded tab-separated file.
        :raises ValueError: if the first field of a non-blank line is not a number.
        :raises IndexError: if a non-blank line has fewer than three fields.
        """
        self.tokenizer = tokenizer
        self.instances = []
        with open(file, "r", encoding="utf8") as f:
            # iterate over the file directly instead of loading every line up front
            for line in f:
                if not line.strip():
                    # tolerate blank lines (e.g. an empty line at the end of the file)
                    continue
                parts = line.split("\t")
                # NOTE(review): the special tokens are written literally into the text;
                # if the tokenizer is later called with its default settings it may add
                # [CLS]/[SEP] a second time -- confirm this is intended.
                self.instances.append({
                    "sim": float(parts[0]) / 5.,
                    "sent": f"[CLS]{parts[1].strip()}[SEP]{parts[2].strip()}[SEP]",
                })

    def __len__(self):
        """Return the number of loaded instances."""
        return len(self.instances)

    def __getitem__(self, index):
        """Return the instance dict (keys ``"sim"`` and ``"sent"``) at ``index``."""
        return self.instances[index]

Let's test that it's working: load a dataset and print the first example.

In [None]:
# Build a MyDataset from the test split and fetch its first example
test_dataset = MyDataset(tokenizer, "RO-STS.test.tsv")
instance = test_dataset[0]  # indexing goes through MyDataset.__getitem__

# Show every field stored for that example
for key, value in instance.items():
  print(f"{key}: {value}")

0.5
0.72
1.0
0.8400000000000001
0.3
0.36
0.7
0.44000000000000006
0.44000000000000006
0.3428
0.3428
1.0
0.12
0.8800000000000001
0.4
0.36
0.8800000000000001
0.72
0.72
0.24
0.48
0.04
0.8400000000000001
0.8800000000000001
0.45
0.4
0.15
0.44000000000000006
0.16
0.44000000000000006
0.64
0.96
0.27999999999999997
0.85
0.6799999999999999
0.1066
0.08
0.24
1.0
0.1076
0.75
0.6
0.72
0.1
0.3
0.16
0.16
0.12
0.8800000000000001
0.35
0.08
0.27999999999999997
0.08
0.16
0.4
0.026600000000000002
0.8
0.0534
0.6799999999999999
0.24
1.0
0.0
0.76
0.15
0.6799999999999999
0.0
0.04
0.8
0.1
0.76
0.48
0.95
0.0
0.75
0.52
0.0
0.15
0.0
0.76
0.5599999999999999
0.0
0.16
0.6
0.2
0.0
0.2
0.6799999999999999
1.0
0.4666
0.27999999999999997
0.15
0.7076
0.16
0.7
0.4
0.95
0.8
0.9
0.3
0.48
0.0
0.08
0.5
0.52
0.72
0.6
0.0
0.32
0.25
0.72
0.4
0.72
0.5599999999999999
0.8800000000000001
0.0
0.0
0.75
0.5
0.0
0.16
0.2
0.0
0.05
0.08
0.0
1.0
0.06
0.55
1.0
0.85
0.6799999999999999
0.32
0.52
1.0
0.64
0.96
1.0
0.48
0.4
1.0
1.0
0.6799999999999

Now, we need to collate the instances in a batch.

In [None]:
class MyCollator(object):
    """Callable that merges a list of dataset instances into a single batch."""

    def __init__(self, tokenizer, max_seq_len):
        """Remember the tokenizer and the length limit used when tokenizing.

        :param tokenizer: callable applied to the list of sentences of a batch.
        :param max_seq_len: forwarded to the tokenizer as ``max_length``.
        """
        self.tokenizer = tokenizer      # does the tokenization, padding and truncation
        self.max_seq_len = max_seq_len  # upper bound on the tokenized sequence length

    def __call__(self, input_batch):
        """Tokenize the sentences of ``input_batch`` and stack their scores.

        :param input_batch: list of dicts with the keys ``'sim'`` and ``'sent'``.
        :return: dict with ``"tokenized_batch"`` (the tokenizer's output) and
                 ``"sim"`` (a float tensor holding one score per instance).
        """
        scores = [instance['sim'] for instance in input_batch]
        sentences = [instance['sent'] for instance in input_batch]

        encoded = self.tokenizer(
            sentences,
            padding=True,
            max_length=self.max_seq_len,
            truncation=True,
            return_tensors="pt",
        )

        return {
            "tokenized_batch": encoded,
            "sim": torch.tensor(scores, dtype=torch.float),
        }

In [None]:
# Quick check of the collator, this time on the training split
test_dataset = MyDataset(tokenizer, "RO-STS.train.tsv")
my_collator = MyCollator(tokenizer=tokenizer, max_seq_len=512)

# Create a DataLoader that yields batches of 3 and pull the first batch out of it
test_dataloader = DataLoader(test_dataset, batch_size=3, collate_fn=my_collator)
iterable_data = iter(test_dataloader)
first_batch = next(iterable_data)  # the dict returned by MyCollator.__call__

for key, value in first_batch.items():
  print(f"{key} is a {value}")

## Model preparation

We're finally here :)

As we're using Pytorch Lightning to do the behind-the-scenes training, we do need to define a few functions:

* ``__init__``, ``forward``
* ``training_step``
* ``validation_step``
* ``configure_optimizers``

As this is a single block of code, comments will be inline:


In [None]:
class TransformerModel(pl.LightningModule):
    """Cross-encoder regressor: a pretrained transformer followed by a linear head.

    The pooled output of the transformer is projected to a single value per
    input sequence and trained with a mean-squared-error loss.
    """

    def __init__(self, model_name, lr=2e-05, model_max_length=512):
        """Load the pretrained tokenizer/model and create the regression head.

        :param model_name: name or path handed to ``from_pretrained``.
        :param lr: learning rate used by ``configure_optimizers``.
        :param model_max_length: stored on the instance; not used by the model itself.
        """
        super().__init__()

        print("Loading AutoModel [{}] ...".format(model_name))

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, strip_accents=False)
        self.model = AutoModel.from_pretrained(model_name)
        # size the head from the model config instead of assuming a hidden size of 768
        self.output_layer = torch.nn.Linear(self.model.config.hidden_size, 1)

        self.loss_fct = torch.nn.MSELoss()

        self.lr = lr
        self.model_max_length = model_max_length

    def forward(self, tokenized_batch):
        """Return one prediction per sequence in ``tokenized_batch``.

        :param tokenized_batch: mapping with ``'input_ids'`` and ``'attention_mask'``.
        :return: 1-D tensor of predictions, shape ``[batch_size]``.
        """
        output = self.model(
            input_ids=tokenized_batch['input_ids'],
            attention_mask=tokenized_batch['attention_mask'],
            return_dict=True
        )
        pooler_output = output['pooler_output']  # [batch_size, hidden_size]
        prediction = self.output_layer(pooler_output)  # [batch_size, 1]

        return prediction.flatten()  # [batch_size]

    def _shared_step(self, batch):
        """Compute the MSE loss for one batch; returns ``(loss, batch_size)``."""
        sims = batch['sim']
        prediction = self(batch['tokenized_batch'])  # [batch_size]
        return self.loss_fct(prediction, sims), sims.size(0)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch.

        :return: dict with the key ``"loss"``.
        """
        loss, batch_size = self._shared_step(batch)

        # log the tensor itself (no .cpu().item() round trip) and state the batch size
        # explicitly so that it does not have to be inferred from the nested batch dict
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, batch_size=batch_size)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch.

        :return: dict with the key ``"loss"``.
        """
        loss, batch_size = self._shared_step(batch)

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, batch_size=batch_size)
        return {"loss": loss}

    def configure_optimizers(self):
        """Return an AdamW optimizer over all trainable parameters."""
        return torch.optim.AdamW([p for p in self.parameters() if p.requires_grad], lr=self.lr, eps=1e-08)

In [None]:
class RoBERTModel(pl.LightningModule):
    """Cross-encoder regressor whose settings are kept in ``self.hparams``."""

    def __init__(self, model_name: str, lr: float = 2e-5, model_max_length: int = 256):
        """
        Initializes the RoBERTModel with a specified base model and configuration.

        :param model_name: Name or path to the pretrained model.
        :param lr: Learning rate for the optimizer.
        :param model_max_length: Maximum input sequence length for the model.
        """
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, strip_accents=False)
        self.model = AutoModel.from_pretrained(model_name)
        self.output_layer = torch.nn.Linear(self.model.config.hidden_size, 1)

        self.loss_fct = torch.nn.MSELoss()

        # records the __init__ arguments (model_name, lr, model_max_length) in self.hparams
        self.save_hyperparameters()

    def forward(self, tokenized_batch):
        """
        Forward pass through the model.

        :param tokenized_batch: Tokenized input batch including input_ids and attention_mask.
        :return: Predictions for the input batch, flattened to one value per sequence.
        """
        output = self.model(**tokenized_batch, return_dict=True)
        cls_embedding = output.pooler_output
        prediction = self.output_layer(cls_embedding)

        return prediction.flatten()

    def _shared_step(self, batch):
        """Compute the MSE loss for one batch; returns ``(loss, batch_size)``."""
        # 'sim' is the key under which MyCollator stores the similarity scores
        ground_truth = batch['sim']
        prediction = self(batch['tokenized_batch'])
        return self.loss_fct(prediction, ground_truth), ground_truth.size(0)

    def training_step(self, batch, batch_idx):
        """
        Performs a training step.

        :param batch: The batch of data provided by the DataLoader.
        :param batch_idx: The index of the current batch.
        :return: A dictionary containing the loss.
        """
        loss, batch_size = self._shared_step(batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, batch_size=batch_size)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """
        Performs a validation step.

        :param batch: The batch of data provided by the DataLoader.
        :param batch_idx: The index of the current batch.
        :return: A dictionary containing the loss.
        """
        loss, batch_size = self._shared_step(batch)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, batch_size=batch_size)
        return {"loss": loss}

    def configure_optimizers(self):
        """Return an AdamW optimizer over all trainable parameters, using the saved learning rate."""
        trainable = [p for p in self.parameters() if p.requires_grad]
        return torch.optim.AdamW(trainable, lr=self.hparams.lr, eps=1e-08)

## Training phase

At this point we're ready to start training. When the code is ready, switch your colab to GPU, and run every cell up to this point, to have the training run on the GPU. Notice that Pytorch Lightning abstracts all the hassle of training on different devices.

So, what do we need?

We need the model itself (the ``TransformerModel`` object) and the trainer object, which receives a few parameters detailed below. The trainer will move the data to the GPU automatically, call ``training_step`` for every training batch and do backprop (it internally calls PyTorch's ``.backward()``, ``optimizer.step()`` and ``zero_grad()`` to update the model weights), and call ``validation_step`` for every validation batch. It also handles all the gritty stuff like early stopping, logging, model saving, distributed training (if you have more than 1 GPU), etc.


In [None]:
# Record once whether CUDA can be used and report which device was found
GPU_AVAILABLE = torch.cuda.is_available()

if GPU_AVAILABLE:
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available, using CPU.")

In [None]:
model = TransformerModel(
    model_name=transformer_model_name,
    lr=2e-5,
    model_max_length=512
)

# Pick the accelerator from the GPU check above, so this cell also runs on a CPU-only runtime
trainer = pl.Trainer(
    accelerator="gpu" if GPU_AVAILABLE else "cpu",
    devices=-1 if GPU_AVAILABLE else 1,  # -1 = use every available GPU
    max_epochs=2,  # set this to -1 when training fully
    #limit_train_batches=10,  # uncomment this for a quick smoke run
    #limit_val_batches=5,  # uncomment this for a quick smoke run
    gradient_clip_val=1.0,
    enable_checkpointing=False  # this disables saving the model each epoch
)

# instantiate dataloaders
# a batch_size of 8 should work fine on 16GB GPUs
# pinned memory only helps host-to-GPU copies, so it is enabled only when a GPU is used
train_dataloader = DataLoader(MyDataset(tokenizer, "RO-STS.train.tsv"), batch_size=8, collate_fn=my_collator, shuffle=True, pin_memory=GPU_AVAILABLE)
validation_dataloader = DataLoader(MyDataset(tokenizer, "RO-STS.dev.tsv"), batch_size=8, collate_fn=my_collator, shuffle=False, pin_memory=GPU_AVAILABLE)

# call this to start training
trainer.fit(model, train_dataloader, validation_dataloader)

## Let's use our model

In [None]:
def predict (model, sent1, sent2):
    """Exercise skeleton: predict the similarity of ``sent1`` and ``sent2``.

    The steps to implement are listed as comments below. As written, the bare
    ``return`` at the end yields ``None``.
    """
    # prepare a string with the concatenated sentences

    # tokenize the sentence; don't forget to make it a 1-element list

    # run it through the model, which returns a [batch_size] prediction

    return # return the value that you multiply by 5

### Solution (hidden)

In [None]:
def predict(model, sent1, sent2):
    """Predict the similarity of two sentences, rescaled by a factor of 5.

    :param model: object exposing ``tokenizer``, ``device`` and a forward call
                  that maps a tokenized batch to a ``[batch_size]`` tensor.
    :param sent1: first sentence (surrounding whitespace is stripped).
    :param sent2: second sentence (surrounding whitespace is stripped).
    :return: the model's prediction for the pair, multiplied by 5, as a Python float.
    """
    concatenated_sentences = f"[CLS]{sent1.strip()}[SEP]{sent2.strip()}[SEP]"

    # the tokenizer expects a batch, so wrap the single string in a 1-element list
    tokenized_batch = model.tokenizer([concatenated_sentences], padding=True, max_length=512, truncation=True, return_tensors="pt")
    # put the inputs on the same device as the model's weights
    tokenized_batch = tokenized_batch.to(model.device)

    # inference only: do not record operations for autograd
    with torch.no_grad():
        predictions = model(tokenized_batch)  # returns a [batch_size, ]

    return predictions[0].item()*5.  # select the first item and multiply by 5

## Evaluation

In [None]:
# let's test our code
model.eval()  # put the model in evaluation mode before predicting

# a list (not a set) so that the pairs are always printed in the order written here
tests = [
    ("Ana are mere.", "Andrei are pere."),
    ("Filmul este foarte bun", "Filmul este extrem de slab."),
    ("Cerul este albastru azi", "Pisica a urcat pe acoperiș."),
    ("Cartea a fost interesantă", "Lectura nu m-a captivat deloc."),
    ("Am alergat un maraton", "Am terminat cursa de 42 de kilometri."),
    ("Muzica clasică este relaxantă", "Genul clasic muzical îmi calmează mintea."),
]

for s1, s2 in tests:
  print(f"{s1} || {s2} \t SIM = {predict(model, s1, s2)}")