In [1]:
# we have train, test, and validate data in 3 separate JSONL files which look like this:
{"mention": "word1", "entity": "word2", "id": "unique_id_A"}
{"mention": "word3", "entity": "word4", "id": "unique_id_B"}
{"mention": "word5", "entity": "word6", "id": "unique_id_C"}

{'mention': 'word5', 'entity': 'word6', 'id': 'unique_id_C'}

In [None]:
%pip install datasets



In [3]:
from google.colab import userdata
from huggingface_hub import login

# Login into Hugging Face Hub
hf_token = userdata.get('HF_TOKEN') # If you are running inside a Google Colab
login(hf_token)

In [4]:
from datasets import load_dataset

# there are all "positive" pairs"
dataset = load_dataset("Stevenf232/BC5CDR_MeSH2015_nameonly")


README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

name_only_train.jsonl: 0.00B [00:00, ?B/s]

name_only_val.jsonl: 0.00B [00:00, ?B/s]

name_only_test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/2654 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2559 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2656 [00:00<?, ? examples/s]

# Constructing negative pairs

In [5]:
# I want to have 4 times as many negative pairs as positive pairs
# There are several negative sampling techinques
# for now take one mention name from the positives and one entity name from the positives that don't match
# make it sample randomly from the positives
import random

def create_negative_pairs(positive_pairs):
  negative_pairs = []
  while len(negative_pairs) < 4 * len(positive_pairs):
      entry = random.choice(positive_pairs)
      mention_name, mention_id = entry['mention'], entry['id']
      entry = random.choice(positive_pairs)
      entity_name, entity_id = entry['entity'], entry['id']
      if mention_id != entity_id:
          negative_pairs.append({"mention": mention_name, "entity": entity_name, "id": mention_id})

  return negative_pairs


In [6]:
negative_pairs_test = create_negative_pairs(dataset['test'])
negative_pairs_train = create_negative_pairs(dataset['train'])
negative_pairs_validation = create_negative_pairs(dataset['validation'])

# Combining Positive and Negative pairs, and adding labels


In [17]:
def add_labels(positive_pairs, negative_pairs):
  training_data = []
  for entry in positive_pairs:
      training_data.append({'mention': entry["mention"], 'entity': entry["entity"], 'label': 1}) # Changed to 1 (from True)

  for entry in negative_pairs:
      training_data.append({'mention': entry["mention"], 'entity': entry["entity"], 'label': 0}) # Changed to 0 (from False)

  return training_data

In [19]:
train_data = add_labels(dataset['train'], negative_pairs_train)
test_data = add_labels(dataset['test'], negative_pairs_test)
validation_data = add_labels(dataset['validation'], negative_pairs_validation)

In [20]:
# load the new datasets
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)
validation_dataset = Dataset.from_list(validation_data)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset,
})

In [21]:
# sanity check entries from splits
# first come all the positive pairs, after that the negatives
print(
    dataset_dict["train"][0],
    dataset_dict["test"][0],
    dataset_dict["validation"][-1]
)

{'mention': 'human immunodeficiency virus', 'entity': 'HIV Infections', 'label': 1} {'mention': 'oral ulcers', 'entity': 'Oral Ulcer', 'label': 1} {'mention': 'impaired cognitive functions', 'entity': 'Hypokalemia', 'label': 0}


In [22]:
train_pairs = dataset_dict['train']
val_pairs = dataset_dict['validation']

# Training using sentence-transformers library (supported by HuggingFace)

In [None]:
%pip install -U sentence-transformers



In [24]:
from sentence_transformers import SentenceTransformer, models, losses

model_name = 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext'

# 1. Load the base transformer model
word_embedding_model = models.Transformer(model_name)

# 2. Add the correct pooling layer
# SapBERT was trained to use the [CLS] token
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode='cls' # Use the first token (CLS token) as text representations
)

# 3. Create the final SentenceTransformer model
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [25]:
# Choosing Loss function
train_loss = losses.ContrastiveLoss(model)

In [26]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define training arguments
# using same config as in Jake's implementation
training_args = SentenceTransformerTrainingArguments(
    output_dir="models/my-finetuned-model",
    num_train_epochs=3,
    learning_rate = 1e-5,
    optim = "adamw_torch",
    per_device_train_batch_size=8,
    save_strategy="epoch",
    report_to="none", # for some reason defaults to W&B (weights&biases), documentation states default is none
)

# Create the Trainer
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_pairs,
    eval_dataset=val_pairs,
    loss=train_loss,
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [27]:
trainer.train()

Step,Training Loss
500,0.0042
1000,0.0034
1500,0.0032
2000,0.0022
2500,0.002
3000,0.0023
3500,0.0019
4000,0.0017
4500,0.0018


TrainOutput(global_step=4977, training_loss=0.002457640904941448, metrics={'train_runtime': 600.9447, 'train_samples_per_second': 66.246, 'train_steps_per_second': 8.282, 'total_flos': 0.0, 'train_loss': 0.002457640904941448, 'epoch': 3.0})

In [28]:
# create a new huggingFace repo for this model
model.push_to_hub("fine-tuned-SapBERT4")


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...sdrzy1q/model.safetensors:   0%|          | 17.5kB /  438MB            

'https://huggingface.co/Stevenf232/fine-tuned-SapBERT4/commit/a1d0a6b45566e5b5bc749c9aadc76cacb2daf417'

# Training using HuggingFace Trainer API
Bi-Encoders are not supported by default for the trainer API

Our model is a Bi-encoder :(

**Thus it doesn't make sense to use the trainer API in our case! (so we skip it for now)**