In [None]:
# The train, test, and validation data live in 3 separate JSONL files whose lines look like this:
{"mention": "word1", "entity": "word2", "id": "unique_id_A"}
{"mention": "word3", "entity": "word4", "id": "unique_id_B"}
{"mention": "word5", "entity": "word6", "id": "unique_id_C"}

In [None]:
# Install the Hugging Face `datasets` library (load_dataset / Dataset / DatasetDict are imported from it below).
%pip install datasets

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Log in to the Hugging Face Hub.
# The token is read from the 'HF_TOKEN' entry of google.colab's userdata and passed to login().
hf_token = userdata.get('HF_TOKEN') # applies when you are running inside Google Colab
login(hf_token)

In [None]:
from datasets import load_dataset

# These are all "positive" pairs (the splits 'train', 'test' and 'validation' are used below).
dataset = load_dataset("Stevenf232/BC5CDR_MeSH2015_nameonly")


# Constructing negative pairs

In [None]:
# We want several (by default 4) times as many negative pairs as positive pairs.
# There are several negative sampling techniques; for now we use the simplest one:
# take one mention name and one entity name from the positives, both sampled
# uniformly at random, and keep the combination only when their ids don't match.
import random

def create_negative_pairs(positive_pairs, ratio=4, seed=None):
  """Build non-matching mention/entity pairs by random sampling from the positives.

  Args:
    positive_pairs: sequence of rows, each with "mention", "entity" and "id"
      keys (a list of dicts or a dataset split that yields such rows).
    ratio: how many negative pairs to generate per positive pair.
    seed: optional seed for a private random generator. When None, the
      module-level ``random`` state is used, so ``random.seed(...)`` set by
      the caller still controls the result.

  Returns:
    A list of dicts with "mention", "entity" and "id" keys, where "id" is the
    id of the row the mention was taken from. The list holds
    ``ratio * len(positive_pairs)`` entries.

  Raises:
    ValueError: if negatives are requested but the input has fewer than two
      distinct ids, in which case no non-matching pair exists.
  """
  # Materialise once so repeated random indexing works on plain Python objects.
  rows = list(positive_pairs)
  target = ratio * len(rows)
  if target <= 0:
      return []

  # Without at least two distinct ids the rejection loop below could never
  # accept a candidate and would spin forever.
  if len({row['id'] for row in rows}) < 2:
      raise ValueError("need at least two distinct ids to build negative pairs")

  rng = random if seed is None else random.Random(seed)

  negative_pairs = []
  while len(negative_pairs) < target:
      mention_row = rng.choice(rows)
      entity_row = rng.choice(rows)
      # Reject candidates whose mention and entity belong to the same id.
      if mention_row['id'] != entity_row['id']:
          negative_pairs.append({
              "mention": mention_row['mention'],
              "entity": entity_row['entity'],
              "id": mention_row['id'],
          })

  return negative_pairs


In [None]:
# Seed Python's RNG before sampling so the generated negatives (and therefore the
# train/test/validation sets built from them) are identical on every run.
NEGATIVE_SAMPLING_SEED = 42
random.seed(NEGATIVE_SAMPLING_SEED)

negative_pairs_test = create_negative_pairs(dataset['test'])
negative_pairs_train = create_negative_pairs(dataset['train'])
negative_pairs_validation = create_negative_pairs(dataset['validation'])

# Combining Positive and Negative pairs, and adding labels


In [None]:
def add_labels(positive_pairs, negative_pairs):
  """Merge positive and negative pairs into a single labelled list.

  Every output row keeps only "mention" and "entity" and gains an integer
  "label": 1 for rows from positive_pairs, 0 for rows from negative_pairs.
  All positives come first, followed by all negatives.
  """
  def labelled(rows, label):
      # Copy just the two text fields and attach the numeric label.
      return [
          {'mention': row["mention"], 'entity': row["entity"], 'label': label}
          for row in rows
      ]

  return labelled(positive_pairs, 1) + labelled(negative_pairs, 0)

In [None]:
# Label every split: its positives (label 1) followed by its sampled negatives (label 0).
train_data, test_data, validation_data = (
    add_labels(dataset[split], negatives)
    for split, negatives in (
        ('train', negative_pairs_train),
        ('test', negative_pairs_test),
        ('validation', negative_pairs_validation),
    )
)

In [None]:
# Turn the labelled row lists into datasets and group them by split name
from datasets import Dataset, DatasetDict

train_dataset, test_dataset, validation_dataset = (
    Dataset.from_list(rows) for rows in (train_data, test_data, validation_data)
)

dataset_dict = DatasetDict(
    {
        'train': train_dataset,
        'test': test_dataset,
        'validation': validation_dataset,
    }
)

In [None]:
# Sanity check: peek at one entry per split.
# Each split lists all positive pairs first and the negatives after them,
# so index 0 is a positive row and index -1 is a negative row.
print(
    *(
        dataset_dict[split][position]
        for split, position in (("train", 0), ("test", 0), ("validation", -1))
    )
)

In [None]:
# Short aliases for the two splits handed to the trainer (training and evaluation data).
train_pairs, val_pairs = dataset_dict['train'], dataset_dict['validation']

# Training using sentence-transformers library (supported by HuggingFace)
To do fine-tuning correctly, we need to treat this model as a sentence transformer (a.k.a. a bi-encoder) because it's designed to compare whole sentences (e.g., to calculate similarity scores).

In [None]:
# Install (or upgrade, via -U) sentence-transformers; the model, loss and trainer below are imported from it.
%pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, models, losses

# Hub identifier of the pretrained checkpoint that is fine-tuned in this notebook
model_name = 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext'

# 1. Load the base transformer model
#    (wrapped in a sentence-transformers Transformer module so it can be chained with pooling)
word_embedding_model = models.Transformer(model_name)

# 2. Add the correct pooling layer
# SapBERT uses the [CLS] token
# The pooling layer is sized from the transformer's own embedding dimension.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode='cls' # Use the first token (CLS token) as the text representation
)

# 3. Create the final SentenceTransformer model
#    Modules run in list order: transformer first, then pooling.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# Choose the loss function: ContrastiveLoss bound to the model above.
# Only the model is passed, so every other setting keeps the library default.
train_loss = losses.ContrastiveLoss(model)

In [None]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define training arguments
# using same config as in Jake's implementation
training_args = SentenceTransformerTrainingArguments(
    output_dir="models/my-finetuned-model",
    num_train_epochs=3,
    learning_rate = 1e-5,
    optim = "adamw_torch",
    per_device_train_batch_size=8,
    # Without an evaluation strategy the eval_dataset passed to the trainer is
    # never used during training; evaluate once per epoch, matching save_strategy.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Set explicitly: in practice this defaulted to W&B (Weights & Biases),
    # even though the documentation states the default is none.
    report_to="none",
)

# Create the Trainer
# train_pairs drives the optimisation; val_pairs is evaluated at the end of each epoch.
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_pairs,
    eval_dataset=val_pairs,
    loss=train_loss,
)

In [None]:
# Run the fine-tuning loop with the trainer created in the previous cell.
trainer.train()

In [None]:
# Push the model to the Hugging Face Hub (creates a new Hugging Face repo for this model)
model.push_to_hub("fine-tuned-SapBERT4")
