In [None]:
!pip install transformers tqdm more_itertools scikit-learn torch

In [None]:
# Use this when working on the full entity dataset of 260_000 entities.
# Mention names aren't ordered in any particular way; they appear in the order they first occur in the documents.
# There can be many duplicates, and since we only encode the name here we want to drop them.
# unique_mention_name_id_pairs = list({name: _id for name, _id in bc5cdr_name_id_pairs}.items())
# mention_names = [name for name, _id in unique_mention_name_id_pairs]  # a list of pairs has no .keys()

In [None]:
# Feature-extraction pipeline: gives us the model's embeddings directly.
# It is inference-only, so it cannot be used for fine-tuning.
from transformers import pipeline

model_name = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"

# Core (not fine-tuned) model; the fine-tuned one is loaded further down.
extractor = pipeline(task="feature-extraction", model=model_name)

In [None]:
from datasets import load_dataset

# Every row in this dataset is a "positive" pair.
# Later cells read the "mention", "entity" and "id" columns of the train split.
dataset = load_dataset(path="Stevenf232/BC5CDR_MeSH2015_nameonly")
train_pairs = dataset["train"]

In [None]:
from tqdm import tqdm
def extract_features(extractor, pairs, batch_size=16):
  """Run the feature extractor over every mention name and entity name in ``pairs``.

  Args:
    extractor: Callable invoked as ``extractor(names, truncation=True, padding=True)`` that
      returns one feature entry per input name (in this notebook: the feature-extraction pipeline).
    pairs: Sized collection where ``pairs[a:b]`` yields a mapping holding ``"mention"`` and
      ``"entity"`` lists for those rows.
    batch_size: Number of pairs handed to the extractor per call. Defaults to 16.

  Returns:
    ``(mention_name_features, entity_name_features)``: two lists in the same row order as ``pairs``.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1 (a negative step would otherwise
      silently produce empty results).
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

  mention_name_features = []
  entity_name_features = []

  for start in tqdm(range(0, len(pairs), batch_size), desc="Extracting features"):
    # Slice the rows once per batch; both columns are read from the same slice.
    rows = pairs[start:start + batch_size]

    mention_name_features.extend(extractor(rows["mention"], truncation=True, padding=True))
    entity_name_features.extend(extractor(rows["entity"], truncation=True, padding=True))

  return mention_name_features, entity_name_features

In [None]:
# Run the feature-extraction pipeline over every mention/entity name in the training pairs.
mention_name_features, entity_name_features = extract_features(extractor, train_pairs)

In [None]:
# ------------------ evaluating core model -------------------------
# In standard BERT and SapBERT models, the [CLS] (classification) token is located at the very
# beginning of the input sequence.
# Each feature entry is indexed as f[0][0] below: the first element of the outer axis, then
# token 0 (the [CLS] position). That token's vector is used to represent the whole name.

import numpy as np

mention_vectors = [np.array(f[0][0]) for f in mention_name_features]
entity_vectors = [np.array(f[0][0]) for f in entity_name_features]

# Only the last expression of a cell is displayed, so show both shapes together.
mention_vectors[0].shape, entity_vectors[0].shape



In [None]:
# Peek at the first five components of the first mention's vector.
mention_vectors[0][:5]

# Simple model evaluation

Potential issue: relevance is measured with cosine similarity. Could this bias the evaluation towards models fine-tuned with `CosineSimilarityLoss` over models fine-tuned with other loss functions?

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def _unit_rows(vectors):
  """Stack ``vectors`` into a 2-D float array whose rows are scaled to unit L2 norm."""
  matrix = np.asarray(vectors)
  if not np.issubdtype(matrix.dtype, np.floating):
    matrix = matrix.astype(float)
  norms = np.linalg.norm(matrix, axis=1, keepdims=True)
  # An all-zero row keeps similarity 0 with everything instead of producing NaNs.
  norms[norms == 0] = 1
  return matrix / norms


def evaluate(mention_vectors, entity_vectors, pairs=None, verbose=True):
  """Top-1 retrieval accuracy: match each mention to its most cosine-similar entity.

  Row ``i`` of ``mention_vectors`` and ``entity_vectors`` corresponds to row ``i`` of ``pairs``.
  A prediction counts as correct when the ``"id"`` of the best-scoring entity row equals the
  ``"id"`` of the mention's own row.

  Args:
    mention_vectors: Sequence of 1-D vectors, one per mention.
    entity_vectors: Sequence of 1-D vectors of the same dimensionality, one per entity.
    pairs: Collection whose ``"id"``, ``"mention"`` and ``"entity"`` columns can be read with
      ``pairs[column]`` (e.g. a dict of lists). Defaults to the notebook-level ``train_pairs``
      so existing two-argument calls keep working.
    verbose: When True (default), print mention / correct entity / top match for every row.

  Returns:
    The accuracy as a float; 0.0 when there are no mentions or no entities to compare.
  """
  if pairs is None:
    pairs = train_pairs

  total = len(mention_vectors)
  correct_count = 0

  if total > 0 and len(entity_vectors) > 0:
    # Normalise once up front; afterwards a plain dot product is the cosine similarity.
    entity_unit = _unit_rows(entity_vectors)
    mention_unit = _unit_rows(mention_vectors)

    # Read each column once instead of doing several row look-ups per mention.
    ids = list(pairs["id"])
    if verbose:
      mention_names = list(pairs["mention"])
      entity_names = list(pairs["entity"])

    for i, mention in enumerate(mention_unit):
      # int() because argmax returns numpy.int64, which is not used for list indexing here.
      top_idx = int(np.argmax(entity_unit @ mention))

      if ids[top_idx] == ids[i]:
        correct_count += 1

      if verbose:
        print(f"mention_name: {mention_names[i]}\ncorrect entity name: {entity_names[i]}\ntop_match: {entity_names[top_idx]}\n")

  # Guard the division so an empty evaluation reports 0.0 instead of raising.
  accuracy = correct_count / total if total else 0.0

  print(f"total comparisons: {total}")
  print(f"correct comparisons: {correct_count}")
  print(f"accuracy: {accuracy}")
  return accuracy

In [None]:
# Score the core model: [CLS] vectors of the mentions against those of the entities.
evaluate(mention_vectors, entity_vectors)

In [None]:
# more evaluation methods
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# TODO: `train_labels` and `predicted_labels` are not defined in the cells visible here;
# build them before enabling the lines below.
# NOTE(review): if the labels are the entity ids (more than two classes), recall/precision/f1
# will presumably need an explicit `average=` argument — confirm.
# print(f"{accuracy_score(train_labels, predicted_labels)=:.3f}")
# print(f"{recall_score(train_labels, predicted_labels)=:.3f}")
# print(f"{precision_score(train_labels, predicted_labels)=:.3f}")
# print(f"{f1_score(train_labels, predicted_labels)=:.3f}")

## Evaluating Fine-tuned model


In [None]:
# Fine-tuned model.
# Loaded as a SentenceTransformer instead of a feature-extraction pipeline, following the
# auto-generated docs: https://huggingface.co/Stevenf232/fine-tuned-SapBERT2
# NOTE(review): the link above is for fine-tuned-SapBERT2, the checkpoint loaded below is
# fine-tuned-SapBERT4 — confirm which one is intended.
#from transformers import pipeline

from sentence_transformers import SentenceTransformer

fine_tuned_model_name = "Stevenf232/fine-tuned-SapBERT4"
model = SentenceTransformer(model_name_or_path=fine_tuned_model_name)

In [None]:
from tqdm import tqdm
def encode(pairs, batch_size=16, encoder=None):
  """Encode every mention name and entity name in ``pairs``.

  Args:
    pairs: Sized collection where ``pairs[a:b]`` yields a mapping holding ``"mention"`` and
      ``"entity"`` lists for those rows.
    batch_size: Number of pairs passed to ``encoder.encode`` per call. Defaults to 16.
    encoder: Object exposing ``encode(list_of_names)`` that returns one encoding per name.
      Defaults to the notebook-level ``model`` so existing ``encode(pairs)`` calls keep working.

  Returns:
    ``(mention_encodings, entity_encodings)``: two lists in the same row order as ``pairs``.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1 (a negative step would otherwise
      silently produce empty results).
  """
  if encoder is None:
    encoder = model
  if batch_size < 1:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

  mention_encodings = []
  entity_encodings = []

  for start in tqdm(range(0, len(pairs), batch_size), desc="Extracting features"):
    # Slice the rows once per batch; both columns are read from the same slice.
    rows = pairs[start:start + batch_size]

    mention_encodings.extend(encoder.encode(rows["mention"]))
    entity_encodings.extend(encoder.encode(rows["entity"]))

  return mention_encodings, entity_encodings

In [None]:
# Encode every mention/entity name in the training pairs with the fine-tuned model.
mention_encodings, entity_encodings = encode(train_pairs)

In [None]:
# ------------------ evaluating fine-tuned model -------------------------
# The [CLS] extraction below is probably not needed here, because the sentence-transformer already has a pooling layer built in.
# mention_vectors = [np.array(f[0][0]) for f in mention_name_features]
# entity_vectors = [np.array(f[0][0]) for f in entity_name_features]

In [None]:
# Preview the fine-tuned model's encoding of the first mention
# (`mention_vectors` holds the core model's vectors, which an earlier cell already shows).
mention_encodings[0][:5]

In [None]:
# Same evaluation as for the core model, this time on the fine-tuned model's encodings.
evaluate(mention_encodings, entity_encodings)