In [None]:
!pip install transformers tqdm more_itertools scikit-learn torch

In [None]:
# Use this when working on the full entity dataset of 260_000 entities.
# Mention names aren't ordered in any particular way; they are listed in the order they first appear in the documents.
# There can be many duplicates, and since we only encode the name here, we want to remove them.
# unique_mention_name_id_pairs = list({name: _id for name, _id in bc5cdr_name_id_pairs}.items())
# mention_names = unique_mention_name_id_pairs.keys()

In [None]:
# this is the feature extraction pipeline so we can get the embeddings directly (we can only do inference with this, no fine-tuning)
import torch
from transformers import pipeline

model_name = "google-bert/bert-base-uncased"

# core model
# Use the GPU when one is visible and fall back to the CPU otherwise; a
# hard-coded 'cuda' makes this cell fail on CPU-only runtimes.
extractor = pipeline(
    "feature-extraction",
    model=model_name,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [None]:
import torch

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # expected to print 'cuda' on a GPU runtime; 'cpu' means no GPU was detected

### Move the model to the GPU

In [None]:
from datasets import load_dataset

# These are all "positive" pairs (presumably each row pairs a mention with its
# correct entity -- confirm against the dataset card).
# Later cells read the "mention", "entity" and "id" fields of each row.
dataset = load_dataset("Stevenf232/BC5CDR_MeSH2015_name_and_aliases")
train_pairs = dataset["train"]

In [None]:
def extract_features(extractor, pairs, batch_size=16):
    """Embed the mention and entity names of every pair, with [CLS] pooling.

    Args:
        extractor: Callable feature-extraction pipeline. It is called with a
            generator of strings and must yield one tensor per input that can
            be indexed as ``output[0, 0, :]`` (assumed layout
            ``(1, seq_len, hidden)`` -- TODO confirm against the pipeline docs).
        pairs: Sized iterable of rows with ``"mention"`` and ``"entity"`` keys.
        batch_size: Number of texts the pipeline processes per forward pass.

    Returns:
        ``(mention_name_features, entity_name_features)``: two lists of 1-D
        CPU tensors, in the same order as ``pairs``.
    """
    # Imported here so the function works on a freshly restarted kernel: the
    # notebook's module-level tqdm import only happens in a later cell.
    from tqdm import tqdm

    def _cls_vectors(field):
        # Stream the texts through a generator (saves RAM compared to
        # creating full lists).
        texts = (pair[field] for pair in pairs)
        outputs = extractor(
            texts,
            batch_size=batch_size,
            truncation=True,
            padding=True,
            return_tensors='pt',
        )
        # The pipeline yields one result at a time, but processes in batches.
        # Position 0 of the sequence is the [CLS] token.
        return [output[0, 0, :].cpu() for output in tqdm(outputs, total=len(pairs))]

    print("Extracting mention features...")
    mention_name_features = _cls_vectors("mention")

    print("Extracting entity features...")
    entity_name_features = _cls_vectors("entity")

    return mention_name_features, entity_name_features

In [None]:
# Embed the mention and entity names of every training pair with the baseline pipeline ([CLS] vectors).
mention_cls, entity_cls = extract_features(extractor, train_pairs)

# Model evaluation

**Potential issue:** this evaluation ranks candidates by cosine similarity. Could that bias the results in favour of models fine-tuned with `CosineSimilarityLoss` over models trained with other loss functions?

In [None]:
def evaluate(mention_cls, entity_cls, train_pairs, device=None, verbose=True):
    """Score nearest-neighbour entity retrieval by cosine similarity.

    For every mention vector, the entity vector with the highest cosine
    similarity is selected. The prediction counts as correct when the selected
    row of ``train_pairs`` carries the same ``"id"`` as the mention's own row.

    Args:
        mention_cls: Sequence of 1-D tensors, one per row of ``train_pairs``.
        entity_cls: Sequence of 1-D tensors, index-aligned with ``mention_cls``.
        train_pairs: Sized, integer-indexable collection of rows with
            ``"id"``, ``"mention"`` and ``"entity"`` keys.
        device: Device used for the similarity computation. Defaults to CUDA
            when available, otherwise the CPU.
        verbose: When True (the default), print the mention, the correct
            entity and the top match for every row.

    Returns:
        Fraction of mentions whose top match has the correct id.

    Raises:
        ValueError: If the inputs are empty or their lengths disagree.
    """
    total = len(train_pairs)
    if not (len(mention_cls) == len(entity_cls) == total):
        raise ValueError(
            "mention_cls, entity_cls and train_pairs must have the same length, got "
            f"{len(mention_cls)}, {len(entity_cls)} and {total}"
        )
    if total == 0:
        raise ValueError("cannot evaluate an empty set of pairs")

    if device is None:
        # Do not assume a GPU: fall back to the CPU when CUDA is unavailable.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Processing vectors on {device} (CLS Pooling)...")

    # 1. Stack
    # The inputs are already tensors, so they only need stacking into matrices.
    mentions_tensor = torch.stack(list(mention_cls)).to(device)
    entities_tensor = torch.stack(list(entity_cls)).to(device)

    # --- 2. Normalize ---
    # Standardize vector length so Dot Product = Cosine Similarity
    mentions_norm = torch.nn.functional.normalize(mentions_tensor, p=2, dim=1)
    entities_norm = torch.nn.functional.normalize(entities_tensor, p=2, dim=1)

    # --- 3./4. Similarity and best match ---
    # Compare mentions against ALL entities one block of rows at a time, so the
    # full (n_mentions x n_entities) similarity matrix is never held in memory.
    row_block = 4096
    top_indices = torch.cat([
        torch.mm(block, entities_norm.T).argmax(dim=1)
        for block in mentions_norm.split(row_block)
    ]).tolist()  # plain Python ints, usable directly as row indices

    # --- 5. Print Loop ---
    correct_count = 0
    print("\n--- Starting Evaluation ---\n")

    for i, top_idx in enumerate(top_indices):
        # Fetch each row once; row access can be expensive on dataset objects.
        gold_row = train_pairs[i]
        match_row = train_pairs[top_idx]

        if match_row["id"] == gold_row["id"]:
            correct_count += 1

        if verbose:
            print(f"mention_name: {gold_row['mention']}")
            print(f"correct entity name: {gold_row['entity']}")
            print(f"top_match: {match_row['entity']}")
            print("")

    # --- 6. Statistics ---
    accuracy = correct_count / total
    print(f"total comparisons: {total}")
    print(f"correct comparisons: {correct_count}")
    print(f"accuracy: {accuracy:.4f}")

    return accuracy

In [None]:
# Accuracy of the embeddings extracted by the baseline pipeline; the returned value is the cell's output.
evaluate(mention_cls, entity_cls, train_pairs)

In [None]:
# more evaluation methods
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# print(f"{accuracy_score(train_labels, predicted_labels)=:.3f}")
# print(f"{recall_score(train_labels, predicted_labels)=:.3f}")
# print(f"{precision_score(train_labels, predicted_labels)=:.3f}")
# print(f"{f1_score(train_labels, predicted_labels)=:.3f}")

## Evaluating Fine-tuned model


In [None]:
# Fine-tuned model, loaded through sentence-transformers instead of a
# feature-extraction pipeline; going by the auto-generated model card, a
# pipeline should not be needed: https://huggingface.co/Stevenf232/fine-tuned-SapBERT2
# NOTE(review): the card linked above is for fine-tuned-SapBERT2, while the
# checkpoint loaded below is fine-tuned-SapBERT4 -- confirm the same usage applies.
#from transformers import pipeline

from sentence_transformers import SentenceTransformer

fine_tuned_model_name = "Stevenf232/fine-tuned-SapBERT4"
# `model` is the object passed to encode(...) in a later cell.
model = SentenceTransformer(fine_tuned_model_name)

In [None]:
from tqdm import tqdm
def encode(model, pairs, batch_size=16):
    """Encode the mention and entity names of every pair in fixed-size batches.

    Args:
        model: Object with an ``encode(texts, convert_to_tensor=True)`` method
            that returns one embedding per text.
        pairs: Sized dataset whose slices can be indexed by column name, i.e.
            ``pairs[i:j]["mention"]`` and ``pairs[i:j]["entity"]`` give the
            texts of rows ``i`` to ``j - 1``.
        batch_size: Number of rows encoded per call to ``model.encode``.

    Returns:
        ``(mention_encodings, entity_encodings)``: two lists holding one
        embedding per row, in row order.
    """
    mention_encodings = []
    entity_encodings = []

    for start in tqdm(range(0, len(pairs), batch_size), desc="Extracting features"):
        # Slice once per batch and reuse the slice for both columns.
        batch = pairs[start:start + batch_size]

        # extend() iterates the returned batch along its first dimension,
        # storing one embedding per row.
        mention_encodings.extend(model.encode(batch["mention"], convert_to_tensor=True))
        entity_encodings.extend(model.encode(batch["entity"], convert_to_tensor=True))

    return mention_encodings, entity_encodings

In [None]:
# Encode the same training pairs with the fine-tuned SentenceTransformer model.
mention_encodings, entity_encodings = encode(model, train_pairs)

In [None]:
# Score the fine-tuned embeddings with the same evaluate() used for the baseline, so the two accuracies are comparable.
evaluate(mention_encodings, entity_encodings, train_pairs)