<a href="https://colab.research.google.com/github/Steve-Falkovsky/Hypencoder-Entity-Linking/blob/main/notebooks/Hypencoder_Vs_fine_tuned_Hypencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

REPO_NAME = "Hypencoder-Entity-Linking"
GIT_URL = f"https://github.com/Steve-Falkovsky/{REPO_NAME}.git"
BRANCH_NAME = "main"

!git clone -b {BRANCH_NAME} --single-branch {GIT_URL}

# Move into the downloaded repo (The Root)
os.chdir(REPO_NAME)

%pip install -q -e "./hypencoder-paper"

os.chdir("hypencoder-paper")

print(f"üìç Working Directory is now: {os.getcwd()}")
print("‚úÖ Environment Ready!")

In [None]:
from datasets import load_dataset

# These are all "positive" pairs.
# NOTE(review): presumably each row links a mention to its correct entity
# (the cells below read the 'mention', 'entity' and 'id' columns) — confirm against the dataset card.
dataset = load_dataset("Stevenf232/BC5CDR_MeSH2015_nameonly")

In [None]:
# Work with the training split and show its summary.
train_pairs = dataset["train"]
print(train_pairs)

# Pull out the two text columns, then preview the first three entries of each.
mention_names, entity_names = train_pairs["mention"], train_pairs["entity"]
for column_preview in (mention_names[:3], entity_names[:3]):
    print(column_preview)

### Load the model

In [None]:
# Core Hypencoder model for outputting dense vector representations
from hypencoder_cb.modeling.hypencoder import Hypencoder, HypencoderDualEncoder, TextEncoder
from transformers import AutoTokenizer

# Checkpoint name passed to both from_pretrained calls below, so the model and
# the tokenizer come from the same source.
model_name = "Stevenf232/hypencoder_BC5CDR"

dual_encoder = HypencoderDualEncoder.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Keep direct handles on the two halves of the dual encoder:
# - query_encoder: its `.representation` output is later called on passage embeddings (the "q-nets")
# - passage_encoder: its `.representation` output is used as the passage embedding
query_encoder: Hypencoder = dual_encoder.query_encoder
passage_encoder: TextEncoder = dual_encoder.passage_encoder

### Move the model to the GPU

In [None]:
import torch

# Pick the compute device: the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")  # expected to read 'cuda' on a GPU runtime

# Place both encoders on the chosen device.
passage_encoder.to(device)
query_encoder.to(device)

### Load datasets and tokenise

In [None]:
# Materialise the dataset columns as plain Python lists before tokenising.
queries, passages = list(mention_names), list(entity_names)


# The tokenizer output contains 3 fields:
# input_ids, token_type_ids, and attention_mask.
# Each one is a tensor of shape (number of texts, max number of tokens).
tokenize_kwargs = {"return_tensors": "pt", "padding": True, "truncation": True}

query_inputs = tokenizer(queries, **tokenize_kwargs)
passage_inputs = tokenizer(passages, **tokenize_kwargs)


In [None]:
# Show both tokenised batches, separated by the same run of blank lines as before.
print(
    f"query_inputs:\n{query_inputs}",
    "\n\n\n\n",
    f"passage_inputs:\n{passage_inputs}",
    sep="\n",
)

# Passage Encodings


In [None]:
from tqdm import tqdm
import torch
from torch.amp import autocast

def batch_encode_passages(encoder, passages, batch_size=256, device=None):
  """Encode tokenised passages in mini-batches and return the stacked features.

  Args:
    encoder: callable module invoked as ``encoder(input_ids=..., attention_mask=...)``;
      the ``.representation`` attribute of its output is collected.
    passages: mapping with ``"input_ids"`` and ``"attention_mask"`` tensors whose
      first dimension indexes the passages.
    batch_size: number of passages sent through the encoder at once (default 256,
      the value that used to be hard-coded).
    device: device the inputs are moved to. Defaults to the device of the
      encoder's first parameter (CPU if it has none).

  Returns:
    A CPU tensor: the per-batch ``.representation`` outputs concatenated along dim 0.

  Raises:
    ValueError: if ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be >= 1, got {batch_size}")

  if device is None:
    # Follow the model instead of relying on a notebook-level global.
    first_param = next(encoder.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
  device = torch.device(device)

  # Mixed precision only makes sense on CUDA; requesting it unconditionally
  # makes PyTorch emit a warning on CPU-only runtimes.
  use_amp = device.type == "cuda"

  input_ids = passages["input_ids"]
  attention_mask = passages["attention_mask"]
  num_passages = input_ids.shape[0]

  entity_name_features = []

  with torch.no_grad():  # no gradients needed for inference (saves memory)
    for start in tqdm(range(0, num_passages, batch_size), desc="Extracting features"):
      stop = start + batch_size

      # Autocast does the math in fp16 where possible (default is fp32):
      # less memory and faster, at a small cost in precision.
      with autocast("cuda", enabled=use_amp):
        features = encoder(
            input_ids=input_ids[start:stop].to(device),
            attention_mask=attention_mask[start:stop].to(device),
        ).representation

      # Detach and move to CPU so finished batches do not occupy device memory.
      entity_name_features.append(features.detach().cpu())

  return torch.cat(entity_name_features, dim=0)

In [None]:
# Encode every tokenised entity name in batches; the stacked result is kept on the CPU.
passage_embeddings = batch_encode_passages(passage_encoder, passage_inputs)

## Now, we create the q-nets.

For each q-net, we feed all the passages through it to calculate the similarity.

But the q-nets are created in batches, and every batch is represented as a single object `NoTorchSequential`. (Check out the `RepeatedDenseBlockConverter` class in q_net.py for more info)

This object expects an input in the shape (N, M, H):

* N = number of queries (mentions)

* M = number of passages (entities)

* H = Hidden dimension (e.g., 768 for BERT)



---



The passage embeddings have the shape (M, H) so we must create an additional dimension of size N.

This will be done like so:
`passages_batch = passages.unsqueeze(0).expand(num_queries, -1, -1)`

* `.unsqueeze()` adds a new dimension (in our case at location 0)

* `.expand()` "expands" that new dimension to be size "num_queries"

* `.expand()` creates a view, so it costs almost no extra memory (compared to `.repeat()`, which allocates a new tensor and copies the data N times)

# Q-nets take **a lot** of memory.

Instead of creating all of them and then doing the similarity calculation, we will create batches and calculate similarities for just those q-nets, then discard those q-nets and move on to the next batch.

In [None]:
def batch_encode_queries(encoder, queries, passage_embeddings, batch_size=8, device=None):
  """Build q-nets for the queries batch by batch and score every passage with them.

  Only one batch of q-nets exists at a time: its scores are moved to the CPU
  before the next batch is created.

  Args:
    encoder: callable module invoked as ``encoder(input_ids=..., attention_mask=...)``;
      the ``.representation`` attribute of its output must be callable on a
      passage tensor and expose ``num_queries``.
    queries: mapping with ``"input_ids"`` and ``"attention_mask"`` tensors whose
      first dimension indexes the queries.
    passage_embeddings: tensor of passage embeddings, shape (M, H).
    batch_size: number of queries turned into q-nets at once (default 8,
      the value that used to be hard-coded).
    device: device used for the computation. Defaults to the device of the
      encoder's first parameter (CPU if it has none).

  Returns:
    A CPU tensor with the per-batch score tensors concatenated along dim 0
    (one leading row per query).

  Raises:
    ValueError: if ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be >= 1, got {batch_size}")

  if device is None:
    # Follow the model instead of relying on a notebook-level global.
    first_param = next(encoder.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
  device = torch.device(device)

  # Mixed precision only on CUDA; requesting it on a CPU runtime just triggers a warning.
  use_amp = device.type == "cuda"

  input_ids = queries["input_ids"]
  attention_mask = queries["attention_mask"]
  num_queries = input_ids.shape[0]

  # The passage matrix is identical for every batch, so transfer it once
  # instead of re-copying it to the device on each iteration.
  passages_on_device = passage_embeddings.to(device)

  similarity_scores = []

  with torch.no_grad():
    for start in tqdm(range(0, num_queries, batch_size), desc="Creating q-nets and calculating similarity scores"):
      stop = start + batch_size

      # create q-nets
      with autocast("cuda", enabled=use_amp):
        q_nets = encoder(
            input_ids=input_ids[start:stop].to(device),
            attention_mask=attention_mask[start:stop].to(device),
        ).representation

      # Use q_nets.num_queries rather than batch_size: the last batch can be
      # smaller when the number of queries is not divisible by batch_size.
      # expand() only creates a view, so no passage data is duplicated.
      passages_batch = passages_on_device.unsqueeze(0).expand(q_nets.num_queries, -1, -1)

      # calculate similarity
      batch_scores = q_nets(passages_batch)
      similarity_scores.append(batch_scores.detach().cpu())

  return torch.cat(similarity_scores, dim=0)


In [None]:
# Score every query against every passage embedding; q-nets are built and discarded batch by batch.
similarity_scores = batch_encode_queries(query_encoder, query_inputs, passage_embeddings)

In [None]:
# Case 1 - comparing a query to its respective passage

# In the simple case where each q_net only takes one passage, we can just
# reshape the passage_embeddings to (N, 1, H).
# passage_embeddings_single = passage_embeddings.unsqueeze(1)
# print(f"passage_embeddings shape: {passage_embeddings_single.shape}")
# giving the neural network the input of passage_embeddings
# the output provides the relevance score of query 1 against passage 1, query 2 against passage 2, etc...
# scores = q_nets(passage_embeddings_single)
# print(f"scores: {scores}")

In [None]:
# Case 2 - comparing a query to all passages

# The case where each q_net takes multiple passages
# meaning multiple passages are now associated with each of the queries

# this operation creates a 3D tensor which takes too much memory
# passage_embeddings_multi = passage_embeddings.repeat(N, 1).reshape(N, M, H)
# print(f"passage_embeddings shape: {passage_embeddings_multi.shape}")


# unbatched similarity scores for q-nets
# similarity_scores = q_nets(passage_embeddings_multi)
# print(f"similarity_scores shape: {similarity_scores.shape}")
#print(f"similarity_scores: {similarity_scores}")


In [None]:
# Display the score tensor returned by batch_encode_queries (per-batch scores concatenated along dim 0).
similarity_scores

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def evaluate(train_pairs, scores=None, verbose=True):
  """Report top-1 accuracy of the similarity scores against the paired entity ids.

  For query ``i`` the predicted passage is the arg-max of ``scores[i]``; the
  prediction counts as correct when the predicted row and row ``i`` carry the
  same ``"id"`` value (so another row of the same entity is also a hit).

  Args:
    train_pairs: indexable by column name, providing ``"id"``, ``"mention"`` and
      ``"entity"`` sequences aligned with the rows/columns of ``scores``.
    scores: similarity tensor with queries on dim 0 and passages on dim 1.
      Defaults to the notebook-level ``similarity_scores``.
    verbose: when True (default) print mention / gold entity / top match for
      every query, as before; set to False to print only the summary.

  Returns:
    None. Results are printed.
  """
  if scores is None:
    scores = similarity_scores  # notebook-level tensor produced by batch_encode_queries

  # Read each column once: looking it up inside the loop re-fetches the whole
  # column for every single query.
  ids = list(train_pairs["id"])
  mentions = list(train_pairs["mention"])
  entities = list(train_pairs["entity"])

  # argmax returns a torch tensor; tolist() yields plain ints usable as list indices.
  top_idxs = torch.argmax(scores, dim=1).flatten().tolist()
  total = len(top_idxs)

  correct_count = 0
  for query_idx, top_idx in enumerate(top_idxs):
    if ids[top_idx] == ids[query_idx]:
      correct_count += 1

    if verbose:
      mention_name = mentions[query_idx]
      top_match = entities[top_idx]
      correct_name = entities[query_idx]
      print(f"mention_name: {mention_name}\ncorrect entity name: {correct_name}\ntop_match: {top_match}\n")

  # Guard the empty case instead of dividing by zero.
  accuracy = correct_count / total if total else 0.0

  print(f"total comparisons: {total}")
  print(f"correct comparisons: {correct_count}")
  print(f"accuracy: {accuracy}")

In [None]:
# Print the per-mention matches and the top-1 accuracy on the training pairs.
evaluate(train_pairs)

In [None]:
# more evaluation methods
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# print(f"{accuracy_score(train_labels, predicted_labels)=:.3f}")
# print(f"{recall_score(train_labels, predicted_labels)=:.3f}")
# print(f"{precision_score(train_labels, predicted_labels)=:.3f}")
# print(f"{f1_score(train_labels, predicted_labels)=:.3f}")