<a href="https://colab.research.google.com/github/Steve-Falkovsky/Hypencoder-Entity-Linking/blob/main/notebooks/nameonly_hard_negative_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import importlib.util

REPO_NAME = "Hypencoder-Entity-Linking"
GIT_URL = f"https://github.com/Steve-Falkovsky/{REPO_NAME}.git"
BRANCH_NAME = "main"

if not os.path.exists(REPO_NAME):
    !git clone -b {BRANCH_NAME} --single-branch {GIT_URL}

    # Move into the downloaded repo (The Root)
    os.chdir(REPO_NAME)


%pip install -q -e "./hypencoder-paper"

os.chdir("hypencoder-paper")

print(f"üìç Working Directory is now: {os.getcwd()}")
print("‚úÖ Environment Ready!")

Cloning into 'Hypencoder-Entity-Linking'...
remote: Enumerating objects: 380, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 380 (delta 10), reused 8 (delta 8), pack-reused 367 (from 1)[K
Receiving objects: 100% (380/380), 815.75 KiB | 15.69 MiB/s, done.
Resolving deltas: 100% (201/201), done.
  Preparing metadata (setup.py) ... [?25l[?25hdone
üìç Working Directory is now: /content/Hypencoder-Entity-Linking/hypencoder-paper
‚úÖ Environment Ready!


In [2]:
from datasets import load_dataset

# there are all "positive" pairs"
dataset = load_dataset("Stevenf232/BC5CDR_MeSH2015_nameonly")

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

name_only_train.jsonl: 0.00B [00:00, ?B/s]

name_only_val.jsonl: 0.00B [00:00, ?B/s]

name_only_test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/2654 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2559 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2656 [00:00<?, ? examples/s]

In [3]:
train_pairs = dataset['train']
print(train_pairs)

mention_names = train_pairs['mention']
entity_names = train_pairs['entity']
print(mention_names[:3])
print(entity_names[:3])

Dataset({
    features: ['mention', 'entity', 'id'],
    num_rows: 2654
})
['human immunodeficiency virus', "non-Hodgkin's lymphoma", 'renal cell carsinom']
['HIV Infections', 'Lymphoma, Non-Hodgkin', 'Carcinoma, Renal Cell']


### Load the model

In [4]:
# Core Hypencoder model for outputing dense vector representations
from hypencoder_cb.modeling.hypencoder import Hypencoder, HypencoderDualEncoder, TextEncoder
from transformers import AutoTokenizer

model_name = "Stevenf232/SapBERT_freeze_hypencoder"

dual_encoder = HypencoderDualEncoder.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


query_encoder: Hypencoder = dual_encoder.query_encoder
passage_encoder: TextEncoder = dual_encoder.passage_encoder

config.json:   0%|          | 0.00/940 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/483M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/483M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

### Move the model to the GPU

In [5]:
import torch

# Setup the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # This should say 'cuda'

# Move the model to the GPU
passage_encoder.to(device)
query_encoder.to(device)

Using device: cuda


Hypencoder(
  (transformer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### Load datasets and tokenise

In [6]:
# convert from type "datasets" to python list
queries = list(mention_names)
passages = list(entity_names)


# the output of the tokenizer contains 3 fields:
# input_ids, token_type_ids, and attention_mask
# all contain a tensor in the shape (number of queries, max number of tokens)

query_inputs = tokenizer(queries, return_tensors="pt", padding=True, truncation=True)
passage_inputs = tokenizer(passages, return_tensors="pt", padding=True, truncation=True)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
print(f"query_inputs:\n{query_inputs}")
print("\n\n\n")
print(f"passage_inputs:\n{passage_inputs}")

query_inputs:
{'input_ids': tensor([[    2,  2616, 13141,  ...,     0,     0,     0],
        [    2,  2447,    17,  ...,     0,     0,     0],
        [    2,  4604,  2024,  ...,     0,     0,     0],
        ...,
        [    2, 10838,     3,  ...,     0,     0,     0],
        [    2,  4328,  6859,  ...,     0,     0,     0],
        [    2, 13096,  1036,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}




passage_inputs:
{'input_ids': tensor([[    2,  3525,  5024,  ...,     0,     0,     0],
        [    2,  8636,    16,  ...,     0, 

# Passage Encodings


In [8]:
from tqdm import tqdm
import torch
from torch.amp import autocast

def batch_encode_passages(encoder ,passages):
  batch_size=256
  entity_name_features = []

  num_passages = passages["input_ids"].shape[0]

  with torch.no_grad(): # Disable gradient calculation (saves tons of memory)
    for i in tqdm(range(0, num_passages, batch_size), desc="Extracting features"):

        # extract entity features
        # Autocast does the math in fp16 where possible (default is fp32)
        # this will save memory and increase speed. The loss in precision shouldn't matter much (can check on a small sample if we want)
        with autocast("cuda"):
          features = encoder(
              input_ids=passages["input_ids"][i:i + batch_size].to(device),
              attention_mask=passages["attention_mask"][i:i + batch_size].to(device)
            ).representation

          entity_name_features.append(features.detach().cpu()) # Detach and move to CPU to save VRAM/RAM


  features_tensor = torch.cat(entity_name_features, dim=0)

  return features_tensor

In [9]:
passage_embeddings = batch_encode_passages(passage_encoder, passage_inputs)

Extracting features: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:01<00:00,  6.32it/s]


## Now, we create the q-nets.

For each q-net, we feed through it all the passages the calculate the similarity.

But the q-nets are created in batches, and every batch is represented as a single object `NoTorchSequential`. (Check out the `RepeatedDenseBlockConverter` class in q_net.py for more info)

This object expects an input in the shape (N, M, H):

* N = number of queries (mentions)

* M = number of passages (entities)

* H = Hidden dimension (e.g., 768 for BERT)



---



The passage embeddings have the shape (M, H) so we must create an additional dimension of size N.

This will be done like so:
`passages_batch = passages.unsqueeze(0).expand(num_queries, -1, -1)`

* `.unsqueeze()` adds a new dimension (in our case at location 0)

* `.expand()` "expands" that new dimension to be size "num_queries"

* `.expand()` creates a view, so it costs almost 0 memory! (compared to .repeat() which changes the tensor)

# Q-nets take **a lot** of memory.

Instead of creating all of them and then doing the similarity calculation, we will create batches and calculate similarities for just those q-nets, then discard those q-nets and move on to the next batch.

In [10]:
def batch_encode_queries(encoder, queries, passage_embeddings):
  batch_size = 8
  similarity_scores = []

  num_queries = queries["input_ids"].shape[0]

  with torch.no_grad():
    for i in tqdm(range(0, num_queries, batch_size), desc="Creating q-nets and calculating similarity scores"):

        # create q-nets
        with autocast("cuda"):
          q_nets = encoder(
              input_ids=queries["input_ids"][i:i + batch_size].to(device),
              attention_mask=queries["attention_mask"][i:i + batch_size].to(device)
            ).representation


        passages_gpu = passage_embeddings.to(device)

        # Note: we use q_nets.num_queries (our repo's noTorch equivalent of q_nets.shape[0]) instead of batch_size
        # because the total number might not be divisible by batch_size so the last batch might be smaller than the actual batch size
        passages_batch = passages_gpu.unsqueeze(0).expand(q_nets.num_queries, -1, -1)

        # calculate similarity
        batch_scores = q_nets(passages_batch)
        similarity_scores.append(batch_scores.detach().cpu())


  scores_tensor = torch.cat(similarity_scores, dim=0)
  return scores_tensor


In [11]:
similarity_scores = batch_encode_queries(query_encoder, query_inputs, passage_embeddings)

Creating q-nets and calculating similarity scores: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 332/332 [00:11<00:00, 28.90it/s]


In [12]:
# Case 1 - comparing a query to its respective passage

# In the simple case where each q_net only takes one passage, we can just
# reshape the passage_embeddings to (N, 1, H).
# passage_embeddings_single = passage_embeddings.unsqueeze(1)
# print(f"passage_embeddings shape: {passage_embeddings_single.shape}")
# giving the nueral network the input of passage_embeddings
# the output provides the relevance score of query 1 against passage 1, query 2 against passage 2, etc...
# scores = q_nets(passage_embeddings_single)
# print(f"scores: {scores}")

In [13]:
# Case 2 - comparing a query to all passages

# The case where each q_net takes multiple passages
# meaning multiple passages are now associated with each of the queries

# this operation creates a 3D tensor which takes too much memory
# passage_embeddings_multi = passage_embeddings.repeat(N, 1).reshape(N, M, H)
# print(f"passage_embeddings shape: {passage_embeddings_multi.shape}")


# unbatched similarity scores for q-nets
# similarity_scores = q_nets(passage_embeddings_multi)
# print(f"similarity_scores shape: {similarity_scores.shape}")
#print(f"similarity_scores: {similarity_scores}")


In [14]:
similarity_scores

tensor([[[14.2939],
         [ 3.3896],
         [ 2.0139],
         ...,
         [ 3.8044],
         [ 4.3594],
         [-1.4324]],

        [[ 1.7608],
         [18.0904],
         [ 3.9561],
         ...,
         [ 0.3262],
         [ 1.6085],
         [-1.9267]],

        [[ 2.1958],
         [ 8.3981],
         [11.8573],
         ...,
         [ 6.4257],
         [ 2.4701],
         [ 0.1667]],

        ...,

        [[ 0.7215],
         [ 3.1691],
         [-0.2798],
         ...,
         [ 8.9770],
         [ 2.8766],
         [-0.0259]],

        [[ 2.8675],
         [ 2.5784],
         [-0.8699],
         ...,
         [ 4.5016],
         [19.4824],
         [ 3.7884]],

        [[ 2.8397],
         [ 3.0975],
         [-0.3504],
         ...,
         [ 4.9332],
         [ 3.3746],
         [ 2.8347]]])

In [15]:
similarity_scores.shape

torch.Size([2654, 2654, 1])

## Create a dataset of Hard Negatives based on "Negative Hard Mining"
We take the top "incorrect" item similarities of each query as negatives

In [16]:
"""
Desired format for each line in the JSONL file:
{
  "query": {
    "id": query ID,
    "content": query text,
  },
  "items": [
    {
      "id": passage ID,
      "content": passage text,
      "score": Optional teacher score,
      "type": Sometimes used to specify type of item,
    },
    {
        # another item
    },
  ]
}

Contrastive Loss with Hard Negatives: The positive must be the first item, all following items
will be treated as negative
"""

'\nDesired format for each line in the JSONL file:\n{\n  "query": {\n    "id": query ID,\n    "content": query text,\n  },\n  "items": [\n    {\n      "id": passage ID,\n      "content": passage text,\n      "score": Optional teacher score,\n      "type": Sometimes used to specify type of item,\n    },\n    {\n        # another item\n    },\n  ]\n}\n\nContrastive Loss with Hard Negatives: The positive must be the first item, all following items\nwill be treated as negative\n'

In [37]:
from pathlib import Path
import json
import torch

def write_hardneg_contrastive_jsonl(
    pairs,
    similarity_scores: torch.Tensor,
    output_jsonl_path: str | Path,
    num_negatives: int = 8,
):
    """
    For each query:
      - items[0] = the correct entity
      - items[1:1+num_negatives] = top scoring incorrect entities
    """

    output_jsonl_path = Path(output_jsonl_path)
    output_jsonl_path.parent.mkdir(parents=True, exist_ok=True)

    # Ensure CPU tensor for safe indexing
    similarity_scores = similarity_scores.detach().cpu()


    # check shapes are correct
    N, M, _ = similarity_scores.shape
    if len(pairs["mention"]) != N:
        raise ValueError(f"Mismatch: scores have N={N}, but pairs has {len(pairs['mention'])} mentions")
    if len(pairs["entity"]) != M:
        raise ValueError(f"Mismatch: scores have M={M}, but pairs has {len(pairs['entity'])} entities")


    # generate the jsonl entires
    with output_jsonl_path.open("w", encoding="utf-8") as f:
        for i in range(N):
            q_id = pairs["id"][i]
            mention = pairs["mention"][i]

            # Positive
            pos_item = {
                "id": pairs["id"][i],
                "content": pairs["entity"][i],
                "score": None,
                "type": None,
            }



            # Get top candidates, then filter out the true positive index

            # +32 is for buffer since our dataset contains pairs of unique mentions which
            # might refer to the same entity. so we might find duplicate positive entity matches.
            k=num_negatives+32
            _, top_idxs = torch.topk(similarity_scores[i], dim=0, k=k, largest=True, sorted=True)

            top_idxs = top_idxs.squeeze(1) # make it into a list of scalars instead of a list of vectors with dimension size 1

            neg_items = []
            for j in top_idxs.tolist():

                # avoid duplicate positives
                if pairs["id"][i] == pairs["id"][j]:
                    continue

                # add negatives
                neg_items.append(
                    {
                        "id": pairs["id"][j],
                        "content": pairs["entity"][j],
                        "score": None,
                        "type": None,
                    }
                )
                if len(neg_items) >= num_negatives:
                    break

            if len(neg_items) < num_negatives:
                raise RuntimeError(f"Only collected {len(neg_items)} negatives for query {i} (M={M}).")

            entry = {
                "query": {"id": q_id, "content": mention},
                "items": [pos_item, *neg_items],
            }
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")

    print(f"Wrote {N} lines to {output_jsonl_path}")

In [38]:
# generate jsonl for contrastiev loss for train split
write_hardneg_contrastive_jsonl(
    pairs=train_pairs,
    similarity_scores=similarity_scores,
    output_jsonl_path="bc5cdr_train_hypencoder_contrastive.jsonl",
    num_negatives=8,
)

Wrote 2654 lines to bc5cdr_train_hypencoder_contrastive.jsonl


In [None]:
# generate jsonl for contrastive loss for train/val/test splits

# data_splits = ("train", "val", "test")
# seen = set()
# splits = [s for s in data_splits if (s in dataset and not (s in seen or seen.add(s)))]

# for split in splits:
#     pairs = dataset[split]

#     # build query/passage lists for this split
#     queries = list(pairs["mention"])
#     passages = list(pairs["entity"])

#     # tokenize
#     query_inputs = tokenizer(queries, return_tensors="pt", padding=True, truncation=True)
#     passage_inputs = tokenizer(passages, return_tensors="pt", padding=True, truncation=True)

#     # encode + score
#     passage_embeddings = batch_encode_passages(passage_encoder, passage_inputs)
#     similarity_scores = batch_encode_queries(query_encoder, query_inputs, passage_embeddings)

#     # write jsonl
#     write_hardneg_contrastive_jsonl(
#         pairs=pairs,
#         similarity_scores=similarity_scores,
#         output_jsonl_path=f"bc5cdr_{split}_hypencoder_contrastive.jsonl",
#         num_negatives=8,
#     )

## Upload dataset to HuggingFace

In [40]:
from huggingface_hub import HfApi

repo_id="Stevenf232/BC5CDR_nameonly_hard_negative_mining"

api = HfApi()
api.create_repo(
    repo_id=repo_id,
    repo_type="dataset",
    private=False
)

api.upload_file(
    path_or_fileobj="bc5cdr_train_hypencoder_contrastive.jsonl",
    path_in_repo="bc5cdr_train_hypencoder_contrastive.jsonl",
    repo_id=repo_id,
    repo_type="dataset"
)


CommitInfo(commit_url='https://huggingface.co/datasets/Stevenf232/BC5CDR_nameonly_hard_negative_mining/commit/0c00291a728ac96cf059795f3672d58950e5ecfc', commit_message='Upload bc5cdr_train_hypencoder_contrastive.jsonl with huggingface_hub', commit_description='', oid='0c00291a728ac96cf059795f3672d58950e5ecfc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Stevenf232/BC5CDR_nameonly_hard_negative_mining', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Stevenf232/BC5CDR_nameonly_hard_negative_mining'), pr_revision=None, pr_num=None)