In [None]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
import ast
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from google.colab import drive
import matplotlib.pyplot as plt
from rank_bm25 import BM25Okapi
from types import SimpleNamespace
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, models

import os

from huggingface_hub import login

# Never hardcode the Hugging Face token in the notebook: read it from the
# HF_TOKEN environment variable instead. When it is not set we skip the login;
# the model-loading cells below only pull checkpoints from the Hub.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token, add_to_git_credential=True)

# Mount Google Drive: every data/embedding path below lives under /content/drive.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load the training split from the shared Drive, report its size and preview it.
train_csv_path = '/content/drive/Shareddrives/Master_Thesis/Data/train.csv'
df_train = pd.read_csv(train_csv_path)
print(f"df_train.shape: {df_train.shape}")
df_train.head()

df_train.shape: (79897, 7)


Unnamed: 0,id,paper id,title,categories,type,content,question
0,130,2501.00784,cloitre's self-generating sequence,"['math.co', 'cs.dm', 'cs.fl', 'math.nt']",theorems,Let $g_n$ be the number of $1$'s in the sequen...,What is the limit of the proportion of 1's in ...
1,265,2501.00809,initial ideals of weighted forms and the genus...,"['math.ac', 'math.ag']",theorems,\label{ThmConjAreTrue}\nConjectures \ref{Conj1...,Does the statement of \textbf{ThmConjAreTrue} ...
2,266,2501.00809,initial ideals of weighted forms and the genus...,"['math.ac', 'math.ag']",propositions,}\n\newcommand{\ep}{,\\emph{Is the statement \emph{If $X$ is a comp...
3,267,2501.00809,initial ideals of weighted forms and the genus...,"['math.ac', 'math.ag']",definitions,}\n\newcommand{\ed}{,Is the statement $\ed{True}$?
4,313,2501.00845,spectral spaces of normal subgroups,"['math.gr', 'math.gn']",theorems,\label{mth}\nLet $G$ be a group having a maxim...,Does the set $\mathcal{N}^+(G)$ of proper norm...


# Building BM25 Index

## Define the main paper categories used to partition the BM25 index

In [None]:
# The CSV stores each category list as its string repr (e.g. "['math.co', 'cs.dm']").
# Parse only values that are still strings so that re-running this cell is safe:
# calling ast.literal_eval on an already-parsed list raises ValueError.
df_train['categories'] = df_train['categories'].apply(
    lambda cats: ast.literal_eval(cats) if isinstance(cats, str) else cats
)

# 1. Define the selected top categories (each one gets its own BM25 index)
selected_cats = [
    'math.co', 'math.ap', 'math.ag', 'math.pr', 'math.nt', 'math.oc', 'math.mp',
    'math-ph', 'math.ds', 'math.dg', 'math.fa', 'math.rt', 'math.gr', 'math.lo',
    'math.gt', 'math.ca', 'math.it', 'cs.it', 'cs.lg', 'math.na', 'cs.na',
    'math.at', 'stat.th', 'math.st', 'math.ra', 'math.ct', 'stat.ml',
    'math.qa', 'math.ac', 'math.oa', 'math.mg', 'math.cv', 'math.sp'
]
# Set copy for O(1) membership tests; the list above keeps the bucket order.
selected_cat_set = frozenset(selected_cats)

# 2. Bucketize train DataFrame: keep only the selected categories of each row
#    (in their original order); rows with none of them fall into 'math.other'.
df_train['buckets'] = df_train['categories'].apply(
    lambda cats: [c for c in cats if c in selected_cat_set] or ['math.other']
)

## Build the BM25 indexes by category (for computational efficiency)

In [None]:
# Three parallel lookup tables, all keyed by bucket name:
#   bm25_by_bucket               -> BM25Okapi index over the bucket's passages
#   passage_ids_by_bucket        -> passage ids, in the same order as the index
#   tokenized_passages_by_bucket -> the whitespace-tokenized passages themselves
bm25_by_bucket, passage_ids_by_bucket, tokenized_passages_by_bucket = {}, {}, {}

all_buckets = selected_cats + ['math.other']
for bucket in all_buckets:
    # Rows whose bucket list contains the current bucket.
    in_bucket = df_train['buckets'].map(lambda tags: bucket in tags)
    bucket_rows = df_train[in_bucket]

    bucket_ids = bucket_rows['id'].tolist()
    tokenized = [passage.split() for passage in bucket_rows['content'].tolist()]

    # Store index, ids and tokens under the same key so positions line up.
    bm25_by_bucket[bucket] = BM25Okapi(tokenized)
    passage_ids_by_bucket[bucket] = bucket_ids
    tokenized_passages_by_bucket[bucket] = tokenized

print(f"Built BM25 indexes for {len(bm25_by_bucket)} buckets.")


Built BM25 indexes for 34 buckets.


## Main function

In [None]:
def mine_hard_negs(
    q_text: str,
    q_emb: np.ndarray,
    true_pid: int,
    buckets: list[str],
    bm25_by_bucket: dict,
    passage_ids_by_bucket: dict,
    paper_ids: dict,
    pid_to_embidx: dict[int,int],
    passage_embs: np.ndarray,
    k_bm25: int = 50,
    k_emb: int   = 50,
    top_m: int   = 3
) -> list[int]:
    """
    Find the top-m hard negatives for one query using rank_bm25.

    The query is scored against the BM25 index of every bucket it belongs to,
    the per-bucket shortlists are merged (one entry per passage), and the merged
    shortlist is re-ranked by the dot product between passage and query embedding.

    Args:
      q_text           the question string (whitespace-tokenized for BM25)
      q_emb            embedding of q_text (expected to be L2-normalized so that
                       the dot product equals cosine similarity)
      true_pid         the statement ID of the correct (positive) passage
      buckets          list of category tags (e.g. ['math.co','math.other'])
      bm25_by_bucket   mapping bucket -> BM25Okapi index for that bucket
      passage_ids_by_bucket
                       mapping bucket -> list of statement IDs in that index,
                       in the same order as the index's documents
      paper_ids        mapping statement ID -> paper ID (for same-paper exclusion)
      pid_to_embidx    mapping passage ID -> row index in passage_embs
      passage_embs     array of shape (N_passages, dim) with passage embeddings
      k_bm25           number of BM25 candidates to take per bucket, and also the
                       size of the merged shortlist
      k_emb            number of candidates to keep after embedding scoring
      top_m            final number of hard negatives to return

    Returns:
      A list of up to top_m *distinct* passage IDs, ordered by decreasing
      embedding similarity, excluding every passage from the same paper as
      true_pid (which also excludes true_pid itself). Empty if no candidate
      survives the filtering.
    """
    q_tokens = q_text.split()
    true_paper = paper_ids[true_pid]

    # 1) Lexical shortlist in each bucket.
    # A passage tagged with several of the query's buckets is retrieved once per
    # bucket, so we key the candidates by passage ID and keep its best BM25 score;
    # otherwise the same passage could be returned twice as a "hard negative".
    best_bm25 = {}  # passage_id -> highest BM25 score seen in any bucket
    for b in buckets:
        ids = passage_ids_by_bucket[b]
        scores = np.asarray(bm25_by_bucket[b].get_scores(q_tokens))

        # how many to pick from this bucket?
        pick_k = min(k_bm25, len(scores))
        if pick_k <= 0:
            continue

        # indices of the pick_k highest scores (unordered)
        for idx in np.argpartition(scores, -pick_k)[-pick_k:]:
            pid = ids[idx]
            # exclude same-paper passages (this includes the positive itself)
            if paper_ids[pid] == true_paper:
                continue
            score = scores[idx]
            if pid not in best_bm25 or score > best_bm25[pid]:
                best_bm25[pid] = score

    if not best_bm25:
        return []

    # keep the global top k_bm25 by BM25 score
    ranked = sorted(best_bm25.items(), key=lambda item: item[1], reverse=True)[:k_bm25]
    cand_ids = [pid for pid, _ in ranked]

    # 2) Semantic rerank: similarity of each candidate to the query embedding
    emb_idxs = [pid_to_embidx[pid] for pid in cand_ids]
    sims = (passage_embs[emb_idxs] @ q_emb).flatten()

    # safety for embedding shortlist
    pick_e = min(k_emb, len(sims))
    if pick_e <= 0:
        return []

    emb_top = np.argpartition(sims, -pick_e)[-pick_e:]
    # order that shortlist by decreasing similarity and keep the first top_m
    sorted_top = emb_top[np.argsort(sims[emb_top])[::-1][:top_m]]
    return [cand_ids[i] for i in sorted_top]

## ModernBERT DAPT

In [None]:
# Hub repository of the domain-adapted (DAPT) ModernBERT checkpoint.
repo_id = "Master-thesis-NAP/ModernBert-DAPT-math"

# Token-level encoder loaded from that repository (model weights + tokenizer).
word_model = models.Transformer(repo_id)

# Mean pooling over the token embeddings gives one vector per input text.
embedding_dim = word_model.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

# Chain encoder and pooling into a single SentenceTransformer.
model = SentenceTransformer(modules=[word_model, pooling])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [None]:
# 1. Extract question texts
questions = df_train['question'].tolist()

# 2. Encode into embeddings (L2-normalized)
query_embs = model.encode(
    questions,
    batch_size=32,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True #important: normalize so that a dot product == cos similarity
)

# 3. Inspect dimensions
print(f"Generated query_embs with shape: {query_embs.shape}")

# 4. Save embeddings for later reuse
np.save('/content/drive/Shareddrives/Master_Thesis/Fine-Tuning/Embeddings/query_embs_modernbert_dapt.npy', query_embs)

Batches:   0%|          | 0/2497 [00:00<?, ?it/s]

W0523 13:24:47.665000 269 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Generated query_embs with shape: (79897, 768)


In [None]:
# The statement texts are the passages to retrieve.
passages = df_train['content'].tolist()

# Embed every passage, normalized in the same way as the query embeddings.
encode_kwargs = dict(
    batch_size=128,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)
passage_embs = model.encode(passages, **encode_kwargs)

print("passage_embs shape:", passage_embs.shape)  # (n_passages, dim)

# Cache the embeddings on Drive so later sessions can reload them.
passage_embs_path = '/content/drive/Shareddrives/Master_Thesis/Fine-Tuning/Embeddings/passage_embs_modernbert_dapt.npy'
np.save(passage_embs_path, passage_embs)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

passage_embs shape: (79897, 768)


In [None]:
#upload
# Reload the cached ModernBERT-DAPT embeddings from Drive (skips re-encoding).
query_embs_path = '/content/drive/Shareddrives/Master_Thesis/Fine-Tuning/Embeddings/query_embs_modernbert_dapt.npy'
passage_embs_path = '/content/drive/Shareddrives/Master_Thesis/Fine-Tuning/Embeddings/passage_embs_modernbert_dapt.npy'
query_embs = np.load(query_embs_path)
passage_embs = np.load(passage_embs_path)

# ModernBERT base (no DAPT)

### Load model

In [None]:
# Hub repository of the original ModernBERT base checkpoint (no DAPT).
repo_id = "answerdotai/ModernBERT-base"

# Token-level encoder loaded from that repository (model weights + tokenizer).
word_model = models.Transformer(repo_id)

# Mean pooling over the token embeddings gives one vector per input text.
embedding_dim = word_model.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

# Chain encoder and pooling into a single SentenceTransformer.
# NOTE: this rebinds `model`, replacing the DAPT model loaded earlier.
model = SentenceTransformer(modules=[word_model, pooling])

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

### Embed questions

In [None]:
# 1. Extract question texts
questions = df_train['question'].tolist()

# 2. Encode into embeddings (L2-normalized)
query_embs = model.encode(
    questions,
    batch_size=32,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True #important: normalize so that a dot product == cos similarity
)

# 3. Inspect dimensions
print(f"Generated query_embs with shape: {query_embs.shape}")

# 4. Save embeddings for later reuse
np.save('/content/drive/Shareddrives/Master_Thesis/Fine-Tuning/Embeddings/query_embs_modernbert_base.npy', query_embs)

Batches:   0%|          | 0/2497 [00:00<?, ?it/s]

W0524 11:53:49.273000 389 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Generated query_embs with shape: (79897, 768)


### Embed statements

In [None]:
# The statement texts are the passages to retrieve.
passages = df_train['content'].tolist()

# Embed every passage, normalized in the same way as the query embeddings.
encode_kwargs = dict(
    batch_size=128,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)
passage_embs = model.encode(passages, **encode_kwargs)

print("passage_embs shape:", passage_embs.shape)  # (n_passages, dim)

# Cache the embeddings on Drive so later sessions can reload them.
passage_embs_path = '/content/drive/Shareddrives/Master_Thesis/Fine-Tuning/Embeddings/passage_embs_modernbert_base.npy'
np.save(passage_embs_path, passage_embs)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

passage_embs shape: (79897, 768)


# Implementation

In [None]:
# Ensure df_train is reset so that indexes allign with embeddings
df_train = df_train.reset_index(drop=True)

passage_ids = df_train['id'].tolist()
paper_ids   = dict(zip(df_train['id'], df_train['paper id']))
pid_to_embidx = {pid: idx for idx, pid in enumerate(passage_ids)}
# query_embs, passage_embs already loaded

hard_negs_by_query = {}
for i, row in tqdm(df_train.iterrows(), total=len(df_train)):
    stmt_id = row['id']
    hard_negs_by_query[stmt_id] = mine_hard_negs(
        q_text           = row['question'],
        q_emb            = query_embs[i],
        true_pid         = stmt_id,
        buckets          = row['buckets'],
        bm25_by_bucket   = bm25_by_bucket,
        passage_ids_by_bucket = passage_ids_by_bucket,
        paper_ids        = paper_ids,
        pid_to_embidx    = pid_to_embidx,
        passage_embs     = passage_embs,
        k_bm25           = 50,
        k_emb            = 50,
        top_m            = 3
    )

# 2. Save the result to disk
with open('/content/drive/Shareddrives/Master_Thesis/Fine-Tuning/Embeddings/hard_negatives_modernbert_base.pkl', 'wb') as f:
    pickle.dump(hard_negs_by_query, f)

print("Completed hard negative mining. Results saved to 'hard_negatives_modernbert_dapt.pkl'.")

100%|██████████| 79897/79897 [1:13:34<00:00, 18.10it/s]

Completed hard negative mining. Results saved to 'hard_negatives_modernbert_dapt.pkl'.





# Inspection

In [None]:
# Reload the mined hard negatives. pickle.load executes whatever the file
# contains, so only ever point this at files produced by this notebook.
hard_negs_path = '/content/drive/Shareddrives/Master_Thesis/Fine-Tuning/Embeddings/hard_negatives_modernbert_base.pkl'
with open(hard_negs_path, 'rb') as f:
    hard_negs = pickle.load(f)

# Basic properties
print("Type:", type(hard_negs))                     # should be <class 'dict'>
print("Number of queries:", len(hard_negs))         # approx your train set size

# Peek at the first entry (dicts iterate in insertion order)
some_id, some_negs = next(iter(hard_negs.items()))
print(f"Query id {some_id} → hard negatives:", some_negs)

Type: <class 'dict'>
Number of queries: 79897
Query id 130 → hard negatives: [38408, 28733, 85275]


In [None]:
# Show only the first few entries: displaying the whole dict (one entry per
# training example) floods the notebook output.
dict(list(hard_negs.items())[:10])

{130: [38408, 28733, 85275],
 265: [58471, 45711, 70707],
 266: [40966, 59281, 68962],
 267: [84604, 15823, 84607],
 313: [47045, 4089, 3110],
 314: [20325, 65886, 33609],
 423: [72249, 72249, 72256],
 424: [72249, 72249, 72268],
 425: [72253, 72253, 74981],
 1147: [23838, 80189, 24112],
 1148: [24113, 8396, 64982],
 1149: [4436, 54999, 11778],
 2042: [67978, 67978, 17970],
 2043: [4610, 4610, 49632],
 2044: [77498, 77498, 44803],
 2163: [64051, 19169, 79203],
 2164: [76169, 26767, 6400],
 2165: [38544, 17107, 17124],
 2366: [73130, 73128, 12881],
 2367: [26307, 30313, 10827],
 2368: [2047, 2045, 80810],
 2493: [62835, 62835, 45233],
 2494: [5043, 5043, 50333],
 3609: [42375, 42375, 34946],
 3610: [78274, 78274, 16018],
 3611: [55419, 55419, 42355],
 3673: [71814, 71823, 70342],
 3674: [27807, 38417, 17787],
 3760: [39058, 39058, 33522],
 3761: [4531, 4531, 37558],
 3762: [18816, 18816, 26100],
 3964: [34835, 37086, 8704],
 3988: [83499, 34667, 74248],
 3989: [38421, 38070, 51496],
 39

In [None]:
# Iterator over the query ids of hard_negs; the next cell can be re-run to step
# through the entries one at a time. NOTE: this rebinds the global name `i`.
i= iter(hard_negs)

In [None]:
# Advance the iterator by one query and show that query's hard negatives.
some_id = next(i)
negs_for_query = hard_negs[some_id]
print(f"Query id {some_id} → hard negatives:", negs_for_query)

Query id 265 → hard negatives: [58471, 45711, 70707]


In [None]:
# Preview the first rows, now including the derived `buckets` column.
df_train.head(n=5)

Unnamed: 0,id,paper id,title,categories,type,content,question,buckets
0,130,2501.00784,cloitre's self-generating sequence,"[math.co, cs.dm, cs.fl, math.nt]",theorems,Let $g_n$ be the number of $1$'s in the sequen...,What is the limit of the proportion of 1's in ...,"[math.co, math.nt]"
1,265,2501.00809,initial ideals of weighted forms and the genus...,"[math.ac, math.ag]",theorems,\label{ThmConjAreTrue}\nConjectures \ref{Conj1...,Does the statement of \textbf{ThmConjAreTrue} ...,"[math.ac, math.ag]"
2,266,2501.00809,initial ideals of weighted forms and the genus...,"[math.ac, math.ag]",propositions,}\n\newcommand{\ep}{,\\emph{Is the statement \emph{If $X$ is a comp...,"[math.ac, math.ag]"
3,267,2501.00809,initial ideals of weighted forms and the genus...,"[math.ac, math.ag]",definitions,}\n\newcommand{\ed}{,Is the statement $\ed{True}$?,"[math.ac, math.ag]"
4,313,2501.00845,spectral spaces of normal subgroups,"[math.gr, math.gn]",theorems,\label{mth}\nLet $G$ be a group having a maxim...,Does the set $\mathcal{N}^+(G)$ of proper norm...,[math.gr]


In [None]:
# Look up the row of hard negative 58471 (first negative listed for query 265).
df_train.loc[df_train['id'].eq(58471)]

Unnamed: 0,id,paper id,title,categories,type,content,question,buckets
70996,58471,2502.02981,a determinant on birational maps of severi-bra...,[math.ag],lemmas,[{\cite[Corollary 2.2.2 with $p=3$]{BSY}}]\n ...,Does a Severi-Brauer surface over a perfect fi...,[math.ag]


In [None]:
# Look up the row of hard negative 45711 (second negative listed for query 265).
df_train.loc[df_train['id'].eq(45711)]

Unnamed: 0,id,paper id,title,categories,type,content,question,buckets
32526,45711,2501.17353,non-smooth regular curves via a descent approach,[math.ag],theorems,\label{Primer Teorema central de este trabajo}...,Does the theorem imply that every regular curv...,[math.ag]


In [None]:
# Look up the row of hard negative 70707 (third negative listed for query 265).
df_train.loc[df_train['id'].eq(70707)]

Unnamed: 0,id,paper id,title,categories,type,content,question,buckets
14029,70707,2502.08153,"stable rationality of hypersurfaces in sch\""{o...",[math.ag],theorems,[See Theorem \ref{thm: d in Grassmannian}]\lab...,Is the stable rationality of a very general hy...,[math.ag]


In [None]:
# The original cell (`0 in hard_negs[]`) is a SyntaxError: the subscript is empty
# and the intended query id is not recoverable from the notebook.
# NOTE(review): replaced by a check over *all* queries — does passage id 0 occur
# in any hard-negative list? Swap in `0 in hard_negs[<query_id>]` if a single
# query was meant.
any(0 in negs for negs in hard_negs.values())

True