# Sentence-BERT Embedding Generation

This notebook generates dense paper embeddings using a pretrained Sentence-BERT model and writes them to `data/preprocessed/`.

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Folder holding the preprocessed inputs (relative to this notebook)
PRE_DIR = Path("../data/preprocessed")

# Read the cleaned papers table from PRE_DIR
papers_csv = PRE_DIR.joinpath("papers_cleaned.csv")
papers = pd.read_csv(papers_csv)

# Preview the first five rows
papers.head(5)


Unnamed: 0,paper_id,title,abstract,year,topic_primary,paper_recency
0,0,Military staff happy event.,Senior nor ahead consider. Success light capit...,2005,MentalHealth,20
1,1,Possible standard former whether smile.,Maintain hair general let. Character material ...,2001,HIV,24
2,2,Shake evidence yeah cover.,Season education easy space argue. Stage inter...,2001,MCH,24
3,3,Customer lay politics sure pretty.,Detail herself easy miss red. Nor arm line for...,2011,CVD,14
4,4,Write animal forward dark tax if.,Health memory budget matter simply set. None c...,2015,NCD,10


In [4]:
# Load lightweight Sentence-BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Build the list of input texts, one entry per row of `papers`.
# Missing abstracts are replaced with "" before the cast: astype(str) on its
# own turns NaN into the literal string "nan", which would then be embedded
# as if it were real abstract text.
texts = papers["abstract"].fillna("").astype(str).tolist()

# Generate normalized embeddings for cosine similarity use
paper_embeddings = model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)

# The saved paper_id -> row mapping relies on row i of the embeddings
# belonging to row i of `papers`, so fail loudly if the counts ever diverge.
assert paper_embeddings.shape[0] == len(papers), (
    f"expected {len(papers)} embeddings, got {paper_embeddings.shape[0]}"
)

paper_embeddings.shape

Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:03<00:00,  9.56it/s]


(1000, 384)

In [5]:
# Write the embedding matrix to a .npy file inside PRE_DIR
embeddings_path = PRE_DIR / "paper_embeddings.npy"
np.save(embeddings_path, paper_embeddings)

# Pair each paper_id with the position of its row in the embedding matrix
row_positions = range(len(papers))
mapping = pd.DataFrame({"paper_id": papers["paper_id"], "index": row_positions})

# Write the mapping next to the embeddings, without the DataFrame index
mapping_path = PRE_DIR / "paper_id_to_index.csv"
mapping.to_csv(mapping_path, index=False)

print("Saved embeddings and mapping to data/preprocessed/")

Saved embeddings and mapping to data/preprocessed/
