# Embedding

In [1]:
import pandas as pd

df = pd.read_csv('../data/pubmed_baseline/csv/pubmed25n1274.csv')
df.head()

Unnamed: 0,PMID,Title,Abstract,Authors,Year,Journal,Keyword
0,39764487,Can gut microbiota explain acute diverticuliti...,Why patients with symptomatic uncomplicated di...,"Antonio Tursi, Giorgia Procaccianti, Federica ...",2025.0,"Bioscience of microbiota, food and health",acute diverticulitis&gut microbiota&symptomati...
1,39764488,Effects of moderate beer consumption on immuni...,Beer contains a variety of bioactive ingredien...,"Shumin Hu, Hua Yin, Xiaxia Li, Minghao Fan, Hu...",2025.0,"Bioscience of microbiota, food and health",alcoholic beer&cyclophosphamide&gut microbiome...
2,39764489,Gut microbiota involvement in the effect of wa...,The beneficial effects of water-soluble dietar...,"Satoshi Sato, Chikara Iino, Daisuke Chinda, Ta...",2025.0,"Bioscience of microbiota, food and health",fatty liver&fibrosis&gut microbiota&water-solu...
3,39764490,Impact of container type on the microbiome of ...,"Airag, a fermented mare's milk in Mongolia, ex...","Akari Shinoda, Yuri Koga, Ryouta Tsuchiya, Bat...",2025.0,"Bioscience of microbiota, food and health",Lactobacillus helveticus&Mongolia&airag&fermen...
4,39764493,α-Lipoic acid increases phagocytosis of some l...,Phagocytosis by immunocompetent cells is a key...,"Naoto Nomura, Nobuo Miyadai, Ichiro Kawase",2025.0,"Bioscience of microbiota, food and health",CD36&flow cytometer&high-performance liquid ch...


In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

# 選擇設備
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 載入 Tokenizer & Model
# model_name = "jinaai/jina-embeddings-v3"
model_name = "abhinand/MedEmbed-base-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device)

In [4]:
# 修正：將 BFloat16 轉換為 Float32
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs).last_hidden_state  # 所有 token 的輸出
    embedding = outputs.mean(dim=1)  # 均值池化
    return embedding.to(torch.float32).cpu().numpy().flatten()

In [5]:
query = "Image & Contrastive Learning"
embedding = generate_embedding(query)
print(embedding.shape)  # 應該是 (768,) 或 (1024,) 取決於模型

(768,)


# Vector Database

In [6]:
import faiss
import numpy as np

# 設定 embedding 維度（根據 jinaai/jina-embeddings-v3，應該是 1024 維）
embedding_dim = 768  
index = faiss.IndexFlatIP(embedding_dim)  # 使用 L2 距離（內積 IndexFlatIP 也可以）

# 儲存 metadata
paper_metadata = []

In [7]:
import pandas as pd
from tqdm import tqdm

# 計算 embedding 並存入 FAISS
all_embeddings = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    text = f"{row['Title']} {row['Abstract']}"  # 合併標題與摘要
    embedding = generate_embedding(text)  # 轉換成 embedding
    all_embeddings.append(embedding)
    paper_metadata.append({"PMID": row["PMID"], "Title": row["Title"], "Abstract": row["Abstract"], "Year": row["Year"]})

# 轉換為 FAISS 格式並儲存
all_embeddings = np.array(all_embeddings).astype("float32")
index.add(all_embeddings)

# 儲存 FAISS 索引與 metadata
faiss.write_index(index, "pubmed_index.faiss")
np.save("pubmed_metadata.npy", paper_metadata)

100%|██████████| 11553/11553 [01:21<00:00, 141.96it/s]


In [None]:
import numpy as np

def search_papers(query, top_k=5, threshold=0.5):
    index = faiss.read_index("pubmed_index.faiss")
    metadata = np.load("pubmed_metadata.npy", allow_pickle=True)

    query_emb = np.array(generate_embedding(query)).reshape(1, -1).astype("float32")

    # 執行檢索
    D, I = index.search(query_emb, top_k)

    # 過濾掉相似度低於 threshold 的結果
    results = []
    for i, idx in enumerate(I[0]):
        if idx < len(metadata) and D[0][i] >= threshold:
            paper = metadata[idx]
            if paper and isinstance(paper.get("Title"), str) and isinstance(paper.get("Abstract"), str):
                results.append(paper)

    return results

# 測試查詢
query = "Kidney ultrasound"
search_results = search_papers(query, top_k=3)
for paper in search_results:
    print(f"Title: {paper.get('Title', 'N/A')}\nAbstract: {paper.get('Abstract', 'N/A')}\nYear: {paper.get('Year', 'N/A')}\n")

Title: The utility of renal sonographic measurements in differentiating children with grades 2, 3, and 4 hydronephrosis.
Abstract: Prior analysis of children with grade 3 and 4 congenital hydronephrosis demonstrated that renal medullary pyramidal thickness (PT) is predictive of subsequent pyeloplasty (area under the curve [AUC] = 0.78). The objective of this study was to further analyze the utility of sonographic measurements including PT, anteroposterior pelvic diameter (APD), and renal length with an expansion of the number of infants with hydronephrotic kidneys including grades 2, 3, and 4 hydronephrosis.
Year: 2024.0

Title: Advancements in Elastography for Evaluating Fibrosis in Renal Transplants: Current Perspectives.
Abstract: Renal fibrosis is a leading cause of chronic allograft nephropathy. While renal biopsy remains the gold standard for diagnosing fibrosis, it is an invasive procedure with potential for severe complications. Elastography, an emerging ultrasound imaging tech

: 