In [1]:
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Select the compute device: prefer CUDA, then Apple MPS, otherwise fall back to CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the tokenizer and the embedding model
# model_name = "jinaai/jina-embeddings-v3"
model_name = "abhinand/MedEmbed-base-v0.1"
# NOTE(review): trust_remote_code=True lets the hub repository run its own Python
# code at load time; keep it only for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model = model.to(device)

In [3]:
# Fix: convert BFloat16 to Float32 before handing the vector to NumPy
def generate_embedding(text):
    """Embed ``text`` as a flat float32 NumPy vector.

    The text is tokenized (with padding and truncation enabled), run through
    the module-level ``model`` without gradient tracking, and the per-token
    hidden states are averaged along dimension 1.

    Args:
        text: Input passed straight to the module-level ``tokenizer``.
            NOTE(review): the result is flattened, so this presumably expects
            a single string rather than a batch — confirm against callers.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype float32.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(device)
    with torch.no_grad():
        # Hidden states for every token position
        token_states = model(**encoded).last_hidden_state
    # Mean pooling over dimension 1 (no attention-mask weighting is applied)
    pooled = torch.mean(token_states, dim=1)
    # Cast to float32 first, then move to host memory and convert to NumPy
    pooled_fp32 = pooled.to(torch.float32)
    return pooled_fp32.cpu().numpy().flatten()

In [4]:
def search_papers(query, top_k=5, threshold=0.5,
                  index_path="pubmed_index.faiss",
                  metadata_path="pubmed_metadata.npy"):
    """Return the papers from the FAISS index that best match ``query``.

    Args:
        query: Text to embed with ``generate_embedding`` and search for.
        top_k: Maximum number of neighbours requested from the index.
        threshold: Minimum score a hit must reach to be kept.
            NOTE(review): ``score >= threshold`` assumes larger scores mean
            "more similar" (e.g. an inner-product index). For an L2 index the
            comparison would have to be inverted — confirm how the index
            was built.
        index_path: Path of the serialized FAISS index.
        metadata_path: Path of the ``.npy`` file holding one metadata record
            per indexed vector.

    Returns:
        A list of metadata records whose ``"Title"`` and ``"Abstract"`` are
        both strings, in the order returned by the index.
    """
    index = faiss.read_index(index_path)
    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects and
    # can execute code; only load metadata files from a trusted source.
    metadata = np.load(metadata_path, allow_pickle=True)

    # FAISS expects a 2-D float32 array with one row per query
    query_emb = np.asarray(generate_embedding(query), dtype="float32").reshape(1, -1)

    # Run the nearest-neighbour search
    scores, indices = index.search(query_emb, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS fills missing neighbours with the label -1. Without the lower
        # bound, -1 would index the LAST metadata record and return a paper
        # that was never actually matched.
        if 0 <= idx < len(metadata) and score >= threshold:
            paper = metadata[idx]
            # Keep only records that carry both a textual title and abstract
            if paper and isinstance(paper.get("Title"), str) and isinstance(paper.get("Abstract"), str):
                results.append(paper)

    return results

# Smoke-test the retrieval with a sample query
query = "Image & Contrastive Learning"
search_results = search_papers(query, top_k=3)
for paper in search_results:
    title = paper.get("Title", "N/A")
    abstract = paper.get("Abstract", "N/A")
    print(f"Title: {title}\nAbstract: {abstract}\n")

Title: Probabilistic Attention Map: A Probabilistic Attention Mechanism for Convolutional Neural Networks.
Abstract: The attention mechanism is essential to 

Title: Time-Series Representation Feature Refinement with a Learnable Masking Augmentation Framework in Contrastive Learning.
Abstract: In this study, we propose a novel framework for time-series representation learning that integrates a learnable masking-augmentation strategy into a contrastive learning framework. Time-series data pose challenges due to their temporal dependencies and feature-extraction complexities. To address these challenges, we introduce a masking-based reconstruction approach within a contrastive learning context, aiming to enhance the model's ability to learn discriminative temporal features. Our method leverages self-supervised learning to effectively capture both global and local patterns by strategically masking segments of the time-series data and reconstructing them, which aids in revealing nuanced te

In [5]:
def format_retrieved_docs(search_results):
    """Render retrieved documents as one numbered plain-text block.

    Args:
        search_results: Iterable of mapping-like records; ``"Title"`` and
            ``"Abstract"`` are read with ``.get`` so missing keys fall back
            to placeholder text.

    Returns:
        A string with one ``Document N:`` section per record (numbered from
        1), each followed by a blank line; an empty string when there are no
        records.
    """
    sections = [
        "Document {}:\nTitle: {}\nAbstract: {}\n\n".format(
            position,
            doc.get("Title", "Unknown Title"),  # placeholder when the title is missing
            doc.get("Abstract", "No abstract available."),  # placeholder when the abstract is missing
        )
        for position, doc in enumerate(search_results, start=1)
    ]
    return "".join(sections)

In [6]:
def generate_rag_prompt(query, search_results):
    """Build the retrieval-augmented prompt sent to the chat model.

    The retrieved documents are rendered with ``format_retrieved_docs`` and
    embedded, together with the user's question, in a fixed instruction
    template. The template's persona is hardcoded to kidney ultrasound
    imaging regardless of the query.

    Args:
        query: The user's question; inserted between double quotes.
        search_results: Records forwarded to ``format_retrieved_docs``.

    Returns:
        The prompt string. Every line after the first starts with four
        spaces, because the template text is indented inside the function.
    """
    context = format_retrieved_docs(search_results)

    indent = " " * 4  # indentation carried by every template line after the first
    first_line = "You are a medical expert specializing in kidney ultrasound imaging."
    remaining_lines = [
        "",
        "The user has asked the following question:",
        f'"{query}"',
        "",
        "Based on the retrieved relevant documents below, generate a well-informed answer.",
        "",
        "Retrieved Documents:",
        f"{context}",
        "",
        "Answer the user's query using the most relevant information. If necessary, provide additional insights based on your knowledge.",
        "",
        "Response:",
        "",
    ]

    return "\n".join([first_line] + [indent + line for line in remaining_lines])

In [None]:
import os

from openai import OpenAI

# Read the API key from the environment instead of embedding it in the notebook:
# a key stored in source leaks to everyone who can read the file or its history.
# Any key that was ever committed to this notebook should be revoked and rotated.
API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this cell."
    )

client = OpenAI(api_key=API_KEY)  # initialise the OpenAI client

search_results = search_papers(query, top_k=3)
# Check that the retrieved records are complete before building the prompt
for i, paper in enumerate(search_results):
    print(f"Document {i+1}:")
    print(f"Title: {paper.get('Title', 'MISSING TITLE')} ({paper.get('Year', 'N/A')})")
    # print(f"Abstract: {paper.get('Abstract', 'MISSING ABSTRACT')}\n")

print("\n===============================\n")
prompt = generate_rag_prompt(query, search_results)

# Ask the chat model to answer the query using the retrieved documents
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are an expert in kidney ultrasound analysis."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.7
)

generated_answer = response.choices[0].message.content
print(generated_answer)

Document 1:
Title: Probabilistic Attention Map: A Probabilistic Attention Mechanism for Convolutional Neural Networks. (2024.0)
Document 2:
Title: Time-Series Representation Feature Refinement with a Learnable Masking Augmentation Framework in Contrastive Learning. (2024.0)
Document 3:
Title: Automated Screening of Precancerous Cervical Cells Through Contrastive Self-Supervised Learning. (2024.0)


"Image & Contrastive Learning" refers to a method used in machine learning, specifically in the context of self-supervised learning, to improve the extraction of meaningful representations from images. This approach plays a significant role in healthcare, particularly in tasks like time-series analysis and cervical cancer screening, as suggested by the documents.

For instance, Document 2 introduces a masking-augmentation strategy incorporated into a contrastive learning framework for time-series representation learning. This approach allows for the optimization of contextual relationships i

: 