In [None]:
import polars as pl
import faiss
import torch
import numpy as np
from tqdm import tqdm 
from transformers import AutoTokenizer, AutoModel
from multiprocessing import Pool, cpu_count

In [None]:
# Load one PubMed baseline shard that has already been converted to CSV.
csv_path = "../data/pubmed_baseline/csv/pubmed25n1274.csv"
df = pl.read_csv(csv_path)
print(f"Number of rows: {df.height}")

# Keep only records in which every field used downstream is present.
columns_to_check = ["PMID", "Title", "Abstract", "Authors", "Year", "Journal"]
df = df.drop_nulls(subset=columns_to_check)
print(f"Number of rows after dropping nulls: {df.height}")

# Store the publication year as an integer column.
df = df.with_columns(pl.col("Year").cast(int))

In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Embedding model and its tokenizer; .eval() switches off dropout for inference.
model_name = "abhinand/MedEmbed-base-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device).eval()

# Half precision is a GPU memory/throughput optimisation. On the CPU fallback
# fp16 kernels are missing or very slow depending on the PyTorch build, so the
# model is left in fp32 there.
if device.type == "cuda":
    model = model.half()

# Spread each batch across all visible GPUs when there is more than one.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

In [None]:
# One string per paper: the title and the abstract joined by ". ".
# This is the text that gets embedded, in the same row order as `df`.
texts = df["Title"] + ". " + df["Abstract"]

def embed_text_in_batches(texts, batch_size=32):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``model``.

    Each batch is tokenized (padded to the longest item, truncated to 512
    tokens), moved to the module-level ``device`` and run through the model
    without gradient tracking. The embedding of a text is the first-token
    (CLS) vector of the last hidden state, converted to float32.

    Args:
        texts: Iterable of strings; it is materialised into a list.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        A float32 NumPy array with one row per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``texts`` is empty.
    """
    # Fail early with a clear message instead of the opaque errors that
    # range()/np.vstack would otherwise produce for these inputs.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = list(texts)  # Ensure texts is a sliceable list
    if not texts:
        raise ValueError("texts is empty: nothing to embed")

    all_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Processing Batches"):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        # Position 0 along the sequence axis is the CLS token; cast to float32
        # so the result has the same dtype whatever precision the model uses.
        batch_embeddings = outputs.last_hidden_state[:, 0, :].to(torch.float32).cpu().numpy()
        all_embeddings.append(batch_embeddings)

        # Drop references to the batch tensors and release cached GPU blocks
        # before the next batch is allocated.
        del inputs, outputs
        torch.cuda.empty_cache()

    return np.vstack(all_embeddings)  # Combine all batch results

# Generate embeddings for all texts.
# NOTE(review): batch_size=9000 sends 9000 texts (each up to 512 tokens) through
# the model in one forward pass, far above the function default of 32 - confirm
# the available GPU memory actually supports this.
embeddings = embed_text_in_batches(texts, batch_size=9000)  # On out-of-memory errors, lower batch_size (e.g. to 8)


In [None]:
# Set up FAISS vector index
d = embeddings.shape[1]  # Vector dimension
N = embeddings.shape[0]  # Number of embeddings
nlist = min(int(4 * np.sqrt(N)), N)  # Number of clusters (inverted lists)
quantizer = faiss.IndexFlatL2(d)  # Coarse quantizer: exact L2 search over the centroids
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

# Train FAISS index (IVF has to learn the cluster centroids before vectors can be added)
index.train(embeddings)

# Add embeddings to the index; vector i keeps position i, matching row i of `df`
index.add(embeddings)

# FAISS scans only `nprobe` inverted lists per query and defaults to nprobe=1,
# i.e. a single cluster out of `nlist`. That hurts recall and can return fewer
# than k neighbours, so probe several lists (never more than exist).
nprobe = min(nlist, 16)
index.nprobe = nprobe


In [None]:
# Persist the index and the metadata side by side. Search maps a hit back to
# its paper by row position, so the two files must always be written together.
index_path = "faiss_medical_index_IndexIVFFlat.ivf"
metadata_path = "faiss_metadata_IndexIVFFlat.csv"

faiss.write_index(index, index_path)  # vector index
df.write_csv(metadata_path)           # one metadata row per indexed vector

print("Vector index built and saved FAISS index and Metadata!")


# Search

In [2]:
import faiss
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Same embedding model as the indexing part of the notebook, so queries and
# documents live in the same vector space.
# NOTE(review): unlike the indexing cell, no .half() is applied here, so queries
# are embedded in fp32 - confirm the small numeric difference is acceptable.
model_name = "abhinand/MedEmbed-base-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device).eval()

# NOTE(review): these paths differ from the file names written by the save cell
# ("faiss_medical_index_IndexIVFFlat.ivf" / "faiss_metadata_IndexIVFFlat.csv");
# presumably the artifacts were produced by another run - verify.
index = faiss.read_index("../output/2020/faiss.index")

# Metadata table; search looks papers up by row position, so its row order
# has to match the order in which vectors were added to the index.
df = pd.read_csv("../output/2020/metadata.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# **Search Function**
def search_papers(query, top_k=5):
    """Return metadata for the papers closest to ``query`` in the FAISS index.

    The query is embedded with the module-level ``tokenizer``/``model``
    (first-token vector of the last hidden state, as float32), searched in the
    module-level ``index``, and every hit is mapped to a row of the
    module-level metadata frame ``df`` by position.

    Args:
        query: Query text.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of dicts with the keys PMID, Title, Abstract, Authors, Year,
        Journal and Keyword, sorted by Year from newest to oldest. The list
        holds fewer than ``top_k`` entries when the index finds fewer
        neighbours.
    """
    # Step 1: vectorize the query
    inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    query_embedding = outputs.last_hidden_state[:, 0, :].to(torch.float32).cpu().numpy()

    # Step 2: FAISS search (D holds the distances, I the vector positions)
    D, I = index.search(query_embedding, top_k)

    # Step 3: retrieve metadata for the hits.
    # FAISS pads the label array with -1 when it finds fewer than top_k
    # neighbours; passing -1 to .iloc would silently select the LAST row of
    # the table, so the padding has to be removed first.
    hit_positions = I[0]
    hit_positions = hit_positions[hit_positions >= 0]
    results = df.iloc[hit_positions].copy()

    # Step 4: sort by year (newest to oldest)
    results = results.sort_values(by="Year", ascending=False)

    # Step 5: format the output as plain dicts
    fields = ("PMID", "Title", "Abstract", "Authors", "Year", "Journal", "Keyword")
    return [{field: row[field] for field in fields} for _, row in results.iterrows()]

# Example query: fetch the ten nearest papers.
query = "Using search data to forecast COVID-19 trends"
results = search_papers(query, top_k=10)

# Pretty-print every hit (they are already ordered newest first).
for rank, paper in enumerate(results, start=1):
    print(f"🔹 {rank}. {paper['Title']} ({paper['Year']})")
    print(f"    📝 Abstract: {paper['Abstract']}")
    print(f"    👩‍⚕️ Authors: {paper['Authors']}")
    print(f"    🏥 Journal: {paper['Journal']}")
    print(f"    🔑 Keywords: {paper['Keyword']}")


🔹 1. Assessing Internet Search Models in Predicting Daily New COVID-19 Cases and Deaths in South Korea. (2024)
    📝 Abstract: Search data were found to be useful variables for COVID-19 trend prediction. In this study, we aimed to investigate the performance of online search models in state space models (SSMs), linear regression (LR) models, and generalized linear models (GLMs) for South Korean data from January 20, 2020, to July 31, 2021. Principal component analysis (PCA) was run to construct the composite features which were later used in model development. Values of root mean squared error (RMSE), peak day error (PDE), and peak magnitude error (PME) were defined as loss functions. Results showed that integrating search data in the models for short- and long-term prediction resulted in a low level of RMSE values, particularly for SSMs. Findings indicated that type of model used highly impacts the performance of prediction and interpretability of the model. Furthermore, PDE and PME c

In [4]:
def format_retrieved_docs(search_results):
    """Render retrieved papers as the context section of the RAG prompt.

    Every document becomes a block of the form::

        Document <n>:
        Title: <title>
        Year: <year>          (only when the document has a "Year" value)
        Abstract: <abstract>

    followed by a blank line. The year is included because the prompt asks the
    model to head each summary with "[Title] ([Year])", which it cannot do
    unless the year is part of the context.

    Args:
        search_results: Iterable of dict-like documents. "Title" and
            "Abstract" fall back to placeholder text when the key is missing;
            "Year" is optional.

    Returns:
        The concatenated context string; "" when there are no documents.
    """
    blocks = []
    for position, doc in enumerate(search_results, start=1):
        title = doc.get("Title", "Unknown Title")
        abstract = doc.get("Abstract", "No abstract available.")
        year = doc.get("Year")

        lines = [f"Document {position}:", f"Title: {title}"]
        if year is not None:
            lines.append(f"Year: {year}")
        lines.append(f"Abstract: {abstract}")

        # Each block ends with an empty line separating it from the next one.
        blocks.append("\n".join(lines) + "\n\n")
    return "".join(blocks)

def generate_rag_prompt(query, search_results):
    """Build the user prompt sent to the chat model for a RAG answer.

    The retrieved documents are rendered with ``format_retrieved_docs`` and
    embedded, together with the user's question, in a fixed instruction
    template that asks for per-article summaries and a final synthesis.

    Args:
        query: The user's question; inserted verbatim between double quotes.
        search_results: Retrieved documents, passed on to
            ``format_retrieved_docs``.

    Returns:
        The complete prompt string.
    """
    context = format_retrieved_docs(search_results)
    
    # NOTE(review): the "Response Format" section lists exactly three articles,
    # whatever number of documents is present in `context`.
    # NOTE(review): the persona in the first sentence (medical imaging / survival
    # analysis) is fixed and independent of `query` - confirm this is intended.
    # The template lines carry the source indentation into the prompt text.
    prompt = f"""You are a medical expert with extensive knowledge in deep learning applications for medical imaging and survival analysis.
        The user has asked the following question:
        "{query}"

        Below are relevant research articles retrieved from PubMed:

        {context}

        Your task is to analyze the retrieved documents and generate a structured response. Follow these guidelines:

        1. **Summarize each article** individually, extracting the key findings and methodologies.
        2. **Highlight the relevance** of each study to the user's question.
        3. **Compare methodologies**, identifying differences and potential synergies across papers.
        4. **Provide a final synthesis**, explaining how these studies contribute to the overall understanding of the topic.
        5. **If necessary, incorporate your expert knowledge**, but ensure that the response remains grounded in the retrieved literature.

        ### **Response Format:**
        **Article 1: [Title] ([Year])**  
        (Summary: Key findings, methodology, and relevance)  

        **Article 2: [Title] ([Year])**  
        (Summary: Key findings, methodology, and relevance)  

        **Article 3: [Title] ([Year])**  
        (Summary: Key findings, methodology, and relevance)  

        ### **Final Synthesis & Key Takeaways**
        - (Compare methodologies across studies)
        - (Discuss any common trends or contradictions)
        - (Explain implications for clinical practice or research)

        **Ensure that your response is precise, well-structured, and scientifically rigorous.**
    """
    
    return prompt

In [5]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Read the OpenAI key from the environment (load_dotenv() first loads a local
# .env file, if any) so that no credential is hard-coded in the notebook.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
client = OpenAI(api_key=api_key)

# Keep only recent papers (2023-2025, inclusive).
# This rebinds `results`; to widen the window again, re-run the search cell first.
results = [res for res in results if 2023 <= res["Year"] <= 2025]

# List the documents that will be sent to the model as context.
for i, paper in enumerate(results):
    print(f"Document {i+1}:")
    print(f"Title: {paper.get('Title', 'MISSING TITLE')} ({paper.get('Year', 'N/A')})")

print("\n===============================\n")
prompt = generate_rag_prompt(query, results)

# NOTE(review): the system message sets a kidney-ultrasound persona, while the
# query defined in this notebook is about forecasting COVID-19 trends from
# search data - looks like a leftover from another experiment; confirm.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are an expert in kidney ultrasound analysis."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.7
)

# Text of the first (and only requested) completion.
generated_answer = response.choices[0].message.content
print(generated_answer)

Document 1:
Title: Assessing Internet Search Models in Predicting Daily New COVID-19 Cases and Deaths in South Korea. (2024)
Document 2:
Title: Predicting COVID-19 new cases in California with Google Trends data and a machine learning approach. (2024)
Document 3:
Title: Discovering Time-Varying Public Interest for COVID-19 Case Prediction in South Korea Using Search Engine Queries: Infodemiology Study. (2024)
Document 4:
Title: Enhancing the Predictive Power of Google Trends Data Through Network Analysis: Infodemiology Study of COVID-19. (2023)


**Article 1: Assessing Internet Search Models in Predicting Daily New COVID-19 Cases and Deaths in South Korea.**  
(Summary: This study aimed to evaluate the performance of different predictive models using online search data for forecasting COVID-19 trends in South Korea. Three types of models were compared: state space models (SSMs), linear regression (LR) models, and generalized linear models (GLMs). Principal component analysis (PCA) was 