In [1]:
!pip install pyyaml beautifulsoup4 langchain biopython requests pymilvus redis


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import glob
import os
import sys
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymilvus import Collection, connections, utility
from redis import Redis
from langchain_milvus.vectorstores.milvus import Milvus
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import RedisChatMessageHistory
from langchain_core.documents import Document
from operator import itemgetter

# Konfigurierbare Variablen
EMAIL = "bene.linn@yahoo.de"
MILVUS_COLLECTION_NAME = "pubmed_summaries"
SAVE_DIRECTORY = "./"
API_KEY = os.getenv("NVIDIA_API_KEY")
PICO_TEXT_FILE_PATH = "./input.txt"

# Füge den Pfad zur Datei hinzu
script_dir = os.getcwd()
sys.path.append(os.path.join(script_dir, "../code/chain_server"))

from fetch_data import Research  # Verwende die bestehende Research-Klasse
from prompts import CONDENSE_QUESTION_TEMPLATE  # Importiere die Chat-Prompts

class ResearchWithSummaries(Research):
    def __init__(self, email: str, milvus_collection_name: str, max_results: int = 20, save_directory: str = "./"):
        super().__init__(email, max_results, save_directory)
        self.milvus_collection_name = milvus_collection_name
        self.embeddings = NVIDIAEmbeddings(
            model="nvidia/nv-embedqa-e5-v5",
            base_url="http://nv-embedqa-e5-v5:8000/v1"
        )
        connections.connect("default", host="milvus", port="19530")
        self.vector_store = Milvus(
            embedding_function=self.embeddings,
            connection_args={"uri": "http://milvus:19530"},
            collection_name=milvus_collection_name,
            auto_id=True
        )

    def upload_summaries_to_db(self, summaries: List[str]) -> None:
        """
        Splits the summaries into chunks and uploads them to the vector database for analysis.
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        all_splits = [text_splitter.split_text(summary) for summary in summaries]
        all_splits_flat = [item for sublist in all_splits for item in sublist]  # flatten the list
        metadatas = [{} for _ in all_splits_flat]
        self.vector_store.add_texts(texts=all_splits_flat, metadatas=metadatas)
        print(f"Uploaded {len(summaries)} summaries to vector database.")

    def search_articles(self, query: str) -> tuple:
        """
        Searches PubMed articles based on the given query and returns the article IDs and summaries.
        """
        search_results = self.search_pubmed(query)
        pubmed_article_ids = search_results.get("IdList", [])
        summaries = self.fetch_summaries(pubmed_article_ids[:10])
        return pubmed_article_ids, [
            {
                "Title": summary.get("Title", "No title available"),
                "Authors": summary.get("AuthorList", ["No authors available"]),
                "Source": summary.get("Source", "No source available"),
                "PubDate": summary.get("PubDate", "No publication date available"),
                "Abstract": summary.get("Abstract", "No abstract available"),
                "DOI": summary.get("DOI", "No DOI available"),
                "MeSH": summary.get("MeshHeadingList", ["No MeSH terms available"]),
            }
            for summary in summaries
        ]

# Initialisiere die Klasse
research = ResearchWithSummaries(email=EMAIL, milvus_collection_name=MILVUS_COLLECTION_NAME, save_directory=SAVE_DIRECTORY)

# Beispiel einer Textdatei, aus der eine PICO-Frage extrahiert wird
def extract_pico_from_text(file_path: str) -> str:
    with open(file_path, "r") as file:
        text = file.read()
    condense_question_prompt = PROMPTS["health_search"].with_config(run_name="pico_extraction_prompt")
    return condense_question_prompt.invoke({"question": text})

# Extrahiere PICO-Informationen aus einer Textdatei
pico_question = extract_pico_from_text(PICO_TEXT_FILE_PATH)
print("PICO-Frage:", pico_question)

# Suche nach Artikeln basierend auf der PICO-Frage
article_ids, summaries = research.search_articles(pico_question)

# Lade die ersten 10 Summaries in die Datenbank
summaries_to_upload = [summary.get("Title", "No title available") for summary in summaries]
research.upload_summaries_to_db(summaries_to_upload)

# Initialisiere die Chat-Schnittstelle mit dem LLM
llm = ChatNVIDIA(
    model="meta/llama3-8b-instruct",
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=API_KEY
)

@chain
def chat_with_llm(question: str, history: list) -> str:
    """
    Use LLM to answer questions based on chat history and user query.
    """
    condense_question_prompt = PROMPTS["condense"].with_config(run_name="condense_question_prompt")
    condensed_chain = condense_question_prompt | llm | StrOutputParser().with_config(run_name="condense_question_chain")
    if history:
        return condensed_chain.invoke({"history": history, "question": question})
    return question

# Beispiel eines Chat-Verlaufs
chat_history = []
while True:
    user_input = input("Your question: ")
    if user_input.lower() in ["exit", "quit"]:
        break
    response = chat_with_llm(user_input, chat_history)
    print("Assistant:", response)
    chat_history.append({"user": user_input, "assistant": response})

PICO-Frage: messages=[SystemMessage(content="You're an AI assistant designed to help with health-related literature searches.\n            Using the PICO framework, assist in extracting key components from the text,\n            and formulate a suitable search query for databases like PubMed. You are using a\n            Retrieval-Augmented Generation (RAG) model with NVIDIA NIMs to identify relevant information.\n            Based on the provided information, guide through the PICO process:\n            - **Patient/Population/Problem (P)**: Identify who the patient or population is, and what their health problem is.\n            - **Intervention (I)**: Determine the intervention of interest (e.g., treatment, exposure).\n            - **Comparison (C)**: Specify any comparison interventions (if applicable).\n            - **Outcome (O)**: Define the outcomes of interest (e.g., reduction in symptoms, prevention).\n            Please provide the necessary details or confirm that you woul

RuntimeError: Empty id list - nothing todo