In [1]:
!pip install pypdf biopython 

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Notwendige Module importieren
import os
import time
from datetime import datetime, timedelta
from Bio import Entrez

# Research-Klasse definieren
class Research:
    def __init__(self, email: str, max_results: int = 100, save_directory: str = "./abstracts"):
        self.email = email
        self.max_results = max_results
        self.save_directory = save_directory
        Entrez.email = self.email

    def search_pubmed(self, start_date: str, end_date: str, term: str = "") -> dict:
        query = f'{term} AND medline[sb]'
        handle = Entrez.esearch(
            db="pubmed",
            term=query,
            retmax=self.max_results,
            usehistory="y",
            datetype="pdat",
            mindate=start_date,
            maxdate=end_date
        )
        search_results = Entrez.read(handle)
        handle.close()
        return search_results

    def fetch_summaries(self, article_ids: list) -> list:
        summaries_handle = Entrez.esummary(
            db="pubmed",
            id=",".join(article_ids),
            retmode="xml"
        )
        summaries = list(Entrez.parse(summaries_handle))
        summaries_handle.close()
        return summaries

    def fetch_abstract(self, article_id: str) -> str:
        try:
            abstract_handle = Entrez.efetch(
                db="pubmed",
                id=article_id,
                rettype="abstract",
                retmode="text"
            )
            abstract = abstract_handle.read()
            abstract_handle.close()
            return abstract.strip()
        except Exception as e:
            print(f"Fehler beim Abrufen des Abstracts für Artikel {article_id}: {e}")
            return ""

# Hauptteil des Skripts
if __name__ == "__main__":
    # E-Mail-Adresse für Entrez festlegen
    email = 'bene.linn@yahoo.de'  # Bitte mit Ihrer E-Mail-Adresse ersetzen
    max_results = 100  # Anzahl der gewünschten Artikel
    save_directory = './abstracts'  # Verzeichnis zum Speichern der Abstracts

    # Verzeichnis erstellen, falls es nicht existiert
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)

    research = Research(email=email, max_results=max_results, save_directory=save_directory)

    # Zeitraum festlegen (diese Woche)
    today = datetime.today()
    start_of_week = today - timedelta(days=today.weekday())  # Montag dieser Woche
    start_date = start_of_week.strftime('%Y/%m/%d')
    end_date = today.strftime('%Y/%m/%d')

    # Suchbegriff festlegen
    search_term = (
    '(preventive health services[MeSH Terms] OR health promotion[MeSH Terms] OR '
    'primary prevention[MeSH Terms] OR "prävention"[Title/Abstract] OR "prevention"[Title/Abstract]) '
    'AND (nutrition[MeSH Terms] OR exercise[MeSH Terms] OR vaccination[MeSH Terms] OR '
    'mental health[MeSH Terms] OR lifestyle[MeSH Terms])'
     )
  # Ersetzen Sie 'prevention' durch Ihr Interessensgebiet

    print(f"Zeitraum: {start_date} bis {end_date}")
    print(f"Suchbegriff: {search_term}")

    # PubMed nach Artikeln durchsuchen
    search_results = research.search_pubmed(start_date, end_date, term=search_term)
    pubmed_article_ids = search_results.get('IdList', [])
    total_results = len(pubmed_article_ids)

    if total_results == 0:
        print("Keine Ergebnisse für den angegebenen Zeitraum und Suchbegriff gefunden.")
    else:
        print(f"{total_results} Artikel wurden gefunden.")
        # Summaries abrufen
        summaries = research.fetch_summaries(pubmed_article_ids)
        print(f"Anzahl der abgerufenen Zusammenfassungen: {len(summaries)}")

        # Durch die Summaries iterieren
        for idx, summary in enumerate(summaries):
            article_id = summary['Id']
            title = summary.get('Title', 'Kein Titel verfügbar')
            authors_list = summary.get('AuthorList', [])
            authors = ', '.join(authors_list) if authors_list else 'Keine Autoren verfügbar'
            pub_date = summary.get('PubDate', 'Kein Veröffentlichungsdatum verfügbar')
            print(f"\nArtikel {idx + 1}:")
            print(f"Artikel-ID: {article_id}")
            print(f"Titel: {title}")
            print(f"Autoren: {authors}")
            print(f"Veröffentlichungsdatum: {pub_date}")

            # Abstract abrufen
            abstract = research.fetch_abstract(article_id)
            if abstract:
                print("Abstract erfolgreich abgerufen.")
                # Abstract speichern
                sanitized_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
                file_name = f"Artikel_{article_id}_{sanitized_title}.txt"
                abstract_path = os.path.join(save_directory, file_name)
                with open(abstract_path, "w", encoding="utf-8") as f:
                    f.write(f"Titel: {title}\n")
                    f.write(f"Autoren: {authors}\n")
                    f.write(f"Veröffentlichungsdatum: {pub_date}\n\n")
                    f.write(f"Abstract:\n{abstract}")
                print(f"Abstract gespeichert unter: {abstract_path}")
            else:
                print("Kein Abstract verfügbar.")
        print("\nSkript abgeschlossen.")


Zeitraum: 2024/10/14 bis 2024/10/17
Suchbegriff: (preventive health services[MeSH Terms] OR health promotion[MeSH Terms] OR primary prevention[MeSH Terms] OR "prävention"[Title/Abstract] OR "prevention"[Title/Abstract]) AND (nutrition[MeSH Terms] OR exercise[MeSH Terms] OR vaccination[MeSH Terms] OR mental health[MeSH Terms] OR lifestyle[MeSH Terms])


RuntimeError: Search Backend failed: An error occurred while processing request. Status: 500. Source: /api/search/?r= Details: Search is temporarily unavailable. Please try again later. Details: Cannot connect to SOLR

In [None]:
# Importieren der notwendigen Module
from chain_server.configuration import config
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_milvus.vectorstores.milvus import Milvus
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
import os

# Setup des NVIDIA Embedding Modells
embedding_model = NVIDIAEmbeddings(
    model=config.embedding_model.name,      # Beispiel: "embed-qa-4"
    base_url=str(config.embedding_model.url),
    api_key=config.nvidia_api_key,
    truncate="END"
)

# Setup der Milvus Vektordatenbank
print(config.milvus.collection_name)

vector_store = Milvus(
    embedding_function=embedding_model,
    connection_args={"uri": config.milvus.url},  # Beispiel: "tcp://localhost:19530"
    collection_name=config.milvus.collection_name,
    auto_id=True,
)

# Funktion zum Hochladen von TXT-Dokumenten
def upload_document(file_path):
    # Lesen der TXT-Datei
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Erstellen eines Dokumentobjekts mit 'page' in den Metadaten
    document = Document(
        page_content=text,
        metadata={
            "source": file_path,
            "page": 1  # Fügen Sie das Feld 'page' hinzu
        }
    )
    # Aufteilen des Dokuments in kleinere Abschnitte
    text_splitter = RecursiveCharacterTextSplitter()
    all_splits = text_splitter.split_documents([document])
    # Sicherstellen, dass jedes Split die erforderlichen Metadaten enthält
    for split in all_splits:
        split.metadata["source"] = split.metadata.get("source", file_path)
        split.metadata["page"] = split.metadata.get("page", 1)
        split.metadata["text"] = split.page_content  # Fügen Sie das Feld 'text' hinzu
    # Hinzufügen der Dokumente zum Vektorspeicher
    vector_store.add_documents(all_splits)
    return f"uploaded {file_path}"

# Funktion zum Hochladen mehrerer TXT-Dateien
def upload_text_files(folder_path, num_files=None):
    i = 0
    for file_path in glob.glob(f"{folder_path}/*.txt"):
        print(upload_document(file_path))
        i += 1
        if num_files and i >= num_files:
            break

# Hochladen der TXT-Dateien aus dem './abstracts'-Verzeichnis
NUM_DOCS_TO_UPLOAD = None  # Setzen Sie eine Zahl, um die Anzahl der Dokumente zu begrenzen
upload_text_files("./abstracts", NUM_DOCS_TO_UPLOAD)

# Flushen und Laden der Daten in Milvus
vector_store.col.flush()
vector_store.col.load()
vector_store.col.wait_for_loading_complete()




In [None]:
# Durchführung einer Similarity-Suche
query = "Welche neuen Erkenntnisse gibt es zu präventiven Gesundheitsmaßnahmen?"
docs = vector_store.similarity_search(query)
print(docs[0])

In [3]:
import yaml
import os
import glob
from langchain.vectorstores import Milvus
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
import requests

# Schritt 1: Laden der Konfigurationsdatei
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

# Schritt 2: Definieren der OllamaLLM-Klasse
class OllamaLLM(LLM):
    model_name: str
    base_url: str
    temperature: float = 0.7
    max_tokens: int = 512
    top_p: float = 1.0

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        url = f"{self.base_url}/api/generate"
        headers = {"Content-Type": "application/json"}
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "options": {
                "temperature": self.temperature,
                "max_tokens": self.max_tokens,
                "top_p": self.top_p,
            },
            "stream": False,
        }

        response = requests.post(url, json=payload, headers=headers)
        if response.status_code != 200:
            raise Exception(f"Ollama API returned status code {response.status_code}: {response.text}")

        data = response.json()
        generated_text = data.get("response", "")

        if stop:
            for stop_token in stop:
                if stop_token in generated_text:
                    generated_text = generated_text.split(stop_token)[0]
                    break

        return generated_text.strip()

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {
            "model_name": self.model_name,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
            "top_p": self.top_p,
        }

    @property
    def _llm_type(self) -> str:
        return "ollama"

# Schritt 3: Initialisieren der Modelle basierend auf der Konfiguration

# Initialisieren des LLM (Ollama)
ollama_llm = OllamaLLM(
    model_name=config["llm_model"]["name"],
    base_url=config["llm_model"]["url"],
    temperature=0.7,
    max_tokens=512,
    top_p=1.0,
)

# Initialisieren des Embedding-Modells (NIM)
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, NVIDIAReranker

embedding_model = NVIDIAEmbeddings(
    model=config["embedding_model"]["name"],
    base_url=config["embedding_model"]["url"],
    api_key=config.get("nvidia_api_key", ""),  # Stellen Sie sicher, dass der API-Key gesetzt ist
    truncate="END"
)

# Initialisieren des Rerankers (NIM)
reranker = NVIDIAReranker(
    model=config["reranking_model"]["name"],
    base_url=config["reranking_model"]["url"],
    api_key=config.get("nvidia_api_key", ""),  # Stellen Sie sicher, dass der API-Key gesetzt ist
)

# Initialisieren der Milvus Vektordatenbank
vector_store = Milvus(
    embedding_function=embedding_model,
    connection_args={"uri": os.getenv("MILVUS_URL", "tcp://localhost:19530")},
    collection_name=config.get("milvus_collection_name", "collection_1"),
    auto_id=True,
)

# Schritt 4: Funktionen zum Hochladen von Dokumenten beibehalten

def upload_document(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    document = Document(
        page_content=text,
        metadata={"source": file_path, "page": 1}
    )
    text_splitter = RecursiveCharacterTextSplitter()
    all_splits = text_splitter.split_documents([document])
    for split in all_splits:
        split.metadata["source"] = split.metadata.get("source", file_path)
        split.metadata["page"] = split.metadata.get("page", 1)
        split.metadata["text"] = split.page_content
    vector_store.add_documents(all_splits)
    return f"uploaded {file_path}"

def upload_text_files(folder_path, num_files=None):
    i = 0
    for file_path in glob.glob(f"{folder_path}/*.txt"):
        print(upload_document(file_path))
        i += 1
        if num_files and i >= num_files:
            break

# Hochladen der TXT-Dateien aus dem './abstracts'-Verzeichnis
NUM_DOCS_TO_UPLOAD = None  # Setzen Sie eine Zahl, um die Anzahl der Dokumente zu begrenzen
upload_text_files("./abstracts", NUM_DOCS_TO_UPLOAD)

# Flushen und Laden der Daten in Milvus
vector_store.col.flush()
vector_store.col.load()
vector_store.col.wait_for_loading_complete()

# Einrichtung des Conversation Buffers
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# Anpassung der Reranking-Funktion
def custom_reranker(documents, query):
    reranked_docs = reranker.rerank(query, documents)
    return reranked_docs

# Integrieren des Rerankers in den Retriever
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
retriever.add_rerank_function(custom_reranker)

# Erstellen der Conversational Retrieval Chain mit OllamaLLM
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=ollama_llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    get_chat_history=lambda h: h,
    verbose=True,
)

# Durchführung einer Abfrage
query = "Welche neuen Erkenntnisse gibt es zu präventiven Gesundheitsmaßnahmen?"

# Ausführen der Kette
try:
    result = qa_chain({"question": query})
    # Ausgabe des Ergebnisses
    print("Antwort:")
    print(result["answer"])
except Exception as e:
    print(f"Fehler bei der Anfrage: {e}")


ImportError: cannot import name 'NVIDIAReranker' from 'langchain_nvidia_ai_endpoints' (/home/workbench/.local/lib/python3.10/site-packages/langchain_nvidia_ai_endpoints/__init__.py)