In [1]:
!pip install pypdf biopython 

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Notwendige Module importieren
import os
import time
from datetime import datetime, timedelta
from Bio import Entrez

# Research-Klasse definieren
class Research:
    """Small helper around ``Bio.Entrez`` for searching PubMed and fetching records.

    The instance only stores settings; every method performs one Entrez request
    and always closes the handle it opened, even when parsing fails.
    """

    def __init__(self, email: str, max_results: int = 100, save_directory: str = "./abstracts"):
        """Store the settings and register the contact e-mail with Entrez.

        Args:
            email: Contact address assigned to ``Entrez.email``.
            max_results: Value passed as ``retmax`` to ``Entrez.esearch``.
            save_directory: Directory path kept on the instance. This class
                neither creates the directory nor writes to it.
        """
        self.email = email
        self.max_results = max_results
        self.save_directory = save_directory
        # Entrez.email is module-level state, so it applies to all later Entrez calls.
        Entrez.email = self.email

    def search_pubmed(self, start_date: str, end_date: str, term: str = "") -> dict:
        """Run an ``esearch`` on the ``pubmed`` database and return the parsed result.

        Args:
            start_date: Passed to Entrez as ``mindate`` (with ``datetype="pdat"``).
            end_date: Passed to Entrez as ``maxdate``.
            term: Optional search expression. When empty or whitespace-only,
                the query consists of the ``medline[sb]`` filter alone.

        Returns:
            Whatever ``Entrez.read`` produces for the search response.
        """
        medline_filter = "medline[sb]"
        cleaned_term = (term or "").strip()
        # An empty term must not yield the malformed query " AND medline[sb]".
        query = f"{cleaned_term} AND {medline_filter}" if cleaned_term else medline_filter
        handle = Entrez.esearch(
            db="pubmed",
            term=query,
            retmax=self.max_results,
            usehistory="y",
            datetype="pdat",
            mindate=start_date,
            maxdate=end_date
        )
        try:
            return Entrez.read(handle)
        finally:
            # Close the handle even if parsing the response raises.
            handle.close()

    def fetch_summaries(self, article_ids: list) -> list:
        """Fetch ``esummary`` records for the given PubMed ids.

        Args:
            article_ids: Ids to look up. Each entry is converted with ``str``
                before being joined with commas.

        Returns:
            A list with the records yielded by ``Entrez.parse``; an empty list
            when ``article_ids`` is empty (no request is sent in that case).
        """
        if not article_ids:
            return []
        summaries_handle = Entrez.esummary(
            db="pubmed",
            id=",".join(str(article_id) for article_id in article_ids),
            retmode="xml"
        )
        try:
            # Entrez.parse is consumed fully here, before the handle is closed.
            return list(Entrez.parse(summaries_handle))
        finally:
            summaries_handle.close()

    def fetch_abstract(self, article_id: str) -> str:
        """Fetch one record as plain text (``rettype="abstract"``, ``retmode="text"``).

        This is a best-effort call: any exception is reported with ``print``
        and an empty string is returned instead of propagating the error.

        Args:
            article_id: PubMed id passed to ``Entrez.efetch``.

        Returns:
            The response text with surrounding whitespace stripped, or ``""``
            when the request failed.
        """
        try:
            abstract_handle = Entrez.efetch(
                db="pubmed",
                id=article_id,
                rettype="abstract",
                retmode="text"
            )
            try:
                abstract = abstract_handle.read()
            finally:
                # Release the connection even if reading fails.
                abstract_handle.close()
            # Tolerate handles that deliver bytes instead of text.
            if isinstance(abstract, bytes):
                abstract = abstract.decode("utf-8", errors="replace")
            return abstract.strip()
        except Exception as e:
            print(f"Fehler beim Abrufen des Abstracts für Artikel {article_id}: {e}")
            return ""

# Hauptteil des Skripts
if __name__ == "__main__":
    # Contact e-mail handed to Entrez. The ENTREZ_EMAIL environment variable, when
    # set to a non-empty value, overrides the built-in default so the address does
    # not have to be edited in the code.
    email = os.environ.get('ENTREZ_EMAIL') or 'bene.linn@yahoo.de'
    max_results = 100  # upper bound on the number of articles requested
    save_directory = './abstracts'  # directory the abstract files are written to

    # exist_ok=True creates the directory only when it is missing and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(save_directory, exist_ok=True)

    research = Research(email=email, max_results=max_results, save_directory=save_directory)

    # Date range: Monday of the current week through today.
    today = datetime.today()
    start_of_week = today - timedelta(days=today.weekday())
    start_date = start_of_week.strftime('%Y/%m/%d')
    end_date = today.strftime('%Y/%m/%d')

    # Search expression; adjust the terms to the topic of interest.
    search_term = (
        '(preventive health services[MeSH Terms] OR health promotion[MeSH Terms] OR '
        'primary prevention[MeSH Terms] OR "prävention"[Title/Abstract] OR "prevention"[Title/Abstract]) '
        'AND (nutrition[MeSH Terms] OR exercise[MeSH Terms] OR vaccination[MeSH Terms] OR '
        'mental health[MeSH Terms] OR lifestyle[MeSH Terms])'
    )

    print(f"Zeitraum: {start_date} bis {end_date}")
    print(f"Suchbegriff: {search_term}")

    # Search PubMed and collect the ids of the matching articles.
    search_results = research.search_pubmed(start_date, end_date, term=search_term)
    pubmed_article_ids = search_results.get('IdList', [])
    total_results = len(pubmed_article_ids)

    if total_results == 0:
        print("Keine Ergebnisse für den angegebenen Zeitraum und Suchbegriff gefunden.")
    else:
        print(f"{total_results} Artikel wurden gefunden.")
        # Fetch the summary records for all ids in one request.
        summaries = research.fetch_summaries(pubmed_article_ids)
        print(f"Anzahl der abgerufenen Zusammenfassungen: {len(summaries)}")

        # Report each article and store its abstract as a text file.
        for article_number, summary in enumerate(summaries, start=1):
            article_id = summary['Id']
            title = summary.get('Title', 'Kein Titel verfügbar')
            authors_list = summary.get('AuthorList', [])
            authors = ', '.join(authors_list) if authors_list else 'Keine Autoren verfügbar'
            pub_date = summary.get('PubDate', 'Kein Veröffentlichungsdatum verfügbar')
            print(f"\nArtikel {article_number}:")
            print(f"Artikel-ID: {article_id}")
            print(f"Titel: {title}")
            print(f"Autoren: {authors}")
            print(f"Veröffentlichungsdatum: {pub_date}")

            # fetch_abstract returns "" on failure, which is treated as "no abstract".
            abstract = research.fetch_abstract(article_id)
            if abstract:
                print("Abstract erfolgreich abgerufen.")
                # Build a file name from the id plus the first 50 characters of the
                # title, with every non-alphanumeric character replaced by "_".
                sanitized_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
                file_name = f"Artikel_{article_id}_{sanitized_title}.txt"
                abstract_path = os.path.join(save_directory, file_name)
                with open(abstract_path, "w", encoding="utf-8") as f:
                    f.write(f"Titel: {title}\n")
                    f.write(f"Autoren: {authors}\n")
                    f.write(f"Veröffentlichungsdatum: {pub_date}\n\n")
                    f.write(f"Abstract:\n{abstract}")
                print(f"Abstract gespeichert unter: {abstract_path}")
            else:
                print("Kein Abstract verfügbar.")
        print("\nSkript abgeschlossen.")


Zeitraum: 2024/10/14 bis 2024/10/17
Suchbegriff: (preventive health services[MeSH Terms] OR health promotion[MeSH Terms] OR primary prevention[MeSH Terms] OR "prävention"[Title/Abstract] OR "prevention"[Title/Abstract]) AND (nutrition[MeSH Terms] OR exercise[MeSH Terms] OR vaccination[MeSH Terms] OR mental health[MeSH Terms] OR lifestyle[MeSH Terms])
32 Artikel wurden gefunden.
Anzahl der abgerufenen Zusammenfassungen: 32

Artikel 1:
Artikel-ID: 39413130
Titel: Deciphering the neural responses to a naturalistic persuasive message.
Autoren: Ntoumanis I, Sheronova J, Davydova A, Dolgaleva M, Jääskeläinen IP, Kosonogov V, Shestakova AN, Klucharev V
Veröffentlichungsdatum: 2024 Oct 22
Abstract erfolgreich abgerufen.
Abstract gespeichert unter: ./abstracts/Artikel_39413130_Deciphering_the_neural_responses_to_a_naturalistic.txt

Artikel 2:
Artikel-ID: 39413075
Titel: The value of using COVID-19 antibody tests as a potential approach to prioritize vaccination delivery.
Autoren: Alrabadi N, Ob

In [None]:
# Importieren der notwendigen Module
from chain_server.configuration import config
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_milvus.vectorstores.milvus import Milvus
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
import os

# Set up the NVIDIA embedding model; all values come from the `config` object
# imported from chain_server.configuration.
embedding_model = NVIDIAEmbeddings(
    model=config.embedding_model.name,      # e.g. "embed-qa-4"
    base_url=str(config.embedding_model.url),
    api_key=config.nvidia_api_key,
    truncate="END"  # NOTE(review): presumably truncates over-long inputs at the end — confirm
)

# Set up the Milvus vector store; the collection name is printed first so the
# target collection is visible in the cell output.
print(config.milvus.collection_name)

vector_store = Milvus(
    embedding_function=embedding_model,
    connection_args={"uri": config.milvus.url},  # e.g. "tcp://localhost:19530"
    collection_name=config.milvus.collection_name,
    auto_id=True,  # NOTE(review): presumably lets Milvus generate the primary keys — confirm
)

# Funktion zum Hochladen von TXT-Dokumenten
def upload_document(file_path):
    """Read one UTF-8 text file, split it, and add the pieces to ``vector_store``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The string ``"uploaded <file_path>"``.
    """
    with open(file_path, 'r', encoding='utf-8') as source_file:
        raw_text = source_file.read()

    # Wrap the whole file in a single document carrying 'source' and 'page'.
    whole_document = Document(
        page_content=raw_text,
        metadata={"source": file_path, "page": 1},
    )

    # Break the document into smaller pieces using the splitter's defaults.
    splitter = RecursiveCharacterTextSplitter()
    chunks = splitter.split_documents([whole_document])

    for chunk in chunks:
        # Fill in 'source' and 'page' only where the splitter did not carry them over.
        chunk.metadata.setdefault("source", file_path)
        chunk.metadata.setdefault("page", 1)
        # Always mirror the chunk text into the 'text' metadata field.
        chunk.metadata["text"] = chunk.page_content

    vector_store.add_documents(chunks)
    return f"uploaded {file_path}"

# Funktion zum Hochladen mehrerer TXT-Dateien
def upload_text_files(folder_path, num_files=None):
    """Upload the ``*.txt`` files found directly inside ``folder_path``.

    Each file is passed to ``upload_document`` and the returned status string
    is printed.

    Args:
        folder_path: Directory that is searched with the pattern ``<folder_path>/*.txt``.
        num_files: Optional cap on the number of files. A falsy value
            (``None`` or ``0``) means every matching file is uploaded.
    """
    pattern = f"{folder_path}/*.txt"
    for uploaded_count, txt_path in enumerate(glob.glob(pattern), start=1):
        print(upload_document(txt_path))
        # Stop once the requested number of files has been processed.
        if num_files and uploaded_count >= num_files:
            break

# Upload the TXT files from the './abstracts' directory.
NUM_DOCS_TO_UPLOAD = None  # Set a number to cap how many documents are uploaded; None uploads all
upload_text_files("./abstracts", NUM_DOCS_TO_UPLOAD)

# Flush the inserted data and load the collection in Milvus.
# NOTE(review): `vector_store.col` is presumably the underlying Milvus collection object — confirm.
vector_store.col.flush()
vector_store.col.load()
# NOTE(review): verify that the collection object exposes wait_for_loading_complete();
# in pymilvus this helper may live in `pymilvus.utility` instead — TODO confirm.
vector_store.col.wait_for_loading_complete()




collection_1
uploaded ./abstracts/Artikel_39382433_Figure_1__Pneumococcal_vaccine_recommendations_for.txt
uploaded ./abstracts/Artikel_39382434_Figure_2__Pneumococcal_vaccine_recommendations_for.txt
uploaded ./abstracts/Artikel_39410905_COVID_19_vaccine_hesitancy_and_intentions_among_pa.txt
uploaded ./abstracts/Artikel_39402502_Knowledge__attitude_and_perception_of_Italian_dent.txt
uploaded ./abstracts/Artikel_39099093_Noninferior_Immunogenicity_and_Consistent_Safety_o.txt
uploaded ./abstracts/Artikel_39402500_Digital_health_literacy_among_undergraduate_nursin.txt
uploaded ./abstracts/Artikel_39406956_The_individual_and_ecological_characteristics_of_p.txt
uploaded ./abstracts/Artikel_39400298_Smoking_and_serological_response_to_influenza_vacc.txt
uploaded ./abstracts/Artikel_39407129_Referral_pathway_and_competency_profiles_of_primar.txt
uploaded ./abstracts/Artikel_39400296_Healthcare_worker_practices_for_HPV_vaccine_recomm.txt
uploaded ./abstracts/Artikel_38606958_Respiratory_Syncyti

In [None]:
# Run a similarity search against the vector store and print the first returned document.
query = "Welche neuen Erkenntnisse gibt es zu präventiven Gesundheitsmaßnahmen?"
docs = vector_store.similarity_search(query)
# The search can come back empty (e.g. nothing has been uploaded yet); indexing
# docs[0] unconditionally would then raise IndexError.
if docs:
    print(docs[0])
else:
    print("Keine Treffer für die Anfrage gefunden.")

In [None]:
# Print the `schema` attribute of the collection behind the vector store.
print(vector_store.col.schema)
