In [1]:
!pip install biopython pymilvus redis transformers torch langchain
!pip install --upgrade jupyter ipywidgets


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl (2.7 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 KB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting notebook
  Downloading notebook-7.2.2-py3-none-any.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting jupyter-console
  Downloading jupyter_console-6.6.3-py3-none-any.whl (24 kB)
Collecting jupyterlab-widgets~=3.0.12
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.4/214.4 KB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting widgetsnbexte

In [2]:
import redis
from Bio import Entrez
from pymilvus import Collection, connections
from transformers import AutoTokenizer, AutoModel
import torch
from chain_server.configuration import config
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_milvus.vectorstores.milvus import Milvus
import os

In [3]:

# Hugging Face und NVIDIA Tokens aus den Umgebungsvariablen laden
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
ngc_token = os.getenv("NGC_API_KEY")


# NVIDIA Embeddings Modell konfigurieren (mit Token)
embedding_model = NVIDIAEmbeddings(
    model=config.embedding_model.name,
    base_url=str(config.embedding_model.url),
    api_key=ngc_token,  # Verwende den NVIDIA API Key aus der Konfiguration
    truncate="END"
)

# Milvus Vector Store konfigurieren (mit dem Embedding Model)
vector_store = Milvus(
    embedding_function=embedding_model,
    connection_args={"uri": config.milvus.url},
    collection_name=config.milvus.collection_name,
    auto_id=True,
    timeout=10  # Optional: Timeout für Milvus-Verbindungen
)

In [6]:
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType

# Verbindung zu Milvus herstellen
connections.connect("default", host="milvus", port="19530")  # Passe Host und Port ggf. an

# Überprüfen, ob die Collection existiert
collection_name = "mesh_terms"
if not connections.has_collection(collection_name):
    # Felder der Collection definieren
    fields = [
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=128),  # Passe die Dimension der Vektoren an
        FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=255, is_primary=True)
    ]
    
    # Collection Schema definieren
    schema = CollectionSchema(fields, description="Mesh Terms Collection")
    
    # Collection erstellen
    milvus_collection = Collection(name=collection_name, schema=schema)
    print(f"Collection '{collection_name}' wurde erstellt.")
else:
    # Wenn die Collection existiert, lade sie
    milvus_collection = Collection(collection_name)
    print(f"Collection '{collection_name}' erfolgreich geladen.")


AttributeError: 'Connections' object has no attribute 'has_collection'

In [None]:
class PubMedMeshSearcher:
    def __init__(self, embedding_model, milvus_collection: Collection, redis_client: redis.Redis):
        self.embedding_model = embedding_model
        self.milvus_collection = milvus_collection
        self.redis_client = redis_client

        # Setup Entrez
        Entrez.email = "bene.linn@yahoo.de"
        Entrez.api_key = "your_ncbi_api_key"
    
    def search_pubmed(self, query: str):
        """Search for PubMed articles and return PMIDs."""
        handle = Entrez.esearch(db="pubmed", term=query, retmax=10)
        results = Entrez.read(handle)
        handle.close()
        return results.get("IdList", [])
    
    def fetch_abstracts(self, pmids):
        """Fetch PubMed abstracts for given PMIDs."""
        handle = Entrez.efetch(db="pubmed", id=pmids, rettype="abstract", retmode="text")
        abstracts = handle.read().split("\n\n")
        handle.close()
        return abstracts
    
    def generate_embeddings(self, text):
        """Generate embeddings for text using the embedding model."""
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean Pooling
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()[0]

    def store_in_milvus(self, embeddings, ids):
        """Store the embeddings in Milvus."""
        entities = [
            {"name": "embedding", "values": embeddings, "type": "float_vector"},
            {"name": "id", "values": ids, "type": "varchar"}
        ]
        self.milvus_collection.insert(entities)


In [None]:
# Verbindung zu Milvus erstellen
milvus_collection = Collection("mesh_terms")  # Stelle sicher, dass die Collection existiert

# Erstelle die Instanz von PubMedMeshSearcher
pubmed_searcher = PubMedMeshSearcher(embedding_model=model, milvus_collection=milvus_collection, redis_client=redis_client)

# Teste die Suche nach einem Begriff in PubMed
pmids = pubmed_searcher.search_pubmed("gastrointestinal eponyms")
abstracts = pubmed_searcher.fetch_abstracts(pmids)

# Generiere Embeddings für die Abstracts und speichere sie in Milvus
embeddings = [pubmed_searcher.generate_embeddings(abstract) for abstract in abstracts]
pubmed_searcher.store_in_milvus(embeddings, pmids)

print("Abstracts and embeddings stored in Milvus.")


In [None]:
# Beispiel: Milvus-Abfrage für gespeicherte Daten
results = milvus_collection.query(expr=None, output_fields=["id"], limit=10)
print(results)
