### Retrieval pipeline

Importing relevant modules

In [3]:
from elasticsearch import Elasticsearch
import urllib3
import os
import requests
import numpy as np
import json
from Embedding import TextEmbedder

  from .autonotebook import tqdm as notebook_tqdm


Initializing embedder

In [5]:
# Single TextEmbedder instance for the notebook; query() reads this global to embed query text.
embedder = TextEmbedder()

Initializing the Elasticsearch connection

In [6]:
# The password comes from the environment so it is never stored in the notebook.
elastic_password = os.getenv('ELASTIC_PASSWORD')
if not elastic_password:
    # Fail here with a clear message instead of an opaque authentication error
    # on the first request.
    raise RuntimeError(
        "ELASTIC_PASSWORD is not set; export it before starting the notebook kernel."
    )

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=True,
    # The CA certificate location is machine-specific; ELASTIC_CA_CERTS is an
    # opt-in override, and the original path stays the default.
    ca_certs=os.getenv('ELASTIC_CA_CERTS', "/home/ubuntu/.crts/http_ca.crt"),
    request_timeout=60
)

# Name of the index queried by the search helpers in this notebook.
index_name = "pubmed_index"

In [7]:
# Define a search query
def bm25_search(query: str, k: int = 5, index: "str | None" = None):
    """Run a full-text ``match`` search on the ``content`` field.

    Elasticsearch uses the BM25 model by default to compute document
    relevance, so no explicit similarity setting is sent.

    Args:
        query: Free-text search string (converted with ``str()`` semantics).
        k: Maximum number of hits to return.
        index: Index to search. Defaults to the notebook-level ``index_name``.

    Returns:
        The raw response of ``es.search``; each hit's ``_source`` is limited
        to the ``PMID`` and ``title`` fields.
    """
    # A separate name keeps the caller's ``query`` string intact instead of
    # rebinding it to the request body.
    search_body = {
        "size": k,
        "query": {
            "match": {
                "content": f"{query}"
            }
        },
        "_source": ["PMID", "title"]
    }
    target_index = index_name if index is None else index
    return es.search(index=target_index, body=search_body)

In [30]:
def get_docs_via_PMIDs(PMIDs: list):
    """Fetch ``PMID`` and ``title`` for the indexed documents whose PMID is listed.

    Args:
        PMIDs: PMID values to look up with a ``terms`` query.

    Returns:
        The raw response of ``es.search``. PMIDs with no matching document
        are simply absent from the hits, and the hits are not ordered like
        ``PMIDs``.
    """
    pmid_list = list(PMIDs)
    search_body = {
        # Without an explicit size Elasticsearch returns only the first 10
        # hits, so longer PMID lists were silently truncated.
        # NOTE(review): assumes one document per PMID; very large lists may
        # also hit the index's result-window limit — confirm if needed.
        "size": len(pmid_list),
        "query": {
            "terms": {
                "PMID": pmid_list
            }
        },
        "_source": ["PMID", "title"]
    }
    return es.search(index='pubmed_index', body=search_body)

In [23]:
def query_to_vector(text, embedder):
    """Return the result of ``embedder.embed(text)`` unchanged."""
    return embedder.embed(text)

def query(query: str, k: int = 10, url='http://localhost:5000/search', timeout: float = 60):
    """Embed ``query`` and POST it to the search endpoint at ``url``.

    Args:
        query: Text to embed with the notebook-level ``embedder``.
        k: Number of results requested per query (sent as ``'k'``).
        url: Search endpoint that receives the JSON payload.
        timeout: Seconds to wait for the endpoint before ``requests`` gives up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # Convert the NumPy array into a plain list so it can be JSON-encoded.
    vec = query_to_vector(query, embedder).tolist()
    payload = {
        'queries': [vec],  # The endpoint expects a list of vectors (list of lists).
        'k': k
    }
    # Without a timeout a stalled service would block the kernel indefinitely.
    response = requests.post(
        url,
        headers={'Content-Type': 'application/json'},
        data=json.dumps(payload),
        timeout=timeout,
    )
    # Surface HTTP errors here instead of parsing an error body as results.
    response.raise_for_status()

    return response.json()

In [24]:
# Send one example question to the search endpoint using query()'s defaults (k=10).
response = query('What are the contraindications for the use of ACE inhibitors in cardiac patients?')

In [33]:
# Only one query vector was submitted, so take the first entry of each result list.
# NOTE(review): the returned 'indices' are treated as PMIDs from here on — confirm
# that the search service's IDs really are PMIDs.
PMIDs, distances = response['indices'][0], response['distances'][0]

print(f"Distances: {distances}", f"PMIDs: {PMIDs}", sep="\n")

Distances: [15.21097183227539, 15.540654182434082, 15.747185707092285, 16.055830001831055, 16.268619537353516, 16.304288864135742, 16.509267807006836, 16.602100372314453, 16.609519958496094, 16.660133361816406]
PMIDs: [259163, 2219955, 1936409, 1208974, 454949, 2225056, 1604362, 2213504, 1056585, 1029982]


In [34]:
# Resolve the retrieved IDs to titles. Hits are printed in the order Elasticsearch
# returns them, which need not match the order of PMIDs; an ID without a matching
# document produces no line.
docs = get_docs_via_PMIDs(PMIDs)

for doc in docs['hits']['hits']:
    source = doc['_source']
    print(f"PMID: {source['PMID']}, Title: {source['title']}")

PMID: 1208974, Title: Comparison of changes in myocardial balances of lactate, glucose potassium, and inorganic phosphate during pacing-induced angina.
PMID: 2219955, Title: Dissimilation of 2,4-dichlorophenoxyacetic acid by Azotobacter chroococcum.
PMID: 2225056, Title: Temporary cardiac pacing using a new, steerable, balloon-tipped pacing catheter.
PMID: 1936409, Title: The role of the sports team dentist.
PMID: 1604362, Title: Methods for quality adjustment of life years.
PMID: 454949, Title: Effects of delta 9-tetrahydrocannabinol, 2.4-dinitrophenol and pentolinium tartrate on behavioural thermoregulation in mice.
