### Retrieval pipeline

Importing relevant modules

In [2]:
from elasticsearch import Elasticsearch
import urllib3
import os
import requests
import numpy as np
import json
from Embedding import TextEmbedder

  from .autonotebook import tqdm as notebook_tqdm


Initializing embedder

In [3]:
# Create one TextEmbedder (from the local Embedding module) and keep it at module
# level; the query() helper defined further down reads this object.
embedder = TextEmbedder()

Initializing the Elasticsearch connection

In [4]:
# The password is read from the environment so it never has to be written
# into the notebook.
elastic_password = os.getenv('ELASTIC_PASSWORD')
if not elastic_password:
    # Stop here with a clear message rather than handing None to basic_auth.
    raise RuntimeError("Environment variable ELASTIC_PASSWORD is not set")

# HTTPS client for the local node. Certificates are verified against the CA
# file; ELASTIC_CA_CERTS may override its location on other machines.
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=True,
    ca_certs=os.getenv('ELASTIC_CA_CERTS', "/home/ubuntu/.crts/http_ca.crt"),
    request_timeout=60
)

# Name of the index queried by the search helpers below.
index_name = "pubmed_index"

In [5]:
# Keyword (BM25) search helper
def bm25_search(query: str, k: int = 10):
    """Run a full-text ``match`` query against the ``content`` field.

    Elasticsearch uses the BM25 model by default to score document relevance,
    so no extra scoring configuration is set here.

    Args:
        query: Free-text search string.
        k: Maximum number of hits to request.

    Returns:
        Whatever ``es.search`` returns; each hit's ``_source`` is restricted
        to the ``PMID`` and ``title`` fields.
    """
    # A separate name for the request body keeps the ``query`` argument intact.
    search_body = {
        "size": k,
        "query": {
            "match": {
                "content": str(query)
            }
        },
        "_source": ["PMID", "title"]
    }
    # Use the shared index_name constant instead of repeating the literal.
    return es.search(index=index_name, body=search_body)

In [6]:
def get_docs_via_PMIDs(PMIDs: list):
    """Fetch the indexed documents whose ``PMID`` field is in ``PMIDs``.

    Args:
        PMIDs: PMID values to look up.

    Returns:
        Whatever ``es.search`` returns, with each hit's ``_source`` restricted
        to ``PMID`` and ``title``. Hits arrive in the order Elasticsearch
        returns them, which need not match the order of ``PMIDs``.
    """
    search_body = {
        # Request as many hits as there are IDs so none are cut off.
        "size": len(PMIDs),
        "query": {
            "terms": {
                "PMID": PMIDs
            }
        },
        "_source": ["PMID", "title"]
    }

    # Send the request once and reuse the response for both the sanity check
    # and the return value.
    response = es.search(index=index_name, body=search_body)

    # Requested vs. found: a smaller second number means not every requested
    # PMID was matched in the index.
    print(len(PMIDs))
    print(len(response.body['hits']['hits']))
    return response

In [12]:
def query_to_vector(text, embedder):
    """Embed ``text`` with the given embedder and return its result unchanged."""
    return embedder.embed(text)

def query(query: str, k: int = 10, url='http://localhost:5000/search', timeout: float = 60):
    """Embed a text query and send it to the vector search service.

    Args:
        query: Free-text query to embed with the module-level ``embedder``.
        k: Number of results to request per query vector.
        url: Endpoint of the search service.
        timeout: Seconds to wait for the service before ``requests`` gives up.

    Returns:
        The decoded JSON body of the service's response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    # Convert the NumPy array to a plain list so it can be JSON-encoded.
    vec = query_to_vector(query, embedder).tolist()
    payload = {
        'queries': [vec],  # 'queries' must be a list of lists (one vector per query)
        'k': k
    }
    # json= encodes the payload and sets the Content-Type header; the timeout
    # keeps an unresponsive service from blocking the notebook indefinitely.
    response = requests.post(url, json=payload, timeout=timeout)
    # Surface HTTP errors here instead of returning an error body to the caller.
    response.raise_for_status()

    return response.json()

In [20]:
# Example run: ask the vector search service for the 10 closest entries to a sample question.
response = query("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 10)

In [21]:
# Only one query vector was sent, so take the first row of each result list.
# The returned indices are treated as PMIDs in the lookup further down.
PMIDs = response['indices'][0]
distances = response['distances'][0]

print(f"Distances: {distances}")
print(f"PMIDs: {PMIDs}")

Distances: [17.478534698486328, 17.741710662841797, 18.024999618530273, 18.073043823242188, 18.258037567138672, 18.425647735595703, 18.439815521240234, 18.589141845703125, 18.631526947021484, 18.6602840423584]
PMIDs: [3150951, 3079288, 1380906, 3299646, 1500394, 1677748, 1336770, 1454973, 1377169, 2162298]


In [22]:
# Resolve the retrieved PMIDs to their titles via Elasticsearch.
docs = get_docs_via_PMIDs(PMIDs)

# NOTE: hits are printed in the order Elasticsearch returns them, which is not the
# distance ranking held in PMIDs (compare the two recorded outputs).
for doc in docs.body['hits']['hits']:
    print(f"PMID: {doc['_source']['PMID']}, Title: {doc['_source']['title']}")

10
10
PMID: 1380906, Title: Development of effective drug combinations for the inhibition of multiply resistant mycobacteria, especially of the Mycobacterium avium complex.
PMID: 1377169, Title: [A new, highly synergistic drug combination for the treatment of infections with multiresistant mycobacteria, especially the mycobacterium avium complex].
PMID: 1500394, Title: Ovarian cancer. Experimental chemotherapy.
PMID: 1454973, Title: The multidrug-resistant tuberculosis challenge to public health efforts to control tuberculosis.
PMID: 3299646, Title: Resistance of bacteria to antibacterial agents: report of Task Force 2.
PMID: 1336770, Title: Phase II study of iproplatin (CHIP) in patients with cisplatin-refractory germ cell tumors; the need for alternative strategies in the investigation of new agents in GCT.
PMID: 3079288, Title: Mechanisms and clinical significance of multidrug resistance.
PMID: 3150951, Title: In vitro activity of antimicrobial agents against mycobacteria.
PMID: 216