### Retrieval pipeline

Importing relevant modules

In [2]:
from elasticsearch import Elasticsearch
import urllib3
import os
import requests
import numpy as np
import json
from bioBERTencoder import TextEncooderBioBERT

Initializing embedder

In [3]:
# Module-level encoder instance; query() further down reads it as a global.
embedder = TextEncooderBioBERT()

Initializing the Elasticsearch connection

In [4]:
# The password is read from the environment so that no secret is stored in the notebook.
elastic_password = os.getenv('ELASTIC_PASSWORD')
if not elastic_password:
    # Stop here with a clear message rather than handing an empty password
    # to the client and failing less obviously later on.
    raise RuntimeError(
        "ELASTIC_PASSWORD is not set; export it before starting the kernel."
    )

# ELASTIC_CA_CERTS optionally overrides the CA bundle location; when it is
# unset, the original machine-specific path is used unchanged.
elastic_ca_certs = os.getenv('ELASTIC_CA_CERTS', "/home/ubuntu/.crts/http_ca.crt")

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=True,
    ca_certs=elastic_ca_certs,
    request_timeout=60
)

In [5]:
# Define a search query
def bm25_search(query: str, k: int = 10, index: str = 'pubmed_index'):
    """Run a full-text ``match`` query against the ``content`` field.

    Args:
        query: Free-text search string.
        k: Maximum number of hits to request (sent as ``size``).
        index: Name of the index to search; defaults to ``'pubmed_index'``.

    Returns:
        Whatever ``es.search`` returns. Only ``PMID`` and ``title`` are
        requested in each hit's ``_source``.
    """
    # Build the request under its own name instead of rebinding the ``query``
    # parameter, so the search string and the request body stay distinct.
    search_body = {
        "size": k,
        "query": {
            "match": {
                # The f-string keeps the original coercion of the input to text.
                "content": f"{query}"
            }
        },
        "_source": ["PMID", "title"]
    }
    # Elasticsearch uses the BM25 model by default to compute document relevance.
    return es.search(index=index, body=search_body)

In [6]:
# Sample BM25 query with k=10; as a bare expression, the cell displays the returned response.
bm25_search("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 10)

ObjectApiResponse({'took': 2575, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10000, 'relation': 'gte'}, 'max_score': 29.15415, 'hits': [{'_index': 'pubmed_index', '_id': 'dr5K7I4BlUXX0v6gr52g', '_score': 29.15415, '_source': {'title': 'The epidemiologic patterns of drug-resistant Mycobacterium tuberculosis infections: a community-based study.', 'PMID': 2496635}}, {'_index': 'pubmed_index', '_id': 'bspj7I4BlUXX0v6gdi_p', '_score': 28.184868, '_source': {'title': '[Experimental and clinical study test of capreomycin].', 'PMID': 60080}}, {'_index': 'pubmed_index', '_id': 'rbU47I4BlUXX0v6gpBFb', '_score': 27.948555, '_source': {'title': 'Primary antituberculous drug resistance in Hawaii, 1957 to 1977.', 'PMID': 101107}}, {'_index': 'pubmed_index', '_id': 'rcZb7I4BlUXX0v6gbcdj', '_score': 27.639471, '_source': {'title': 'Failure of isoniazid prophylaxis after exposure to isoniazid-resistant tuberculosis.', 'PMID': 8071

In [7]:
def get_docs_via_PMIDs(PMIDs: list, index: str = 'pubmed_index'):
    """Fetch the documents whose ``PMID`` field matches any of the given IDs.

    Args:
        PMIDs: PMIDs to look up. Any iterable is accepted; it is converted
            to a list once before the request is built.
        index: Name of the index to search; defaults to ``'pubmed_index'``.

    Returns:
        Whatever ``es.search`` returns. ``PMID``, ``title`` and ``content``
        are requested in each hit's ``_source``.
    """
    # Materialise once so that ``len`` and the request body see the same
    # sequence, and so tuples or other iterables work as well as lists.
    pmid_list = list(PMIDs)
    search_body = {
        # Request as many hits as there are IDs.
        "size": len(pmid_list),
        "query": {
            "terms": {
                "PMID": pmid_list
            }
        },
        "_source": ["PMID", "title", "content"]
    }

    # NOTE(review): hit order is decided by Elasticsearch and presumably does
    # not follow the order of ``PMIDs`` - confirm before relying on ranking.
    return es.search(index=index, body=search_body)

In [8]:
def query_to_vector(text, embedder):
    """Return the embedding that ``embedder.embed`` produces for ``text``."""
    return embedder.embed(text)

def query(query: str, k: int = 10, url='http://localhost:5000/search', timeout: float = 60):
    """Embed ``query`` and POST it to the vector search service.

    Args:
        query: Free-text search string; embedded with the module-level
            ``embedder`` via ``query_to_vector``.
        k: Number of results to request (sent as ``'k'``).
        url: Endpoint of the search service.
        timeout: Seconds to wait for the service before ``requests`` gives
            up, so an unresponsive service cannot hang the notebook.

    Returns:
        The parsed JSON body of the response. The notebook reads the
        ``'PMIDs'`` and ``'distances'`` keys from it.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If the service does not answer within ``timeout``.
    """
    vec = query_to_vector(query, embedder).tolist()  # convert the NumPy array to a plain list
    payload = {
        'queries': [vec],  # make sure 'queries' is a list of lists
        'k': k
    }
    response = requests.post(
        url,
        headers={'Content-Type': 'application/json'},
        data=json.dumps(payload),
        timeout=timeout,
    )
    # Surface HTTP errors instead of parsing an error response as search results.
    response.raise_for_status()

    return response.json()

In [10]:
# Semantic search for the same sample question, requesting k=10 results.
response = query("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 10)

In [10]:
# Only one query was submitted, so take index 0 of each result field.
PMIDs, distances = (response[field][0] for field in ('PMIDs', 'distances'))

print(f"Distances: {distances}", f"PMIDs: {PMIDs}", sep="\n")

Distances: [17.478534698486328, 17.741710662841797, 18.024999618530273, 18.073043823242188, 18.258037567138672, 18.425647735595703, 18.439815521240234, 18.589141845703125, 18.631526947021484, 18.6602840423584]
PMIDs: [3150951, 3079288, 1380906, 3299646, 1500394, 1677748, 1336770, 1454973, 1377169, 2162298]


In [15]:
# Look up title and content for the PMIDs returned by the semantic search.
docs = get_docs_via_PMIDs(PMIDs)

Now testing the implemented retriever classes, starting with the BioBERT semantic retriever

In [17]:
# NOTE(review): import placed mid-notebook; consider moving it to the import cell at the top.
from semantic_search_bioBERT import bioBERTretriever
retriever = bioBERTretriever()

Retrieving the 3 most relevant docs

In [32]:
# Ask the retriever for 3 documents for the sample question and print what it returns.
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 3)
print(response)

{
    "doc1": {
        "PMID": 1380906,
        "title": "Development of effective drug combinations for the inhibition of multiply resistant mycobacteria, especially of the Mycobacterium avium complex.",
        "content": "Rationally designed combinations of rifampicin (RAMP) and thiacetazone plus isonicotinic acid hydrazide and/or ethambutol are highly effective in the treatment of patients (including HIV-positive) infected with multiply resistant mycobacteria of the Mycobacterium avium complex (MAC). Clinical results are very promising. The high efficacy of these combinations is due to the synergistic potentiation of single-drug activities. As soon as rifabutin is marketed, it should replace RAMP in the combination treatment of patients with highly RAMP-resistant MAC bacteria."
    },
    "doc2": {
        "PMID": 3079288,
        "title": "Mechanisms and clinical significance of multidrug resistance.",
        "content": "Tumor cells often become refractory to diverse drugs with 

Now the BM25 retriever

In [33]:
# NOTE(review): import placed mid-notebook; consider moving it to the import cell at the top.
from BM25_search import BM25retriever

# NOTE(review): this rebinds `retriever`, replacing the bioBERTretriever instance created earlier.
retriever = BM25retriever()

In [35]:
# Same sample question and document count as the semantic run, now against the BM25 retriever.
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 3)
print(response)

{
    "doc1": {
        "PMID": 2496635,
        "title": "The epidemiologic patterns of drug-resistant Mycobacterium tuberculosis infections: a community-based study.",
        "content": "A community-based study of tuberculosis in Santa Clara County, California was conducted in order to identify community-specific determinants of drug-resistant Mycobacterium tuberculosis infections. From January 1984 through December 1986, 517 verified cases of tuberculosis were reported from the county. Drug susceptibility test results to isoniazid, streptomycin, ethambutol, and rifampin were available for 256 of the 517 cases. The frequency of resistance of M. tuberculosis isolates to one or more drugs was 27% for all cases and 25% for those who had had no previous antituberculosis treatment. Isolates from Asian immigrants had the highest frequencies of resistance (33 to 45%), and the Southeast Asian immigrants had a drug-resistant tuberculosis case rate greater than 30/100,000 population per year.

Now testing the combined RAG system using retriever number 1, semantic similarity search.

In [1]:
from RAG import RAG

# retriever=1 is the semantic similarity retriever according to the text above this cell.
# NOTE(review): the meaning of question_type=1 is not visible here - confirm against the RAG module.
rag = RAG(retriever=1, question_type=1)

# Bare expression: the cell displays the value returned by get_answer.
rag.get_answer("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.")

'{"response": [{"trial_title": "Development of effective drug combinations for the inhibition of multiply resistant mycobacteria, especially of the Mycobacterium avium complex", "trial_status": "Clinical results are very promising", "PMID": 1380906}, {"trial_title": "A new, highly synergistic drug combination for the treatment of infections with multiresistant mycobacteria, especially the mycobacterium avium complex", "trial_status": "Clinical results are very promising", "PMID": 1377169}], "used_PMIDs": [1380906, 1377169], "PMIDs": [1380906, 1377169, 1500394, 1454973, 3299646, 1336770, 3079288, 3150951, 2162298, 1677748]}'