### Retrieval pipeline

Importing relevant modules

In [1]:
from elasticsearch import Elasticsearch
import os
import requests
import json
from bioBERT_encoder import BioBERTQueryEncooder
from medCPT_encoder import MedCPTQueryEncoder

Initializing the query encoders (BioBERT and MedCPT)

In [6]:
# Instantiate both query encoders with their default constructor arguments.
# The BioBERT class name is spelled "Encooder" exactly as it is imported at the top of the notebook.
bioBERT_encoder = BioBERTQueryEncooder()
med_cpt_encoder = MedCPTQueryEncoder()

Initializing the Elasticsearch connection

In [5]:
# The password is read from the environment so that no secret is stored in the notebook.
elastic_password = os.getenv('ELASTIC_PASSWORD')
if not elastic_password:
    # Fail fast with a clear message instead of an opaque authentication error later on.
    raise RuntimeError("Environment variable ELASTIC_PASSWORD is not set.")

# CA certificate used to verify the cluster's TLS certificate. It can be overridden via
# ELASTIC_CA_CERTS; the default is the location this notebook has always used.
elastic_ca_certs = os.getenv('ELASTIC_CA_CERTS', "/home/rag/.crt/http_ca.crt")

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=True,
    ca_certs=elastic_ca_certs,
    request_timeout=60
)

# Round-trip to the cluster to confirm that the connection and the credentials work.
es.info()

ObjectApiResponse({'name': 'e16354f42e49', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'QRfx48-WQEmifPZNrtrbGw', 'version': {'number': '8.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'da95df118650b55a500dcc181889ac35c6d8da7c', 'build_date': '2024-05-06T22:04:45.107454559Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [6]:
# Define a search query
def bm25_search(query: str, k: int = 10):
    """Run a full-text ``match`` search on the ``content`` field of ``pubmed_index``.

    Args:
        query: Free-text search string.
        k: Maximum number of hits to request (sent as ``size``).

    Returns:
        The value returned by ``es.search`` for the request. Only the
        ``PMID``, ``title`` and ``embeddings`` source fields are requested.
    """
    match_clause = {"match": {"content": f"{query}"}}
    requested_fields = ["PMID", "title", "embeddings"]
    search_body = {
        "size": k,
        "query": match_clause,
        "_source": requested_fields,
    }
    # Elasticsearch uses the BM25 model by default to compute document relevance.
    return es.search(index='pubmed_index', body=search_body)

In [7]:
# Smoke test: request the top 10 hits for a sample clinical question; the result is shown as the cell output.
bm25_search("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 10)

ObjectApiResponse({'took': 688, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10000, 'relation': 'gte'}, 'max_score': 41.127823, 'hits': [{'_index': 'pubmed_index', '_id': '_KJosI8BPBlFnwQS7Z0P', '_score': 41.127823, '_ignored': ['content.keyword', 'contents.keyword'], '_source': {'title': 'Drug development against tuberculosis: Past, present and future.', 'PMID': 28941848}}, {'_index': 'pubmed_index', '_id': '12M-sI8BPBlFnwQSiNQo', '_score': 38.054825, '_ignored': ['content.keyword', 'contents.keyword'], '_source': {'title': 'Nano-Drug Delivery Systems: Possible End to the Rising Threats of Tuberculosis.', 'PMID': 34974855}}, {'_index': 'pubmed_index', '_id': 'AhsOsI8BPBlFnwQSbCDr', '_score': 37.860977, '_ignored': ['content.keyword', 'contents.keyword'], '_source': {'title': 'Bottlenecks and opportunities in antibiotic discovery against Mycobacterium tuberculosis.', 'PMID': 35970040}}, {'_index': 'pubmed_index', '

In [7]:
def get_docs_via_PMIDs(PMIDs: list):
    """Look up documents in ``pubmed_index`` whose ``PMID`` is in ``PMIDs``.

    Args:
        PMIDs: List of PMID values to match with a ``terms`` query.

    Returns:
        The value returned by ``es.search``. At most ``len(PMIDs)`` hits are
        requested, each restricted to the ``PMID``, ``title`` and ``content``
        source fields.
    """
    terms_clause = {"terms": {"PMID": PMIDs}}
    search_body = {
        "size": len(PMIDs),  # one slot per requested ID
        "query": terms_clause,
        "_source": ["PMID", "title", "content"],
    }
    return es.search(index='pubmed_index', body=search_body)

In [5]:
def query_to_vector(text, encoder):
    """Encode ``text`` with ``encoder`` and return the first element of the result.

    Args:
        text: Input passed unchanged to ``encoder.encode``.
        encoder: Any object exposing an ``encode(text)`` method whose result
            can be indexed with ``[0]``.

    Returns:
        ``encoder.encode(text)[0]``.
    """
    return encoder.encode(text)[0]

def query(query: str, encoder: object, k: int = 10, url='http://localhost:5000/search', timeout: float = 60):
    """Embed a question and send it to the vector-search HTTP service.

    Args:
        query: Question text to embed.
        encoder: Encoder object handed to ``query_to_vector``.
        k: Number of neighbours requested from the service.
        url: Endpoint of the search service.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook if the service is down.

    Returns:
        The parsed JSON body of the response. In the run recorded in this
        notebook it contained the keys ``'PMIDs'`` and ``'distances'``.

    Raises:
        requests.exceptions.Timeout: If the service does not answer in time.
        requests.exceptions.HTTPError: If the service answers with a 4xx/5xx status.
    """
    vec = query_to_vector(query, encoder).tolist()  # convert the NumPy array to a list
    payload = {
        'queries': [vec],  # make sure 'queries' is a list of lists
        'k': k
    }
    response = requests.post(
        url,
        headers={'Content-Type': 'application/json'},
        data=json.dumps(payload),
        timeout=timeout,
    )
    # Surface HTTP errors here rather than as a JSON decoding error or a
    # KeyError further down in the notebook.
    response.raise_for_status()

    return response.json()

In [8]:
# Dense retrieval for the sample question with the MedCPT encoder, using the default k and service URL.
response = query("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", med_cpt_encoder)
print(response)

{'PMIDs': [[2517449, 1477244, 1322487, 3280912, 1640921, 3150951, 3685667, 3118492, 101915, 2699349]], 'distances': [[73.7181396484375, 73.8319091796875, 74.03831481933594, 74.13957214355469, 74.66549682617188, 75.01885223388672, 75.84512329101562, 75.998046875, 76.24203491210938, 76.6444091796875]]}


In [9]:
# query() sends exactly one query vector, so the results for it are the first
# entry of each of the nested lists in the response.
PMIDs = response['PMIDs'][0]
distances = response['distances'][0]

print(f"Distances: {distances}")
print(f"PMIDs: {PMIDs}")

Distances: [73.7181396484375, 73.8319091796875, 74.03831481933594, 74.13957214355469, 74.66549682617188, 75.01885223388672, 75.84512329101562, 75.998046875, 76.24203491210938, 76.6444091796875]
PMIDs: [2517449, 1477244, 1322487, 3280912, 1640921, 3150951, 3685667, 3118492, 101915, 2699349]


In [15]:
# Fetch PMID, title and content from the Elasticsearch index for the PMIDs returned by the vector search.
docs = get_docs_via_PMIDs(PMIDs)

Now testing implemented classes

In [7]:
from bioBERT_retriever import BioBERTRetriever
# Build the BioBERT retriever with its default constructor arguments.
retriever = BioBERTRetriever()

Retrieving the 3 most relevant docs

In [8]:
# Request 3 documents for the sample question and print what the retriever returns.
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 3)
print(response)

{
    "doc1": {
        "PMID": 1380906,
        "title": "Development of effective drug combinations for the inhibition of multiply resistant mycobacteria, especially of the Mycobacterium avium complex.",
        "content": "Rationally designed combinations of rifampicin (RAMP) and thiacetazone plus isonicotinic acid hydrazide and/or ethambutol are highly effective in the treatment of patients (including HIV-positive) infected with multiply resistant mycobacteria of the Mycobacterium avium complex (MAC). Clinical results are very promising. The high efficacy of these combinations is due to the synergistic potentiation of single-drug activities. As soon as rifabutin is marketed, it should replace RAMP in the combination treatment of patients with highly RAMP-resistant MAC bacteria."
    },
    "doc2": {
        "PMID": 3079288,
        "title": "Mechanisms and clinical significance of multidrug resistance.",
        "content": "Tumor cells often become refractory to diverse drugs with 

Now the BM25 retriever

In [1]:
from bm25_retriever import BM25Retriever

# Rebinds `retriever`: from here on it refers to the BM25 retriever, not the earlier instance.
retriever = BM25Retriever()

In [2]:
# Same sample question and document count (3) as in the BioBERT retriever cell.
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 3)
print(response)

{
    "doc1": {
        "PMID": 28941848,
        "title": "Drug development against tuberculosis: Past, present and future.",
        "content": "Infection of Mycobacterium tuberculosis (MTB) was observed as early as 5000 years ago with evidence, which is a primeval enemy of the humanoid race. MTB is the pathogen which is responsible for causing the infectious disease tuberculosis; it remains a major cause of morbidity and mortality in poor low-income countries as well as in developing countries because of non-availability of reliable laboratory facilities. The current treatment for drug-resistant tuberculosis (TB) is lengthy, complex, and connected with severe harmful side effects and poor outcomes. The present cure against tuberculosis has substantial restrictions, in terms of their efficiency, side-effect outline, and complication of handling. Furthermore, the emergence of multi-drug resistant tuberculosis (MDR-TB) outbreaks during the 1990s and additionally in recent times the vas

Now trying the medCPT retriever without reranking

In [1]:
from medCPT_retriever import SemanticRetrieverMedCPT
# rerank=False: use the MedCPT retriever without the reranking step.
retriever = SemanticRetrieverMedCPT(rerank=False)

In [2]:
# Request 5 documents for the sample question and print what the retriever returns.
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 5)
print(response)

{
    "doc1": {
        "PMID": 3280912,
        "title": "Antituberculosis agents.",
        "content": "Tuberculosis, once considered a problem solved, is now dramatically on the rise. New approaches to chemotherapy will hopefully help to control this again serious problem. This article reviews the current status of tuberculosis chemotherapy, including the management of drug-resistant cases."
    },
    "doc2": {
        "PMID": 1640921,
        "title": "Management of persons exposed to multidrug-resistant tuberculosis.",
        "content": "Recent outbreaks of multidrug-resistant tuberculosis (MDR-TB) have posed challenges for the management of exposed persons. This report offers suggestions for evaluating and managing persons (i.e., contacts) who have been exposed to patients with infectious MDR-TB (TB due to strains of Mycobacterium tuberculosis resistant to both isoniazid [INH] and rifampin [RIF]), provides background information on alternative preventive therapy regimens with d

Now with reranking using the medCPT cross encoder

In [1]:
from medCPT_retriever import SemanticRetrieverMedCPT
# rerank=True: same retriever class, now with the reranking step switched on.
retriever = SemanticRetrieverMedCPT(rerank=True)

In [2]:
# Same question and document count (5) as the run without reranking, so the two outputs can be compared.
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 5)
print(response)

{
    "doc1": {
        "PMID": 1477244,
        "title": "Evaluation of new anti-infective drugs for the treatment and prevention of tuberculosis. Infectious Diseases Society of America and the Food and Drug Administration.",
        "content": "This guideline addresses the evaluation of new antimycobacterial drugs in the treatment and prevention (secondary prophylaxis) of infection by M. tuberculosis. Patients may be enrolled in clinical trials on the basis of clinical and/or microbiological criteria. A therapeutic regimen will likely include a combination of drugs; a randomized, active-control, comparative clinical trial is recommended. If appropriate samples can be obtained for culture during follow-up without placing the patient at unwarranted risk, the assessment of microbiological outcome is paramount. Prophylaxis will probably require a single drug, and a similar study design is preferred.",
        "score": 6.256978511810303
    },
    "doc2": {
        "PMID": 3280912,
      

In [1]:
from hybrid_retriever import HybridRetriever
# Build the hybrid retriever with its default constructor arguments.
retriever = HybridRetriever()

In [4]:
# NOTE(review): presumably k=100 is the size of the candidate pool and top_n=10 the number of
# documents finally returned — TODO confirm against HybridRetriever.retrieve_docs.
response = retriever.retrieve_docs("How to treat ADHD?", top_n=10, k=100)
print(response)

{
    "doc1": {
        "PMID": 28004618,
        "title": "Organisation of services for managing ADHD.",
        "content": "There is considerable variation in practice, both between and with different countries in the management of attention deficit hyperactivity disorder (ADHD). Whilst there is no one optimal model of service organisation there are general principles of care that can be introduced to reduce this variability. There are frequent debates and discussions about which professional group is best placed to manage ADHD at different points in the life cycle. Who delivers care is however less important than ensuring that training schemes provide adequate exposure, training and experience to both the core and non-core skills required to provide a comprehensive package of care. Most evidence-based guidelines recommend a multi-modal, multi-professional and multi-agency approach. Many also promote the use of both stepped care and shared care approaches for the management of ADHD. 

### RAG system

Now testing the combined RAG system using retriever number 1, semantic similarity search.

In [11]:
from med_rag import MedRAG

# retriever=1 is the semantic similarity search retriever (per the section text of this notebook).
# The meaning of question_type=1 is not visible in this notebook — TODO document it here.
rag = MedRAG(retriever=1, question_type=1)

# Last expression of the cell, so the returned answer is displayed as the cell output.
rag.get_answer("What is the treatment for breast cancer?")

'{"response": "Treatment options for breast cancer include mastectomy, breast conservation treatment (excision of primary tumor plus radiation therapy), and primary systemic chemotherapy. Mastectomy was standard treatment in the past, but now breast conservation treatment is appropriate for the majority of women with stage I or II breast cancer. Primary systemic chemotherapy can be used for operable breast cancer to improve patient outcomes. Radiotherapy is also a successful way to treat localized metastatic or recurrent breast cancer.", "used_PMIDs": ["59936", "1925139", "1933219"], "retrieved_PMIDs": [1445519, 2807944, 2306572, 1903883, 2207477, 2852332, 59936, 1925139, 1933219, 1612877], "retrieval_time": 1.0633940696716309, "generation_time": 3.061168909072876}'

In [10]:
from med_rag import MedRAG

# Same setup with retriever=2; which retriever this number selects is not visible
# in this notebook — TODO document it here.
rag = MedRAG(retriever=2, question_type=1)

rag.get_answer("What is the mode of inheritance of Wilson's disease?")

'{"response": "Wilson\'s disease is inherited in an autosomal recessive mode.", "used_PMIDs": ["838566"], "retrieved_PMIDs": [838566, 2292371, 1536160, 2724779, 2332582, 1248830, 2222683, 3255247, 1517772, 789146], "retrieval_time": 0.7482585906982422, "generation_time": 1.2844774723052979}'

In [2]:
import os

# Report only whether the key is configured. Printing the value itself would
# store the secret in the saved notebook output.
print("OPENAI_API_KEY is set" if os.getenv('OPENAI_API_KEY') else "OPENAI_API_KEY is not set")

None


In [1]:
from med_rag import MedRAG

# Same question as in the retriever=2 cell, now with retriever=3; which retriever this number
# selects is not visible in this notebook — TODO document it here.
rag = MedRAG(retriever=3, question_type=1)

rag.get_answer("What is the mode of inheritance of Wilson's disease?")

'{"response": "Wilson\'s disease is inherited in an autosomal recessive mode. This means that an individual must inherit two copies of the mutated gene (one from each parent) to develop the disease.", "used_PMIDs": ["26817129", "6109943"], "retrieved_PMIDs": [26817129, 6109943, 838566, 2724779, 6620327, 16810973, 20662462, 8186659, 11254776, 23518715], "retrieval_time": 1.5806100368499756, "generation_time": 1.6769413948059082}'

In [2]:
from med_rag import MedRAG

# NOTE(review): the recorded answer for question_type=2 is a bare "yes", so this value
# presumably selects yes/no questions — TODO confirm against MedRAG.
rag = MedRAG(retriever=3, question_type=2)

rag.get_answer("Is stop codon bypass possible?")

'{"response": "yes", "used_PMIDs": ["24535059", "12711673", "2103444", "26382736", "17881586", "2691247", "17961216", "2010914"], "retrieved_PMIDs": [24535059, 12711673, 2103444, 21930924, 26382736, 17881586, 2207158, 2691247, 17961216, 2010914], "retrieval_time": 0.371307373046875, "generation_time": 1.5111398696899414}'

In [34]:
from med_rag import MedRAG

# NOTE(review): the recorded output of this cell has no retrieval_time/generation_time fields,
# unlike the other MedRAG runs in this notebook — it looks stale; re-run to refresh it.
rag = MedRAG(retriever=4, question_type=1)

rag.get_answer("Is stop codon bypass possible?")

'{"response": "Stop codon bypass is possible through ribosomal frameshifting at hungry codons, allowing for readthrough of stop codons and continuation of translation in a shifted reading frame.", "used_PMIDs": [3199440, 1515416, 1731076, 3477671, 1779848], "retrieved_PMIDs": [3199440, 1515416, 1731076, 2253710, 1628840, 1689389, 2439408, 3477671, 1779848, 1814364]}'

In [1]:
from med_rag import MedRAG

# retriever=4 combined with question_type=2; construction is kept in its own cell,
# separate from the query that follows.
rag = MedRAG(retriever=4, question_type=2)

In [2]:
# Same question as in the earlier stop-codon cells; the returned answer is displayed as the cell output.
rag.get_answer("Is stop codon bypass possible?")

'{"response": "yes", "used_PMIDs": ["1814364", "1731076", "1628840", "1689389", "3199440"], "retrieved_PMIDs": [1814364, 1731076, 1628840, 2439408, 1689389, 2253710, 1779848, 3199440, 1515416, 3477671], "retrieval_time": 3.8009798526763916, "generation_time": 1.5287399291992188}'