### Retrieval pipeline

Importing relevant modules

In [1]:
from elasticsearch import Elasticsearch
import urllib3
import os
import requests
import numpy as np
import json
from bioBERT_encoder import BioBERTQueryEncooder
from medCPT_encoder import MedCPTQueryEncoder

Initializing the query encoders

In [2]:
# Instantiate both query encoders with their default constructor arguments.
# `med_cpt_encoder` is the one passed to `query(...)` further down.
# NOTE(review): "Encooder" looks like a typo, but the name must match what the
# bioBERT_encoder module exports — confirm before renaming.
bioBERT_encoder = BioBERTQueryEncooder()
med_cpt_encoder = MedCPTQueryEncoder()

Initializing the Elasticsearch connection

In [4]:
# Connection settings are read from the environment so that neither secrets
# nor machine-specific paths have to be edited inside the notebook.
elastic_password = os.getenv('ELASTIC_PASSWORD')
if not elastic_password:
    # Fail early with a clear message rather than handing None to the client
    # as the password.
    raise RuntimeError("Environment variable ELASTIC_PASSWORD is not set.")

# ELASTIC_CA_CERTS optionally overrides the CA certificate location; the
# previous hard-coded path stays the default so existing setups keep working.
elastic_ca_certs = os.getenv('ELASTIC_CA_CERTS', "/home/ubuntu/.crts/http_ca.crt")

# Client for the local HTTPS node, authenticating as the 'elastic' user with
# TLS certificate verification enabled.
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=True,
    ca_certs=elastic_ca_certs,
    request_timeout=60
)

In [5]:
def bm25_search(query: str, k: int = 10):
    """Run a full-text ``match`` search on the ``content`` field.

    Per the original author's note, Elasticsearch uses the BM25 model by
    default to compute document relevance, so no explicit scoring setup is
    needed here.

    Args:
        query: Free-text search string.
        k: Value sent as ``size``, i.e. the maximum number of hits requested.

    Returns:
        Whatever ``es.search`` returns for the ``pubmed_index_embedded``
        index; only the ``PMID``, ``title`` and ``embeddings`` source fields
        are requested.
    """
    match_clause = {"content": f"{query}"}
    request_body = {
        "size": k,
        "query": {"match": match_clause},
        "_source": ["PMID", "title", "embeddings"],
    }
    return es.search(index='pubmed_index_embedded', body=request_body)

In [10]:
# Smoke test of the BM25 search with k=10. The request asks for the
# "embeddings" source field, so the displayed response is very long.
bm25_search("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 10)

ObjectApiResponse({'took': 278, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10000, 'relation': 'gte'}, 'max_score': 28.842054, 'hits': [{'_index': 'pubmed_index_embedded', '_id': 'K2AkL48Bn9g2Tubai763', '_score': 28.842054, '_ignored': ['content.keyword'], '_source': {'title': '[Experimental and clinical study test of capreomycin].', 'PMID': 60080, 'embeddings': [0.07073917239904404, -0.28926077485084534, -0.03199537843465805, -0.023666726425290108, 0.07595331966876984, -0.10061178356409073, -0.19958512485027313, 0.14041295647621155, 0.0923428162932396, -0.057363610714673996, 0.32423385977745056, 0.11429763585329056, -0.12183111906051636, 0.27941980957984924, -0.14178511500358582, -0.1909717172384262, -0.04250829666852951, -0.028552765026688576, -0.0869457945227623, -0.1038479432463646, 0.006509660743176937, 0.1630488932132721, -0.10671234130859375, 0.07790783047676086, 0.16969111561775208, -0.027340829372406006, 

In [7]:
def get_docs_via_PMIDs(PMIDs: list):
    """Look up documents by PMID in the ``pubmed_index`` index.

    Args:
        PMIDs: List of PMID values to match via a ``terms`` query. Its length
            is also sent as the ``size`` of the request.

    Returns:
        Whatever ``es.search`` returns; only the ``PMID``, ``title`` and
        ``content`` source fields are requested.
    """
    terms_filter = {"terms": {"PMID": PMIDs}}
    wanted_fields = ["PMID", "title", "content"]
    request_body = {
        "size": len(PMIDs),
        "query": terms_filter,
        "_source": wanted_fields,
    }
    return es.search(index='pubmed_index', body=request_body)

In [5]:
def query_to_vector(text, encoder):
    """Encode ``text`` with ``encoder`` and return the first item of the result.

    Args:
        text: Input handed unchanged to ``encoder.encode``.
        encoder: Any object exposing an ``encode(text)`` method whose return
            value supports indexing.

    Returns:
        Element ``0`` of ``encoder.encode(text)``.
    """
    return encoder.encode(text)[0]

def query(query: str, encoder: object, k: int = 10, url='http://localhost:5000/search', timeout: float = 60):
    """Embed a query and send it to the vector search service.

    Args:
        query: Query text to embed.
        encoder: Encoder object passed on to ``query_to_vector``.
        k: Number of results requested from the service (sent as ``'k'``).
        url: Endpoint of the search service.
        timeout: Seconds to wait for the service before ``requests`` raises a
            timeout error. Without it a stalled service would block the
            notebook indefinitely.

    Returns:
        The decoded JSON body of the service response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    # Convert the embedding to a plain list so it can be JSON-serialised
    # (the original note says the embedding is a NumPy array).
    vec = query_to_vector(query, encoder).tolist()
    payload = {
        'queries': [vec],  # the service expects a list of vectors (list of lists)
        'k': k
    }
    # `json=` serialises the payload and sets the JSON Content-Type header.
    response = requests.post(url, json=payload, timeout=timeout)
    # Surface HTTP errors here instead of failing later while decoding the body.
    response.raise_for_status()

    return response.json()

In [8]:
# Dense retrieval through the search service, using the MedCPT query encoder
# and the default k=10.
response = query("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", med_cpt_encoder)
print(response)

{'PMIDs': [[2517449, 1477244, 1322487, 3280912, 1640921, 3150951, 3685667, 3118492, 101915, 2699349]], 'distances': [[73.7181396484375, 73.8319091796875, 74.03831481933594, 74.13957214355469, 74.66549682617188, 75.01885223388672, 75.84512329101562, 75.998046875, 76.24203491210938, 76.6444091796875]]}


In [9]:
# The response holds one result list per submitted query vector; only a single
# query was sent, so take the entry at index 0.
PMIDs = response['PMIDs'][0]
distances = response['distances'][0]

print(f"Distances: {distances}")
print(f"PMIDs: {PMIDs}")

Distances: [73.7181396484375, 73.8319091796875, 74.03831481933594, 74.13957214355469, 74.66549682617188, 75.01885223388672, 75.84512329101562, 75.998046875, 76.24203491210938, 76.6444091796875]
PMIDs: [2517449, 1477244, 1322487, 3280912, 1640921, 3150951, 3685667, 3118492, 101915, 2699349]


In [15]:
# Fetch PMID, title and content for the PMIDs returned by the search service.
docs = get_docs_via_PMIDs(PMIDs)

Now testing implemented classes

In [9]:
# Project-local retriever class, constructed with its default arguments.
from bioBERT_retriever import BioBERTRetriever
retriever = BioBERTRetriever()

Retrieving the 3 most relevant docs

In [10]:
# Ask the BioBERT retriever for 3 documents.
# NOTE(review): the recorded output lists documents unrelated to tuberculosis,
# and the same PMIDs (2225556, 971979, 2829141) also show up in the
# retriever=1 RAG output for a different question further down — verify that
# the BioBERT retrieval path actually depends on the query.
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 3)
print(response)

{
    "doc1": {
        "PMID": 2225556,
        "title": "Disappearance of renin-induced proteinuria by an ACE-inhibitor: a case report.",
        "content": "A 55-year-old man developed renovascular hypertension that was characterized by high plasma renin activity. This was accompanied by nephrotic range proteinuria. Treatment with nifedipine and furosemide lowered the blood pressure to normal values, but proteinuria persisted. However, treatment with an ACE-inhibitor brought resolution of the proteinuria, suggesting a role for angiotensin II in urinary protein loss."
    },
    "doc2": {
        "PMID": 971979,
        "title": "Urticaria secondary to a copper intrauterine device.",
        "content": "A 24-year-old woman developed in acute urticarial reaction secondary to a copper intrauterine contraceptive device. Allergy to copper was proven by scratch tests. The condition cleared with removal of the IUD."
    },
    "doc3": {
        "PMID": 2829141,
        "title": "Enalapril:

Now the BM25 retriever

In [7]:
from bm25_retriever import BM25Retriever

# Rebinds `retriever`; any previously created retriever object is replaced.
retriever = BM25Retriever()

In [8]:
# Same tuberculosis query as before, now answered by the BM25 retriever (3 docs).
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 3)
print(response)

{
    "doc1": {
        "PMID": 2496635,
        "title": "The epidemiologic patterns of drug-resistant Mycobacterium tuberculosis infections: a community-based study.",
        "content": "A community-based study of tuberculosis in Santa Clara County, California was conducted in order to identify community-specific determinants of drug-resistant Mycobacterium tuberculosis infections. From January 1984 through December 1986, 517 verified cases of tuberculosis were reported from the county. Drug susceptibility test results to isoniazid, streptomycin, ethambutol, and rifampin were available for 256 of the 517 cases. The frequency of resistance of M. tuberculosis isolates to one or more drugs was 27% for all cases and 25% for those who had had no previous antituberculosis treatment. Isolates from Asian immigrants had the highest frequencies of resistance (33 to 45%), and the Southeast Asian immigrants had a drug-resistant tuberculosis case rate greater than 30/100,000 population per year.

Now trying the medCPT retriever without reranking

In [1]:
# MedCPT retriever with reranking switched off via the `rerank` flag.
from medCPT_retriever import SemanticRetrieverMedCPT
retriever = SemanticRetrieverMedCPT(rerank=False)

In [2]:
# Retrieve 5 documents for the tuberculosis query without reranking.
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 5)
print(response)

{
    "doc1": {
        "PMID": 3280912,
        "title": "Antituberculosis agents.",
        "content": "Tuberculosis, once considered a problem solved, is now dramatically on the rise. New approaches to chemotherapy will hopefully help to control this again serious problem. This article reviews the current status of tuberculosis chemotherapy, including the management of drug-resistant cases."
    },
    "doc2": {
        "PMID": 1640921,
        "title": "Management of persons exposed to multidrug-resistant tuberculosis.",
        "content": "Recent outbreaks of multidrug-resistant tuberculosis (MDR-TB) have posed challenges for the management of exposed persons. This report offers suggestions for evaluating and managing persons (i.e., contacts) who have been exposed to patients with infectious MDR-TB (TB due to strains of Mycobacterium tuberculosis resistant to both isoniazid [INH] and rifampin [RIF]), provides background information on alternative preventive therapy regimens with d

Now with reranking using the medCPT cross encoder

In [1]:
# Same retriever class, this time with reranking enabled.
from medCPT_retriever import SemanticRetrieverMedCPT
retriever = SemanticRetrieverMedCPT(rerank=True)

In [2]:
# Retrieve 5 documents with reranking; in the recorded output each document
# additionally carries a "score" entry.
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 5)
print(response)

{
    "doc1": {
        "PMID": 1477244,
        "title": "Evaluation of new anti-infective drugs for the treatment and prevention of tuberculosis. Infectious Diseases Society of America and the Food and Drug Administration.",
        "content": "This guideline addresses the evaluation of new antimycobacterial drugs in the treatment and prevention (secondary prophylaxis) of infection by M. tuberculosis. Patients may be enrolled in clinical trials on the basis of clinical and/or microbiological criteria. A therapeutic regimen will likely include a combination of drugs; a randomized, active-control, comparative clinical trial is recommended. If appropriate samples can be obtained for culture during follow-up without placing the patient at unwarranted risk, the assessment of microbiological outcome is paramount. Prophylaxis will probably require a single drug, and a similar study design is preferred.",
        "score": 6.256978511810303
    },
    "doc2": {
        "PMID": 3280912,
      

In [1]:
# Hybrid retriever, constructed with its default arguments.
from hybrid_retriever import HybridRetriever
retriever = HybridRetriever()

In [2]:
# Retrieve 10 documents for a yes/no style question with the hybrid retriever.
response = retriever.retrieve_docs("Is stop codon bypass possible?", 10)
print(response)

{
    "doc1": {
        "PMID": 2010914,
        "title": "The signal for a leaky UAG stop codon in several plant viruses includes the two downstream codons.",
        "content": "Plant RNA viruses commonly exploit leaky translation termination signals in order to express internal protein coding regions. As a first step to elucidate the mechanism(s) by which ribosomes bypass leaky stop codons in vivo, we have devised a system in which readthrough is coupled to the transient expression of beta-glucuronidase (GUS) in tobacco protoplasts. GUS vectors that contain the stop codons and surrounding nucleotides from the readthrough regions of several different RNA viruses were constructed and the plasmids were tested for the ability to direct transient GUS expression. These studies indicated that ribosomes bypass the leaky termination sites at efficiencies ranging from essentially 0 to ca. 5% depending upon the viral sequence. The results suggest that the efficiency of readthrough is determine

### RAG system

Now testing the combined RAG system using retriever number 1, semantic similarity search.

In [1]:
from med_rag import MedRAG

# retriever=1 is the semantic similarity search according to the text above.
# NOTE(review): the recorded answer has an empty "used_PMIDs" list, and the
# retrieved PMIDs overlap with the BioBERT retriever output for the unrelated
# tuberculosis query — confirm that this retriever returns query-specific docs.
rag = MedRAG(retriever=1, question_type=1)

rag.get_answer("What is the mode of inheritance of Wilson's disease?")

'{"response": "Wilson\'s disease is inherited in an autosomal recessive manner.", "used_PMIDs": [], "retrieved_PMIDs": [2225556, 971979, 3718109, 1580720, 3279286, 2739114, 2542908, 2829141, 2188371, 1474545]}'

In [5]:
from med_rag import MedRAG

# Same question as above, answered with retriever option 2.
rag = MedRAG(retriever=2, question_type=1)

rag.get_answer("What is the mode of inheritance of Wilson's disease?")

'{"response": "Wilson\'s disease follows an autosomal recessive mode of inheritance.", "used_PMIDs": [838566], "retrieved_PMIDs": [838566, 2292371, 1536160, 2724779, 2332582, 1248830, 2222683, 3255247, 1517772, 789146]}'

In [6]:
from med_rag import MedRAG

# Same question again, answered with retriever option 3.
rag = MedRAG(retriever=3, question_type=1)

rag.get_answer("What is the mode of inheritance of Wilson's disease?")

'{"response": "Wilson\'s disease is inherited as an autosomal recessive trait.", "used_PMIDs": [2958567, 2129845], "retrieved_PMIDs": [741263, 2958567, 2129845, 311643, 4054179, 2611557, 717319, 1268176, 2086477, 479855]}'

In [15]:
from med_rag import MedRAG

# question_type=2 with a yes/no question; the recorded response is a bare "yes".
# NOTE(review): in the recorded output "used_PMIDs" holds strings while
# "retrieved_PMIDs" holds integers — verify the types are normalised in MedRAG.
rag = MedRAG(retriever=3, question_type=2)

rag.get_answer("Is stop codon bypass possible?")

'{"response": "yes", "used_PMIDs": ["2352429", "1814364", "611657", "3012775", "3510456", "2636391", "388349", "3837850", "2226798"], "retrieved_PMIDs": [2352429, 1814364, 3312963, 611657, 3012775, 3510456, 2636391, 388349, 3837850, 2226798]}'

In [34]:
from med_rag import MedRAG

# Retriever option 4 with question_type=1; the recorded response to this
# yes/no question is a full explanatory sentence.
rag = MedRAG(retriever=4, question_type=1)

rag.get_answer("Is stop codon bypass possible?")

'{"response": "Stop codon bypass is possible through ribosomal frameshifting at hungry codons, allowing for readthrough of stop codons and continuation of translation in a shifted reading frame.", "used_PMIDs": [3199440, 1515416, 1731076, 3477671, 1779848], "retrieved_PMIDs": [3199440, 1515416, 1731076, 2253710, 1628840, 1689389, 2439408, 3477671, 1779848, 1814364]}'

In [1]:
from med_rag import MedRAG

# Retriever option 4 with question_type=2; queried in the next cell.
rag = MedRAG(retriever=4, question_type=2)

In [2]:
# NOTE(review): as with retriever=3, the recorded "used_PMIDs" are strings
# while "retrieved_PMIDs" are integers — verify against MedRAG.
rag.get_answer("Is stop codon bypass possible?")

'{"response": "yes", "used_PMIDs": ["3199440", "1515416", "1731076", "3477671", "1779848"], "retrieved_PMIDs": [3199440, 1515416, 1731076, 2253710, 1628840, 1689389, 2439408, 3477671, 1779848, 1814364]}'