In [11]:
import dotenv
import os
import requests
import json
import re
import tiktoken
from bs4 import BeautifulSoup
from pinecone import Pinecone
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from sentence_transformers import SentenceTransformer
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
from langchain.schema import SystemMessage, HumanMessage

# Load secrets from the local .env file; each key is None when its variable is unset.
dotenv.load_dotenv(".env")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")  # used to construct the Pinecone client
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # used to construct the ChatGroq models
SERPER_API_KEY = os.getenv("SERPER_API_KEY")  # sent as the X-API-KEY header by get_URLs()

In [12]:
# Serper search endpoint. get_URLs() defines its own local `url` with the same
# value, so this module-level name is not read by the functions visible here.
url = "https://google.serper.dev/search"

In [3]:
# Connect to Pinecone and open the index that retrieve_context() queries.
# NOTE(review): this cell's execution count (3) is out of order relative to the
# surrounding cells (11, 12, 13) — confirm the notebook runs top to bottom on a fresh kernel.
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medical-data"
# index_name = "medicalbot-vdb"
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x272fb977410>

In [13]:
# Sentence-embedding model used by retrieve_context() to encode query text.
# NOTE(review): presumably the same model that embedded the indexed documents — confirm.
embedding_model = SentenceTransformer("sentence-transformers/msmarco-bert-base-dot-v5")

In [14]:
def remove_tags(text):
    """Return ``text`` with every ``<think>...</think>`` span deleted.

    Matching is non-greedy and spans newlines, so each think-block is removed
    on its own and an unclosed ``<think>`` is left untouched.
    """
    return re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

def only_p_tags(text):
    """Return the inner text of every ``<p>...</p>`` pair found in ``text``.

    Args:
        text: Raw LLM output expected to wrap each description in ``<p>`` tags.

    Returns:
        A list of the captured strings, in order of appearance; empty when no
        complete ``<p>...</p>`` pair is present.
    """
    # re.DOTALL lets "." match newlines; without it any description that spans
    # more than one line is silently skipped.
    return re.findall(r"<p>(.*?)</p>", text, flags=re.DOTALL)


def get_unique_content_only(results):
    """Flatten query results into a bulleted string without repeated passages.

    ``results`` is an iterable of query responses; each response exposes a
    ``"matches"`` sequence whose items carry ``["metadata"]["text"]``. A passage
    is skipped when its text already occurs anywhere inside the string built so
    far (a substring test, not an exact-match test).

    Returns:
        One ``"- <text>\\n"`` line per kept passage, concatenated.
    """
    bullet_block = ''
    passages = (
        hit['metadata']['text']
        for response in results
        for hit in response["matches"]
    )
    for passage in passages:
        if passage in bullet_block:
            continue
        bullet_block += f"- {passage}\n"
    return bullet_block

def retrieve_context(description):
    """Query the vector index once per description and collect the responses.

    Each entry of ``description`` is encoded with the module-level
    ``embedding_model`` and sent to the module-level ``index`` with ``top_k=5``
    and metadata included.

    Returns:
        A list holding one query response per description, in input order.
    """
    def _lookup(passage):
        # Encode the passage, then fetch its five nearest neighbours.
        vector = embedding_model.encode(passage).tolist()
        return index.query(vector=vector, top_k=5, include_metadata=True)

    return [_lookup(passage) for passage in description]

    

In [15]:
# Primary Groq-hosted chat model; the prompt functions below pipe it into
# StrOutputParser() (`chat | StrOutputParser()`).
model_name = "deepseek-r1-distill-llama-70b"
# Alternative models that were tried:
# model_name = "llama-3.3-70b-versatile"
# model_name = "qwen-2.5-32b"


chat = ChatGroq(
api_key = GROQ_API_KEY,
model_name = model_name
)

In [23]:
# Second chat client, used by get_interpretations_list(); it is configured with
# the same model string as `chat`.
model2 = "deepseek-r1-distill-llama-70b"

chat2 = ChatGroq(
api_key = GROQ_API_KEY,
model_name = model2
)   

# Tokenizer used by chunk_text() to count tokens per word.
# NOTE(review): cl100k_base is a tiktoken encoding; counts for the Groq-hosted
# model are presumably only approximate — confirm the chunk budget leaves headroom.
tokenizer = tiktoken.get_encoding("cl100k_base")  # Adjust for your LLM

In [24]:
def chunk_text(text, max_tokens=4500, encode=None):
    """Split ``text`` into space-joined chunks of at most ``max_tokens`` tokens.

    The text is split on whitespace and words are packed greedily; a chunk is
    closed as soon as adding the next word would exceed the budget. Splits
    happen on word boundaries only (sentences may be cut). Token counts are
    summed per word, so the count of a joined chunk is an estimate.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk.
        encode: Optional callable mapping a word to a sequence of tokens.
            Defaults to the module-level ``tokenizer.encode``.

    Returns:
        A list of non-empty chunk strings. A single word that alone exceeds
        ``max_tokens`` becomes its own (oversized) chunk.
    """
    if encode is None:
        encode = tokenizer.encode

    chunks = []
    current_chunk = []
    token_count = 0

    for word in text.split():
        word_tokens = len(encode(word))

        # Only flush a non-empty buffer: flushing an empty one (first word
        # already over budget) would emit a useless "" chunk.
        if current_chunk and token_count + word_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            token_count = 0

        current_chunk.append(word)
        token_count += word_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def scrape_and_extract(url, timeout=10):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The response text (unparsed HTML), or ``None`` when the request fails
        or the server answers with an error status.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        return response.text
    except requests.RequestException as e:
        # Best-effort scraping: report the failure and let the caller skip this URL.
        print(f"Error fetching {url}: {e}")
        return None

def clean_text(text):
    """Convert raw HTML into a single line of visible text.

    Steps:
    - Drop ``<script>`` and ``<style>`` elements so their code does not end up
      in the extracted text.
    - Strip the remaining tags, separating text nodes with a space so words
      from adjacent elements do not fuse together.
    - Collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: HTML (or plain text) to clean; may be empty or ``None``.

    Returns:
        The cleaned text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    soup = BeautifulSoup(text, "html.parser")

    # Script and style bodies are not page content.
    for element in soup(["script", "style"]):
        element.decompose()

    # A separator keeps "<p>a</p><p>b</p>" from becoming "ab".
    visible_text = soup.get_text(separator=" ")

    # Remove extra spaces, newlines, and tabs
    return re.sub(r"\s+", " ", visible_text).strip()

def get_URLs(test_name, num_results=2, timeout=10):
    """Search the web via Serper and return the result links.

    Sends the query ``"How to interpret {test_name} report"`` to the Serper
    search endpoint, authenticated with the module-level ``SERPER_API_KEY``.

    Args:
        test_name: Name of the medical test to search for.
        num_results: Number of results requested from the API.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A list of URLs taken from the ``"organic"`` results. The list is empty
        when the request fails, the response is not valid JSON, or there are
        no organic results.
    """
    search_query = f"How to interpret {test_name} report"

    payload = json.dumps({
        "q": search_query,
        "num": num_results
    })
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }
    url = "https://google.serper.dev/search"

    try:
        response = requests.post(url, headers=headers, data=payload, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Same best-effort policy as scrape_and_extract(): report and continue.
        print(f"Error searching for {search_query!r}: {e}")
        return []

    # Extract URLs from the "organic" search results, skipping entries without a link.
    return [entry["link"] for entry in data.get("organic", []) if "link" in entry]

In [29]:

def get_interpretations_list(test_name,urls):
    """Extract test-relevant passages from a set of web pages with the LLM.

    Each URL is downloaded with ``scrape_and_extract`` and cleaned with
    ``clean_text``; pages that fail to download are skipped. The cleaned pages
    are joined, split with ``chunk_text``, and every chunk is sent to ``chat2``
    with an instruction to return only content useful for interpreting a
    ``test_name`` report.

    Args:
        test_name: Name of the medical test, interpolated into the prompt.
        urls: Iterable of page URLs to scrape.

    Returns:
        A list with one model answer per chunk, with ``<think>`` blocks
        removed. Empty when no page yielded any text.
    """
    extracted_texts = []
    for url in urls:
        content = scrape_and_extract(url)
        if content:
            extracted_texts.append(clean_text(content))

    web_content = "\n\n".join(extracted_texts)

    responses = []
    for chunk in chunk_text(web_content):
        messages = [
            SystemMessage(content="You are a medical expert providing the relavent content from the given one."),
            HumanMessage(content=f"""Based on the following information:\n\n{chunk}\n\n extract the information
        that can help in interpreting the medical report related to {test_name} (test). Don't type 
        anything else. Just provide the relavent information from the content nothing else, if there
        is nothing relevant then just response with there is nothing helpful but dont type anything from your knowledge.""")
        ]

        # Use the Runnable interface (.invoke) like the rest of this notebook;
        # calling the chat model object directly is the deprecated form.
        response = chat2.invoke(messages)
        responses.append(remove_tags(response.content))

    return responses

# Example usage: search for pages about the test, then have the LLM extract the
# test-relevant passages chunk by chunk.
# Note: get_URLs() and get_interpretations_list() perform live network requests.
test_name = "Dengue Hemorrhagic Fever"
urlss = get_URLs(test_name)
interpretation = get_interpretations_list(test_name,urlss)
print(interpretation)


['\n\n- Dengue hemorrhagic fever (DHF) is defined by four clinical criteria: fever, hemorrhagic tendency (spontaneous bleeding or a positive tourniquet test result), thrombocytopenia (platelet count ≤100,000 cells/mm³), and plasma leakage (pleural effusion, ascites, or ≥20% hemoconcentration).  \n- The diagnosis of DHF does not require laboratory evidence of dengue virus infection.  \n- The 2009 World Health Organization (WHO) classification replaced the previous dengue fever (DF) and DHF categories with "dengue" and "severe dengue."  \n- Severe dengue is defined as dengue with plasma leakage leading to shock or respiratory distress, severe bleeding, or organ failure (e.g., elevated liver enzyme levels, impaired consciousness, or heart failure).  \n- The previous DHF classification was based on objective findings, while the 2009 classification relies more on clinical judgment and severity.  \n- The tourniquet test was used as a screening tool for DHF in some settings.', '\n\n- **IgM An

In [31]:
# Total number of characters across all extracted interpretations.
z = sum(len(piece) for piece in interpretation)
print(z)

4969


In [63]:
def summarize_web_content(list_of_interpretations, test_name):
    """Condense the per-chunk extracts into one summary using the LLM.

    Args:
        list_of_interpretations: Strings produced by ``get_interpretations_list``.
        test_name: Name of the medical test, interpolated into the prompt.

    Returns:
        The model's summary with ``<think>`` blocks removed, or ``""`` when
        there is no non-blank extract to summarise (no LLM call is made then,
        so the model is never asked to summarise empty content).
    """
    usable = [piece.strip() for piece in list_of_interpretations if piece and piece.strip()]
    if not usable:
        return ""

    # Separate extracts with a blank line so the last word of one extract does
    # not run into the first word of the next.
    interpretations = "\n\n".join(usable)
    chain = chat | StrOutputParser()

    prompt = f"""You are a medical expert summarize the provided content to the max of 1000 words
    that can be helpful in interpreting the medical report related to {test_name} (test).Make sure that
    the length of summarization not exceed 1000 words.
    content: {interpretations}"""

    response = chain.invoke(prompt)
    return remove_tags(response)

In [64]:
# Condense the per-chunk extracts into a single summary string.
texttt = summarize_web_content(interpretation, test_name)

In [65]:
# Display the summary text.
texttt

'\n\n### Summary of Dengue Hemorrhagic Fever (DHF) and Diagnostic Interpretation\n\nDengue Hemorrhagic Fever (DHF) is a severe form of dengue infection characterized by four key clinical criteria: fever, hemorrhagic tendency, thrombocytopenia (platelet count ≤100,000 cells/mm³), and plasma leakage (evidenced by pleural effusion, ascites, or ≥20% hemoconcentration). The diagnosis of DHF does not require laboratory confirmation of dengue virus infection, as it is primarily based on clinical findings. However, the 2009 World Health Organization (WHO) classification simplified the categorization of dengue infections into "dengue" and "severe dengue," with severe dengue defined by plasma leakage leading to shock or respiratory distress, severe bleeding, or organ failure.\n\n#### Diagnostic Tests for Dengue Infection\n\n1. **IgM Antibody Testing**:\n   - IgM antibodies are recommended for use in combination with nucleic acid amplification tests (NAAT) or NS1 antigen tests during the first 7 

In [66]:
# Summary length in characters (the prompt caps the summary at 1000 words).
len(texttt)

5442

In [None]:

def generate_refined_prompt(query,type,disease):
    """Ask the LLM for textual descriptions of a lab report, for use as RAG queries.

    Args:
        query: The lab report text.
        type: The test type (the parameter name shadows the builtin ``type``).
        disease: The disease suspected by the patient.

    Returns:
        The list of strings found between ``<p>`` and ``</p>`` in the model's
        answer, after ``<think>`` blocks have been removed. The prompt requests
        two descriptions.
    """
    llm_pipeline = chat | StrOutputParser()

    instruction = f"""You are an expert doctor. I have provided the test type, 
    the suspected disease (if mentioned by the patient), and the lab report. Since the lab report 
    primarily consists of numerical values, retrieving relevant context from my RAG system is challenging.
    Your task is to generate a maximum of 200-300 words textual summary that best represents this report,
    incorporating key abnormalities or notable findings. Provide 2 descriptions which are inside the     
    <p>*</p> tag (don't use any other mode for separating them). This summary will be used to fetch the 
    most relevant medical context from my RAG, which, along with the report, will help the LLM provide an 
    accurate interpretation.

    For your information so that you can write the desrciption in a familiar way, the data i have scrapped 
    is from Testing.com, medlinplus.com and some books.
    Don't write anything else, other than the relavant interpretation.
    :
    
    Type: {type}
    Diesease: {disease}
    Report: {query}
        
    Answer:
    """

    raw_answer = llm_pipeline.invoke(instruction)
    # Strip the reasoning block first, then keep only the <p>-wrapped descriptions.
    return only_p_tags(remove_tags(raw_answer))

In [None]:

def discard_irrelevant_context(test_name,normal_ranges,retrieved_context,report):
    """Use the LLM to keep only the retrieved passages relevant to a test.

    Args:
        test_name: Name of the medical test.
        normal_ranges: Reference ranges to fall back on, or ``None`` when none
            are available (rendered as "Not provided" in the prompt).
        retrieved_context: Text retrieved from the vector index.
        report: The medical report being interpreted.

    Returns:
        The filtered context with ``<think>`` blocks removed, or ``""`` when
        ``retrieved_context`` is empty or blank (no LLM call is made then,
        because there is nothing to filter).
    """
    if not retrieved_context or not str(retrieved_context).strip():
        return ""

    # Avoid injecting the literal string "None" into the prompt.
    ranges_text = "Not provided" if normal_ranges is None else normal_ranges

    # Construct prompt
    prompt = f"""You are an expert doctor. Your task is to extract only the most relevant 
    information from the provided medical context and discard anything unrelated.  

### Given Information:
- **Test Type:** {test_name}  
- **Medical Report:** {report}  
- **Normal Ranges:** {ranges_text}  
- **Retrieved Context:** {retrieved_context}  

### Instructions:
- Read the retrieved context carefully.  
- Identify the information that is directly relevant to the test {test_name} and its normal ranges (use the
 ranges stated in the medical report when it contains them; otherwise fall back to the provided normal ranges).  
- Extract only the relevant paragraphs and discard anything that is not directly related.  
- Ensure that the extracted content provides useful insights about the given test.  

### Output Format:
Return the **filtered context** as clean paragraphs that are relevant to {test_name}. Do not include any unrelated information, just provide the paragraphs nothing else.  

"""

    chain = chat | StrOutputParser()

    response = chain.invoke(prompt)
    return remove_tags(response)  # final paragraphs

In [None]:
# memory = ConversationBufferMemory()
# conversation = ConversationChain(
#     llm=chat2,
#     memory=memory
# )

In [None]:

def generate_final_output(report,type,disease,generated_text,context):
    """Produce the short layman interpretation of a lab report with the LLM.

    Args:
        report: The lab report text.
        type: The test type (the parameter name shadows the builtin ``type``).
        disease: The disease the patient suspects.
        generated_text: Extra, possibly unreliable context; interpolated into
            the prompt as-is.
        context: Filtered context retrieved from the RAG index.

    Returns:
        The model's answer with ``<think>`` blocks removed.
    """
    llm_pipeline = chat | StrOutputParser()

    instruction = f"""You are a expert doctor. You have to interpret the medical lab report of the patient. I have provided 
    you the lab report, the test type, the disease which the patient thinks he is suffereing from and some context which may assist you in interpreting the report.
    Interpret the report in layman understandable form in just 2 lines, not more than that and donot write anything else other than the interpretation. If you think that
    the disease he thinks he is suffering from do not match the report, you can mention that as well and recommend the possible diseases.In case the Context is not benificial,
    you can ignore it and answer from your own knowledge. Also score your answer out of 10, but just the number in the markdown form, nothing else. The answer should not be more than 2 to 3 lines.
    :
        
    Context: {context}
    Type: {type}
    Disease: {disease}
    Report: {report}
    Random Context (it can be wrong): {generated_text}
    Answer:
    """

    return remove_tags(llm_pipeline.invoke(instruction))


In [None]:
# Inputs for the end-to-end run below.
# NOTE(review): test_name carries leading/trailing spaces here, unlike the value
# in the earlier example cell; it is interpolated into the prompts as-is.
test_name = " Dengue Hemorrhagic Fever "
disease = "dengue"

# Sample input passed to the pipeline as the "report": a tab-separated table
# with DF and DHF columns, counts/percentages, N and p-values.
# NOTE(review): this looks like aggregate cohort statistics rather than one
# patient's lab report, which is what the prompts describe — confirm intent.
report = '''Characteristic	DF¹ (%)	DHF¹ (%)	N	p-value²
Demographics				
Age (median ± interquartile range)	26.0 ± 19.0	24.5 ± 18.0	201	0.905
Gender				
Male	99 (61.5)	30 (75.0)	201	0.077
Female	62 (38.5)	10 (25.0)		
Clinical Presentation				
Fever	159/161 (98.8)	39/39 (100.0)	200	0.647
Nausea and/or vomiting	93/161 (57.8)	27/40 (67.5)	201	0.173
Rash	44/161 (27.3)	31/40 (77.5)	201	<0.001
Abdominal Pain	6/86 (7.0)	1/11 (9.1)	97	0.582
Diarrhea	26/161 (16.2)	5/40 (12.5)	201	0.384
Myalgia	46/161 (28.6)	6/40 (15.0)	201	0.056
Headache	21/161 (13.0)	2/40 (5.0)	201	0.120
Cough	22/161 (13.7)	1/40 (2.5)	201	0.033
Hemorrhage	6/161 (3.7)	29/40 (72.5)	201	<0.001
Temperature (median)	38.00 ± 2.00	37.00 ± 1.00	177	0.014
Laboratory Findings				
Thrombocytopenia at presentation	116/151 (76.8)	36/40 (90.0)	191	0.047
Received platelet transfusion	25/90 (27.8)	13/32 (40.6)	122	0.131
Low Hemoglobin	16/137 (11.7)	8/37 (21.6)	174	0.102
Hematocrit level on admission (median)	39.50 ± 6.75	40.40 ± 8.80	167	0.607
Leukopenia	64/141 (45.4)	9/37 (24.3)	178	0.015
Neutropenia	35/133 (26.3)	16/34 (47.1)	167	0.018
Lymphocytosis	30/132 (22.7)	7/33 (21.2)	165	0.529
Monocytosis	32/122 (26.2)	19/33 (57.6)	155	0.001
Raised ALT	63/110 (57.3)	27/32 (84.4)	142	0.004
Raised AST	68/79 (86.1)	25/26 (96.2)	105	0.147'''


In [None]:
# End-to-end run of the RAG pipeline on the sample report.
# Step 1: turn the report into textual descriptions usable as retrieval queries.
generated_text = generate_refined_prompt(report,test_name,disease) # list of 2 descriptions
print("Generated text: ",generated_text)
# Step 2: query the vector index once per description.
retrieved_content = retrieve_context(generated_text)
print("Retrieved content: ",retrieved_content)
# Step 3: flatten the matches into a de-duplicated bulleted string.
unique_content = get_unique_content_only(retrieved_content)
print("Unique content: ",unique_content)
# Step 4: let the LLM drop passages unrelated to the test.
# NOTE(review): normal_ranges is passed as None here.
context = discard_irrelevant_context(test_name,None,unique_content,report)
print("Context: ",context)
# Step 5: final interpretation. The web summary `texttt` computed earlier is
# not passed into this call.
response = generate_final_output(report,test_name,disease,generated_text,context)
print("Response: ",response)

In [None]:
# 22.6 / 15.0 / 18.2
# 10
# 4.5
# 4.4
# 2.7

In [None]:
# report = """Complete Blood Count (CBC) Report
# Test	Result	Reference Range	Units	Flag
# White Blood Cell Count (WBC)	7.5	4.0 - 11.0	x10³/μL	Normal
# Red Blood Cell Count (RBC)	3	4.5 - 5.9	x10⁶/μL	Abnormal
# Hemoglobin (Hgb)	15.0	13.5 - 17.5	g/dL	Normal
# Hematocrit (Hct)	45	40 - 52	%	Normal
# Mean Corpuscular Volume (MCV)	88	80 - 100	fL	Normal
# Mean Corpuscular Hemoglobin (MCH)	29	27 - 34	pg	Normal
# Mean Corpuscular Hemoglobin Concentration (MCHC)	34	32 - 36	g/dL	Normal
# Red Cell Distribution Width (RDW)	13.5	11.5 - 14.5	%	Normal
# Platelet Count	250	150 - 450	x10³/μL	Normal
# Neutrophils	35	40 - 70	%	Abnormal
# Lymphocytes	10	20 - 40	%	Abnormal
# Monocytes	6	2 - 10	%	Normal
# Eosinophils	3	1 - 4	%	Normal
# Basophils	1	0 - 2	%	Normal"""