In [1]:
import dotenv
import os
import pandas as pd
from pinecone import Pinecone
import fitz
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from sentence_transformers import SentenceTransformer

# Number of whitespace-separated words per text chunk; used as the default
# chunk_size of create_text_chunks.
CHUNK_SIZE = 128

# Load variables from the local .env file, then read the API keys.
# os.getenv returns None when a variable is not set.
dotenv.load_dotenv(".env")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")


  from tqdm.autonotebook import tqdm


In [2]:
# Pinecone client created with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)


In [3]:
# Handle to the Pinecone index named below; later cells upsert into it and
# query it through this module-level `index` variable.
index_name = "medicalbot-vdb"
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x2005a761b50>

# FILES TO CSV

In [4]:

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        The text of all pages in page order, pages separated by a newline.
    """
    # The context manager closes the document even when extraction raises;
    # previously the document handle was never closed.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

# Function to split text into chunks
def create_text_chunks(text, chunk_size=CHUNK_SIZE):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are the pieces produced by ``str.split()``; every chunk is re-joined
    with single spaces, so the original whitespace is not preserved.

    Returns:
        List of chunk strings in document order (empty when ``text`` has no
        words).
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        chunks.append(" ".join(window))
    return chunks

# Function to extract headings and map content to them
def extract_headings_and_text(text):
    """Group the lines of ``text`` under the heading that precedes them.

    A line counts as a heading when, after stripping surrounding whitespace,
    it is upper-case and longer than three characters (a heuristic). All other
    non-blank lines are collected as content of the most recent heading.

    Args:
        text: Full document text with lines separated by a newline.

    Returns:
        List of ``(heading, content)`` tuples in document order. ``heading`` is
        ``None`` for content that appears before the first heading. ``content``
        is the section's lines joined with single spaces, with every run of
        whitespace collapsed to one space. A heading that has no content of
        its own before the next heading is not emitted.
    """
    extracted_data = []
    current_heading = None
    buffer = []

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            # Blank lines carry no content and would only inject extra spaces.
            continue

        # Test the stripped line so padding cannot push a short token such as
        # "AB  " over the length threshold.
        if line.isupper() and len(line) > 3:  # Heuristic for headings
            if buffer:
                extracted_data.append((current_heading, " ".join(buffer)))
                buffer = []
            current_heading = line
        else:
            # Collapse internal whitespace so content uses the same
            # single-space form as word-based chunks built from this text.
            buffer.append(" ".join(line.split()))

    if buffer:
        extracted_data.append((current_heading, " ".join(buffer)))

    return extracted_data

# Function to create a structured DataFrame
def create_dataframe_from_text_files(txt_folder):
    """Build a table of text chunks from the ``.txt`` files in a folder.

    Every file is read as UTF-8, split into word chunks with
    ``create_text_chunks`` and each chunk is labelled with the heading of the
    section (from ``extract_headings_and_text``) that fully contains it.

    Args:
        txt_folder: Directory holding the text files. Files whose name starts
            with a number (``"12.txt"``) are processed in numeric order; other
            ``.txt`` files follow in alphabetical order. Entries that are not
            regular ``.txt`` files are ignored.

    Returns:
        ``pandas.DataFrame`` with the columns ``"Test Name"``,
        ``"Description"``, ``"Source"`` and ``"URL"``, one row per chunk.
        Chunks that match no headed section keep the placeholders
        ``"No_testName"`` / ``"No_Source"``.
    """
    columns = ["Test Name", "Description", "Source", "URL"]

    def _sort_key(name):
        # Numeric prefixes sort numerically and first; a name without one used
        # to crash the whole run with ValueError, now it simply sorts after.
        try:
            return (0, int(name.split(".")[0]), name)
        except ValueError:
            return (1, 0, name)

    txt_files = sorted(
        (
            name
            for name in os.listdir(txt_folder)
            if name.lower().endswith(".txt")
            and os.path.isfile(os.path.join(txt_folder, name))
        ),
        key=_sort_key,
    )

    data_list = []
    for txt_file in txt_files:
        txt_path = os.path.join(txt_folder, txt_file)
        with open(txt_path, "r", encoding="utf-8") as f:
            text_data = f.read()

        # Chunks are words joined by single spaces, so section content is
        # normalised the same way (once per file) before the substring test.
        sections = [
            (heading, " ".join(content.split()))
            for heading, content in extract_headings_and_text(text_data)
        ]

        for chunk in create_text_chunks(text_data):
            test_name, source = "No_testName", "No_Source"

            # A chunk that straddles a heading is contained in no single
            # section and therefore keeps the placeholders.
            for heading, content in sections:
                if chunk in content:
                    # Text before the first heading has heading None; keep the
                    # placeholders instead of writing an empty label.
                    if heading is not None:
                        test_name = heading
                        source = heading
                    break

            data_list.append({
                "Test Name": test_name,
                "Description": chunk,
                "Source": source,
                "URL": "No_URl"
            })

    # Explicit columns keep the schema stable even when no chunk was produced.
    return pd.DataFrame(data_list, columns=columns)

# Function to save DataFrame to a CSV file
def save_dataframe(df, output_path):
    """Write ``df`` to ``output_path`` as a UTF-8 CSV without the index column."""
    csv_options = {"index": False, "encoding": "utf-8"}
    df.to_csv(output_path, **csv_options)

# Main execution
# Build the chunk table from the text files in txt_folder and write it to
# "structured_data.csv" (a path relative to the current working directory).
txt_folder = "../DATA/TextFiles"
df = create_dataframe_from_text_files(txt_folder)
save_dataframe(df, "structured_data.csv")

print("Data processing complete. CSV file saved.")


Data processing complete. CSV file saved.


# Uploading the embedded DataFrames to Pinecone

In [28]:

def upsert_data_to_pinecone(df, id_prefix=""):
    """Convert an embedded DataFrame into a list of Pinecone upsert records.

    Despite its name, this function sends nothing to Pinecone; it only builds
    the records (``upload_In_Batches`` performs the upload).

    Args:
        df: DataFrame with the columns ``"Test Name"``, ``"Source"``,
            ``"URL"``, ``"text"`` and ``"tokens"``. ``"tokens"`` holds the
            embedding either as a list or as the string form of a list (as
            read back from CSV). The caller's frame is not modified.
        id_prefix: Optional string prepended to every record id. Ids are built
            from the row index, so frames with overlapping indexes need
            distinct prefixes to avoid overwriting each other in one index.

    Returns:
        List of dicts ``{"id", "values", "metadata"}``, one per row, where
        ``metadata`` has the keys ``test_name``, ``source``, ``url``, ``text``.

    Raises:
        ValueError / SyntaxError: If a ``"tokens"`` string is not a valid
            Python literal.
    """
    import ast  # local import: only needed to parse the serialized vectors

    def _parse_vector(value):
        # Already parsed (e.g. the function is called again on a parsed frame).
        if isinstance(value, list):
            return value
        # literal_eval accepts only Python literals; eval would execute any
        # expression stored in the CSV.
        return ast.literal_eval(value)

    # Work on a copy so that calling this twice on the same frame is safe.
    df = df.copy()
    # Ensure the "tokens" column is in the correct format (list of floats)
    df["tokens"] = df["tokens"].apply(_parse_vector)
    print("1: ",len(df))
    # Pinecone metadata values must not be NaN: replace missing cells with "".
    df = df.fillna("")
    print("1: ",len(df))
    print("Uploading data to Pinecone...")
    data_to_upsert = []
    for idx, row in df.iterrows():
        metadata = {
            "test_name": row["Test Name"],
            "source": row["Source"],
            "url": row["URL"],
            "text": row["text"]
        }
        data_to_upsert.append(
            {"id": f"{id_prefix}{idx}", "values": row["tokens"], "metadata": metadata}
        )

    return data_to_upsert


In [29]:
def upload_In_Batches(data_to_upsert, batch_size=100, target_index=None):
    """Upsert records into a Pinecone index in fixed-size batches.

    Args:
        data_to_upsert: List of upsert records (dicts with ``id``, ``values``
            and ``metadata``).
        batch_size: Maximum number of records per ``upsert`` call.
        target_index: Object exposing ``upsert(vectors=...)``. Defaults to the
            notebook-level ``index`` handle.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative step would
            otherwise upload nothing and still report success).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if target_index is None:
        target_index = index

    total_records = len(data_to_upsert)
    # Ceiling division: "n // size + 1" over-counts by one whenever the number
    # of records is an exact multiple of the batch size.
    total_batches = -(-total_records // batch_size)

    # Batch upload
    for batch_number, start in enumerate(range(0, total_records, batch_size), start=1):
        batch = data_to_upsert[start:start + batch_size]
        target_index.upsert(vectors=batch)
        print(f"Uploaded batch {batch_number} of {total_batches}")

    print("Data upload complete!")

In [30]:
# Load your DataFrame
# Load the embedded CSVs, one per data source. upsert_data_to_pinecone reads
# the columns "Test Name", "Source", "URL", "text" and "tokens" from each.
# Note: `df` rebinds the name used earlier for the structured-data frame.
df = pd.read_csv("../DATA/Embedded_Files/medlinePlus_tokens.csv")  # Replace with your file path
df2 = pd.read_csv("../DATA/Embedded_Files/testing_com_tokens.csv")  # Replace with your file path
df3 = pd.read_csv("../DATA/Embedded_Files/files_tokens.csv")  # Replace with your file path

In [31]:
# Build one list of upsert records per source frame (nothing is uploaded yet).
# NOTE(review): record ids come from each frame's own row index, so they are
# only unique within a single list, not across the three lists.
data_to_upsert = upsert_data_to_pinecone(df)
data_to_upsert2 = upsert_data_to_pinecone(df2)
data_to_upsert3 = upsert_data_to_pinecone(df3)

1:  2803
1:  2803
Uploading data to Pinecone...
1:  3449
1:  3449
Uploading data to Pinecone...
1:  2906
1:  2906
Uploading data to Pinecone...


In [32]:
# Sanity check: number of records prepared for each source.
print(len(data_to_upsert))
print(len(data_to_upsert2))
print(len(data_to_upsert3))

2803
3449
2906


In [33]:
# Each list was built from its own DataFrame, whose row index restarts at 0, so
# the raw ids ("0", "1", ...) repeat across the three sources. An upsert
# overwrites an existing record with the same id, so plain concatenation would
# leave only the last source's vector for every shared id. Namespace the ids
# per source while merging; the source lists themselves are left unchanged.
merged_data_to_upsert = [
    {**record, "id": f"{source}-{record['id']}"}
    for source, records in (
        ("medlineplus", data_to_upsert),
        ("testing_com", data_to_upsert2),
        ("files", data_to_upsert3),
    )
    for record in records
]
len(merged_data_to_upsert)

9158

In [34]:
# Send every prepared record to the Pinecone index, using the function's
# default batch size.
upload_In_Batches(merged_data_to_upsert)

Uploaded batch 1 of 92
Uploaded batch 2 of 92
Uploaded batch 3 of 92
Uploaded batch 4 of 92
Uploaded batch 5 of 92
Uploaded batch 6 of 92
Uploaded batch 7 of 92
Uploaded batch 8 of 92
Uploaded batch 9 of 92
Uploaded batch 10 of 92
Uploaded batch 11 of 92
Uploaded batch 12 of 92
Uploaded batch 13 of 92
Uploaded batch 14 of 92
Uploaded batch 15 of 92
Uploaded batch 16 of 92
Uploaded batch 17 of 92
Uploaded batch 18 of 92
Uploaded batch 19 of 92
Uploaded batch 20 of 92
Uploaded batch 21 of 92
Uploaded batch 22 of 92
Uploaded batch 23 of 92
Uploaded batch 24 of 92
Uploaded batch 25 of 92
Uploaded batch 26 of 92
Uploaded batch 27 of 92
Uploaded batch 28 of 92
Uploaded batch 29 of 92
Uploaded batch 30 of 92
Uploaded batch 31 of 92
Uploaded batch 32 of 92
Uploaded batch 33 of 92
Uploaded batch 34 of 92
Uploaded batch 35 of 92
Uploaded batch 36 of 92
Uploaded batch 37 of 92
Uploaded batch 38 of 92
Uploaded batch 39 of 92
Uploaded batch 40 of 92
Uploaded batch 41 of 92
Uploaded batch 42 of 92
U

# DATA RETRIEVAL AND RESPONSE GENERATION

In [4]:
import re

def remove_tags(text):
    """Return ``text`` with every ``<think>...</think>`` section removed.

    Matching is non-greedy and spans newlines, so each closed section is
    removed separately; an opening tag without a closing tag is left as is.
    """
    return re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

def only_p_tags(text):
    """Extract the contents of every ``<p>...</p>`` block in ``text``.

    Args:
        text: Model output that wraps each description in ``<p>`` tags.

    Returns:
        List of the paragraph bodies in order of appearance, with surrounding
        whitespace removed and empty paragraphs dropped. Empty list when
        ``text`` contains no closed ``<p>`` block.
    """
    # DOTALL lets "." match newlines; without it a paragraph that spans more
    # than one line is silently skipped.
    paragraphs = re.findall(r"<p>(.*?)</p>", text, flags=re.DOTALL)
    descriptions = [body.strip() for body in paragraphs]
    return [body for body in descriptions if body]

In [29]:

# Load embedding model
# Sentence-embedding model used by retrieve_context to embed query text.
# NOTE(review): query vectors must have the same dimension as the vectors
# stored in the index - confirm the uploaded embeddings came from this model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Small and efficient
# Other Groq model ids, currently disabled:
# model_name = "deepseek-r1-distill-llama-70b"
# model_name = "llama-3.3-70b-versatile"
model_name = "mixtral-8x7b-32768"

# Groq chat client, built once here and reused by the prompt functions below.
# NOTE(review): confirm the selected model id is still served by Groq.
chat = ChatGroq(
api_key = GROQ_API_KEY,
model_name = model_name
)   

In [30]:

def get_unique_content_only(results):
    """Flatten query responses into a bullet list of non-repeated passages.

    Args:
        results: Iterable of query responses; each exposes ``["matches"]``,
            whose items carry the passage under ``["metadata"]["text"]``.

    Returns:
        One string with a ``"- <text>"`` line (newline-terminated) per kept
        passage, in first-seen order. A passage is skipped when its text
        already occurs anywhere in the output built so far (substring check).
    """
    bullets = []
    for response in results:
        for hit in response["matches"]:
            passage = hit["metadata"]["text"]
            if passage in "".join(bullets):
                continue
            bullets.append(f"- {passage}\n")
    return "".join(bullets)

def retrieve_context(description):
    """Run one vector search per query description.

    Args:
        description: Sequence of query strings.

    Returns:
        List with one ``index.query`` response per description, in input
        order. Each query embeds the text with ``embedding_model`` and asks
        for the 5 nearest matches including their metadata.
    """
    return [
        index.query(
            vector=embedding_model.encode(query_text).tolist(),
            top_k=5,
            include_metadata=True,
        )
        for query_text in description
    ]

    
def generate_final_output(report,type,disease,generated_text,context):
    """Ask the chat model for a short layman interpretation of a lab report.

    Args:
        report: Lab report text.
        type: Test type (the parameter name shadows the builtin; kept for
            interface compatibility).
        disease: Disease the patient suspects.
        generated_text: Model-generated descriptions, inserted into the prompt
            as "Random Context".
        context: Retrieved context inserted into the prompt.

    Returns:
        The model's answer as a string with ``<think>`` sections removed.
    """
    # The prompt is spelled out line by line so that its exact whitespace
    # (including the trailing space on the first line) is visible.
    prompt = (
        "You are a expert medical lab technician. You have to interpret the medical lab report of the patient. I have provided \n"
        "    you the lab report, the test type, the disease which the patient thinks he is suffereing from and some context which may assist you in interpreting the report.\n"
        "    Interpret the report in layman understandable form in just 2 lines, not more than that and donot write anything else other than the interpretation. If you think that\n"
        "    the disease he thinks he is suffering from do not match the report, you can mention that as well and recommend the possible diseases.In case the Context is not benificial,\n"
        "    you can ignore it and answer from your own knowledge. Also score your answer out of 10, but just the number in the markdown form, nothing else. The answer should not be more than 2 to 3 lines.\n"
        "    :\n"
        "        \n"
        f"    Context: {context}\n"
        f"    Type: {type}\n"
        f"    Disease: {disease}\n"
        f"    Report: {report}\n"
        f"    Random Context (it can be wrong): {generated_text}\n"
        "    Answer:\n"
        "    "
    )

    # Pipe the shared chat client into a plain-string parser and drop any
    # <think> section from the answer.
    answer = (chat | StrOutputParser()).invoke(prompt)
    return remove_tags(answer)


In [31]:

def generate_refined_prompt(query,type,disease):
    """Ask the chat model for textual descriptions of a numeric lab report.

    The descriptions are used as retrieval queries, because the raw report
    consists mostly of numbers.

    Args:
        query: Lab report text.
        type: Test type (the parameter name shadows the builtin; kept for
            interface compatibility).
        disease: Disease the patient suspects.

    Returns:
        List of description strings taken from the ``<p>...</p>`` blocks of
        the model's answer (the prompt asks for 3). When the answer contains
        no such block, the whole cleaned answer is returned as a single
        description so that retrieval is not silently skipped; the list is
        empty only when the answer itself is empty.
    """
    # Prompt text is unchanged; it is spelled out line by line so that its
    # whitespace-only lines are explicit.
    prompt = (
        "You are an expert medical lab technician. I have provided the test type, the suspected disease (if mentioned by the patient), and the lab report. Since the lab report primarily consists of numerical values, retrieving relevant context from my RAG system is challenging. Your task is to generate a concise 2-3 line textual summary that best represents this report, incorporating key abnormalities or notable findings. Provide 3 descriptions which are inside the <p>*</p> tag (don't use any other mode for separating them). This summary will be used to fetch the most relevant medical context from my RAG, which, along with the report, will help the LLM provide an accurate interpretation.\n"
        "    For more context, the data i have scrapped is from Testing.com, medlinplus.com and some books.\n"
        "    Don't write anything else, other than the relavant interpretation.\n"
        "    :\n"
        "    \n"
        f"    Type: {type}\n"
        f"    Diesease: {disease}\n"
        f"    Report: {query}\n"
        "        \n"
        "    Answer:\n"
        "    "
    )

    chain = chat | StrOutputParser()

    cleaned = remove_tags(chain.invoke(prompt))
    descriptions = only_p_tags(cleaned)

    # The model does not always follow the <p> format. An empty list here
    # would mean zero retrieval queries further down the pipeline.
    if not descriptions and cleaned.strip():
        descriptions = [cleaned.strip()]

    return descriptions

In [32]:

def discard_irrelevant_context(test_name,normal_ranges,retrieved_context,report):
    """Ask the chat model to keep only the retrieved passages relevant to a test.

    Args:
        test_name: Name of the lab test.
        normal_ranges: Reference ranges supplied separately from the report,
            or ``None`` when there are none (rendered as "Not provided").
        retrieved_context: Text retrieved from the vector index.
        report: Lab report text.

    Returns:
        The filtered context as a string with ``<think>`` sections removed.
        Returns ``""`` without calling the model when ``retrieved_context`` is
        empty or whitespace, since there is nothing to filter.
    """
    # With nothing retrieved the model could only invent "context".
    if not retrieved_context or not str(retrieved_context).strip():
        return ""

    # Avoid sending the literal text "None" to the model.
    ranges_text = "Not provided" if normal_ranges is None else normal_ranges

    # The range instruction previously said to use the provided ranges in both
    # branches; it now falls back to the report's own ranges when present.
    prompt = (
        "You are an expert medical lab technician. Your task is to extract only the most relevant information from the provided medical context and discard anything unrelated.  \n"
        "\n"
        "### Given Information:\n"
        f"- **Test Type:** {test_name}  \n"
        f"- **Medical Report:** {report}  \n"
        f"- **Normal Ranges:** {ranges_text}  \n"
        f"- **Retrieved Context:** {retrieved_context}  \n"
        "\n"
        "### Instructions:\n"
        "- Read the retrieved context carefully.  \n"
        f"- Identify the information that is directly relevant to the test {test_name} and the provided ranges (only\n"
        " consider the provided ranges if the normal ranges are not present in the test report, otherwise consider the ones in the report).  \n"
        "- Extract only the relevant paragraphs and discard anything that is not directly related.  \n"
        "- Ensure that the extracted content provides useful insights about the given test.  \n"
        "\n"
        "### Output Format:\n"
        f"Return the **filtered context** as clean paragraphs that are relevant to {test_name}. Do not include any unrelated information, just provide the paragraphs nothing else.  \n"
        "\n"
    )

    chain = chat | StrOutputParser()

    response = chain.invoke(prompt)
    response = remove_tags(response)
    return response # final paragraphs

In [33]:
# Sample inputs for an end-to-end run: the test type and the disease the
# patient suspects.
test_name = "CBC"
disease = "dengue"
report = """Complete Blood Count (CBC) Report
Test	Result	Reference Range	Units	Flag
White Blood Cell Count (WBC)	7.5	4.0 - 11.0	x10³/μL	Normal
Red Blood Cell Count (RBC)	3	4.5 - 5.9	x10⁶/μL	Abnormal
Hemoglobin (Hgb)	15.0	13.5 - 17.5	g/dL	Normal
Hematocrit (Hct)	45	40 - 52	%	Normal
Mean Corpuscular Volume (MCV)	88	80 - 100	fL	Normal
Mean Corpuscular Hemoglobin (MCH)	29	27 - 34	pg	Normal
Mean Corpuscular Hemoglobin Concentration (MCHC)	34	32 - 36	g/dL	Normal
Red Cell Distribution Width (RDW)	13.5	11.5 - 14.5	%	Normal
Platelet Count	250	150 - 450	x10³/μL	Normal
Neutrophils	35	40 - 70	%	Abnormal
Lymphocytes	10	20 - 40	%	Abnormal
Monocytes	6	2 - 10	%	Normal
Eosinophils	3	1 - 4	%	Normal
Basophils	1	0 - 2	%	Normal"""
# Step 1: turn the numeric report into textual descriptions to search with.
generated_text = generate_refined_prompt(report,test_name,disease) # list of 3 descriptions

# Step 2: query the vector index once per description, then merge the matches
# into one bullet list without repeated passages.
retrieved_content = retrieve_context(generated_text)
unique_content = get_unique_content_only(retrieved_content)

# Step 3: let the LLM filter the retrieved text; no separate normal ranges are
# passed (None) - the sample report carries its own reference-range column.
context = discard_irrelevant_context(test_name,None,unique_content,report)
print(context)

print("------------------------------")
# Step 4: produce the final interpretation from the report, the descriptions
# and the filtered context.
response = generate_final_output(report,test_name,disease,generated_text,context)
print(response)

The Complete Blood Count (CBC) is a test that provides information about the number and type of cells in the blood. Here are the relevant details extracted from the provided context:

- Red Blood Cell Count (RBC) reflects the number of circulating red blood cells. A decrease in the RBC count indicates anemia, while an elevated RBC count may suggest polycythemia vera or dehydration.

- Hemoglobin (Hgb) is a protein within the cytoplasm of the red blood cells, which plays a role in tissue perfusion. It is the most commonly used marker of anemia.

- Hematocrit (Hct) measures the percentage of red blood cells in the blood. Like hemoglobin, it can be used to diagnose anemia or polycythemia.

- Mean Corpuscular Volume (MCV) measures the average size of the red blood cells. Low MCV indicates microcytic anemia, while high MCV suggests macrocytic anemia.

- Mean Corpuscular Hemoglobin (MCH) measures the average amount of hemoglobin in the red blood cells. Low MCH is indicative of hypochromic an