In [7]:
import ast
import os

import dotenv
import fitz
import pandas as pd
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# Default number of whitespace-separated words per chunk (default of create_text_chunks).
CHUNK_SIZE = 128

# Load variables from the local ".env" file before reading them below.
dotenv.load_dotenv(".env")
# API credentials are read from the environment; os.getenv returns None when a variable is unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")


In [8]:
# Pinecone client created with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)


In [9]:
# Handle to the Pinecone index; the upload and retrieval functions use it through the global `index`.
index_name = "medicalbot-vdb"
index = pc.Index(index_name)



# FOR FILES/BOOKS

In [4]:

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string containing each page's text in page order,
        separated by ``"\\n"``.
    """
    # The context manager closes the document when the block exits, even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

# Function to split text into chunks
def create_text_chunks(text, chunk_size=CHUNK_SIZE):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (whitespace-delimited); each
    chunk is those words re-joined with single spaces. The final chunk may be
    shorter than ``chunk_size``.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

# Function to extract headings and map content to them
def extract_headings_and_text(text):
    """Group the lines of ``text`` under the all-caps heading that precedes them.

    A line counts as a heading when, after stripping surrounding whitespace,
    it is upper-case and longer than three characters. All other lines
    (including blank ones) are accumulated as the body of the current heading.

    Args:
        text: Raw document text with ``"\\n"`` line separators.

    Returns:
        A list of ``(heading, content)`` tuples in document order. ``heading``
        is ``None`` for text that appears before the first heading; ``content``
        is the stripped body lines joined with single spaces. A heading that is
        immediately followed by another heading produces no tuple.
    """
    sections = []
    current_heading = None
    pending = []

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        # Heuristic for headings. The length is measured on the stripped text so
        # that padding cannot push a short token such as "AB  " over the
        # threshold; the heading itself is stored stripped anyway.
        if line.isupper() and len(line) > 3:
            if pending:
                sections.append((current_heading, " ".join(pending)))
                pending = []
            current_heading = line
        else:
            pending.append(line)

    # Flush the body collected after the last heading.
    if pending:
        sections.append((current_heading, " ".join(pending)))

    return sections

# Function to create a structured DataFrame
def create_dataframe_from_text_files(txt_folder):
    """Chunk every numbered text file in ``txt_folder`` into a structured DataFrame.

    Files are processed in ascending order of the integer before the first
    ``"."`` in their name (``"2.txt"`` before ``"10.txt"``). Directory entries
    that are not regular files or that have no integer prefix (for example
    ``.DS_Store``) are skipped instead of aborting the whole run.

    Each file is split into word chunks with ``create_text_chunks``; a chunk is
    labelled with the first heading (from ``extract_headings_and_text``) whose
    section text contains it.

    Args:
        txt_folder: Directory containing the text files.

    Returns:
        A DataFrame with the columns ``"Test Name"``, ``"Description"``,
        ``"Source"`` and ``"URL"`` (one row per chunk). The columns are present
        even when no rows were produced.
    """
    columns = ["Test Name", "Description", "Source", "URL"]

    numbered_files = []
    for file_name in os.listdir(txt_folder):
        try:
            order = int(file_name.split(".")[0])
        except ValueError:
            # No integer prefix: not one of the numbered text files.
            continue
        if os.path.isfile(os.path.join(txt_folder, file_name)):
            numbered_files.append((order, file_name))
    # Sort on the number only so that ties keep their directory-listing order.
    numbered_files.sort(key=lambda entry: entry[0])

    rows = []
    for _, file_name in numbered_files:
        with open(os.path.join(txt_folder, file_name), "r", encoding="utf-8") as handle:
            text_data = handle.read()

        # Chunks are single-space joined words, whereas section text can contain
        # runs of spaces (blank lines, inner padding). Collapse whitespace once
        # per file so the substring test below compares like with like.
        sections = [
            (heading, " ".join(content.split()))
            for heading, content in extract_headings_and_text(text_data)
        ]

        for chunk in create_text_chunks(text_data):
            test_name, source = "No_testName", "No_Source"

            for heading, content in sections:
                if chunk in content:
                    # Text that precedes the first heading has heading None;
                    # keep the sentinels rather than storing a null label.
                    if heading is not None:
                        test_name = heading
                        source = heading
                    break

            rows.append({
                "Test Name": test_name,
                "Description": chunk,
                "Source": source,
                "URL": "No_URl"
            })

    return pd.DataFrame(rows, columns=columns)

# Function to save DataFrame to a CSV file
def save_dataframe(df, output_path):
    """Write ``df`` to ``output_path`` as a UTF-8 CSV file without the index column."""
    csv_options = {"index": False, "encoding": "utf-8"}
    df.to_csv(output_path, **csv_options)

# Main execution
# Build the chunk table from the text files in txt_folder and write it to structured_data.csv
# (relative to the notebook's working directory).
txt_folder = "../DATA/TextFiles"
df = create_dataframe_from_text_files(txt_folder)
save_dataframe(df, "structured_data.csv")

print("Data processing complete. CSV file saved.")


Data processing complete. CSV file saved.


# FOR DATAFRAMES: UPLOADING PRECOMPUTED EMBEDDINGS TO PINECONE

In [25]:

def _parse_embedding(value):
    """Return one embedding as a list, parsing it first if it is still CSV text."""
    if isinstance(value, str):
        # ast.literal_eval only accepts Python literals, so unlike eval() it
        # cannot execute code embedded in a CSV cell.
        value = ast.literal_eval(value)
    # list() raises TypeError for non-iterables such as NaN, so a missing
    # embedding still fails loudly instead of being queued for upload.
    return list(value)


def upsert_data_to_pinecone(df, id_prefix=""):
    """Convert an embeddings DataFrame into a list of Pinecone upsert records.

    Despite its name this function performs no network call; it only builds the
    payload that ``upload_In_Batches`` sends.

    The input frame is not modified, so the call can be repeated on the same
    frame (the ``"tokens"`` column may hold either the string form read from
    CSV or already-parsed lists).

    Args:
        df: DataFrame with the columns ``"tokens"`` (embedding, as a list or its
            string representation), ``"Test Name"``, ``"Source"``, ``"URL"``
            and ``"text"``.
        id_prefix: Optional string prepended to every record id. Ids are the
            row labels of ``df``, so frames loaded separately all start at
            ``"0"``; give each frame its own prefix (e.g. ``"medline-"``) when
            their records must coexist in one index.

    Returns:
        A list of dicts, one per row, each with the keys ``"id"``, ``"values"``
        and ``"metadata"`` (``test_name``, ``source``, ``url``, ``text``).

    Raises:
        TypeError: If a ``"tokens"`` cell is neither a string nor iterable.
        ValueError / SyntaxError: If a ``"tokens"`` string is not a valid
            Python literal.
    """
    # Parse into a separate Series instead of assigning back into the caller's frame.
    embeddings = df["tokens"].apply(_parse_embedding)
    print("1: ", len(df))
    # fillna returns a new frame; missing metadata becomes an empty string.
    filled = df.fillna("")
    print("1: ", len(filled))
    print("Uploading data to Pinecone...")

    records = []
    for (idx, row), values in zip(filled.iterrows(), embeddings):
        records.append({
            "id": f"{id_prefix}{idx}",
            "values": values,
            "metadata": {
                "test_name": row["Test Name"],
                "source": row["Source"],
                "url": row["URL"],
                "text": row["text"],
            },
        })

    return records


In [26]:
def upload_In_Batches(data_to_upsert, batch_size=100):
    """Upsert records into the global Pinecone ``index`` in fixed-size batches.

    Args:
        data_to_upsert: List of upsert records (as built by
            ``upsert_data_to_pinecone``).
        batch_size: Maximum number of records sent per ``index.upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Ceiling division: `len // batch_size + 1` over-counts by one whenever the
    # record count is an exact multiple of batch_size (and reports 1 for no data).
    total_batches = (len(data_to_upsert) + batch_size - 1) // batch_size

    # Batch upload
    for batch_number, start in enumerate(range(0, len(data_to_upsert), batch_size), start=1):
        batch = data_to_upsert[start:start + batch_size]
        index.upsert(vectors=batch)
        print(f"Uploaded batch {batch_number} of {total_batches}")

    print("Data upload complete!")

In [22]:
# Load your DataFrame
# Each CSV is expected to provide the columns read by upsert_data_to_pinecone:
# "tokens", "Test Name", "Source", "URL" and "text".
# NOTE(review): `df` rebinds the name used earlier for the chunk table written to structured_data.csv.
df = pd.read_csv("../DATA/Embedded_Files/medlinePlus_tokens.csv")  # Replace with your file path
df2 = pd.read_csv("../DATA/Embedded_Files/testing_com_tokens.csv")  # Replace with your file path
df3 = pd.read_csv("../DATA/Embedded_Files/files_tokens.csv")  # Replace with your file path

In [23]:
# Build the upsert payload for df3 and preview its first record (this prints the full embedding).
# NOTE(review): record ids are the DataFrame row labels, so this payload starts at id '0' just like
# the payloads built from df and df2; confirm the three uploads are not meant to coexist in one index.
data_to_upsert3 = upsert_data_to_pinecone(df3)
print("DATA 1",data_to_upsert3[0])

Uploading data to Pinecone...
DATA 1 {'id': '0', 'values': [-0.02836349792778492, -0.03139110282063484, -0.09892386943101883, -0.007072347681969404, -0.06738731265068054, -0.058915454894304276, 0.0014865617267787457, 0.061360228806734085, 0.07193094491958618, -0.024456391111016273, -0.035403572022914886, 0.020397065207362175, -0.02396518364548683, -0.037037044763565063, -0.11722361296415329, -0.044816453009843826, 0.009107595309615135, -0.045026808977127075, -0.0029555754736065865, 0.055858660489320755, -0.020438693463802338, 0.04453589394688606, -0.04997698962688446, 0.014400175772607327, -0.028468910604715347, -0.0711422860622406, 0.055036257952451706, 0.04217354953289032, -0.017407488077878952, -0.01515008695423603, -0.0021145245991647243, -0.029250796884298325, 0.05639529228210449, -0.02793634869158268, 0.01492819469422102, 0.021664081141352654, 0.02408755011856556, 0.013438326306641102, 0.003115665866062045, 0.015332106500864029, 0.03658853843808174, -0.05782416835427284, -0.05288

In [24]:
# Send the df3 payload to the Pinecone index using the default batch size of 100.
upload_In_Batches(data_to_upsert3)

Uploaded batch 1 of 30
Uploaded batch 2 of 30
Uploaded batch 3 of 30
Uploaded batch 4 of 30
Uploaded batch 5 of 30
Uploaded batch 6 of 30
Uploaded batch 7 of 30
Uploaded batch 8 of 30
Uploaded batch 9 of 30
Uploaded batch 10 of 30
Uploaded batch 11 of 30
Uploaded batch 12 of 30
Uploaded batch 13 of 30
Uploaded batch 14 of 30
Uploaded batch 15 of 30
Uploaded batch 16 of 30
Uploaded batch 17 of 30
Uploaded batch 18 of 30
Uploaded batch 19 of 30
Uploaded batch 20 of 30
Uploaded batch 21 of 30
Uploaded batch 22 of 30
Uploaded batch 23 of 30
Uploaded batch 24 of 30
Uploaded batch 25 of 30
Uploaded batch 26 of 30
Uploaded batch 27 of 30
Uploaded batch 28 of 30
Uploaded batch 29 of 30
Uploaded batch 30 of 30
Data upload complete!


In [27]:
# Build the upsert payloads for df and df2, then preview the first record of each
# (each preview prints the full embedding vector).
data_to_upsert = upsert_data_to_pinecone(df)
data_to_upsert2 = upsert_data_to_pinecone(df2)
print("DATA 1",data_to_upsert[0])
print("DATA 2",data_to_upsert2[0])

1:  2803
1:  2803
Uploading data to Pinecone...
1:  3449
1:  3449
Uploading data to Pinecone...
DATA 1 {'id': '0', 'values': [0.044602010399103165, -0.029450392350554466, -0.02910509705543518, 0.003605336882174015, 0.05494566261768341, -0.013170881196856499, 0.07718700915575027, 0.1311064511537552, -0.008360992185771465, 0.0014408610295504332, -0.00457599014043808, -0.036209359765052795, 0.05374126508831978, 0.04322823882102966, -0.09087313711643219, -0.03279377892613411, -0.0017876591300591826, -0.0343351736664772, -0.07934784144163132, 0.0035242594312876463, -0.0078086815774440765, 0.039435118436813354, -0.01924910582602024, 0.06796807050704956, -0.07220721244812012, 0.05634573847055435, -0.019596802070736885, -0.003030163934454322, -0.017355451360344887, 0.03209801763296127, 0.09978058934211731, 0.043821483850479126, 0.034802913665771484, 0.021847618743777275, -0.002018026076257229, -0.012731552124023438, -0.0409465990960598, 0.0585450641810894, 0.023514684289693832, -0.012673799879

In [28]:
# Upload both payloads to the same index, one after the other.
# NOTE(review): both payloads use ids that start at '0' (row labels). If upserting an existing id
# replaces the stored vector (TODO confirm against the Pinecone docs), the second call overwrites
# records written by the first, and by the earlier df3 upload.
upload_In_Batches(data_to_upsert)
upload_In_Batches(data_to_upsert2)

Uploaded batch 1 of 29
Uploaded batch 2 of 29
Uploaded batch 3 of 29
Uploaded batch 4 of 29
Uploaded batch 5 of 29
Uploaded batch 6 of 29
Uploaded batch 7 of 29
Uploaded batch 8 of 29
Uploaded batch 9 of 29
Uploaded batch 10 of 29
Uploaded batch 11 of 29
Uploaded batch 12 of 29
Uploaded batch 13 of 29
Uploaded batch 14 of 29
Uploaded batch 15 of 29
Uploaded batch 16 of 29
Uploaded batch 17 of 29
Uploaded batch 18 of 29
Uploaded batch 19 of 29
Uploaded batch 20 of 29
Uploaded batch 21 of 29
Uploaded batch 22 of 29
Uploaded batch 23 of 29
Uploaded batch 24 of 29
Uploaded batch 25 of 29
Uploaded batch 26 of 29
Uploaded batch 27 of 29
Uploaded batch 28 of 29
Uploaded batch 29 of 29
Data upload complete!
Uploaded batch 1 of 35
Uploaded batch 2 of 35
Uploaded batch 3 of 35
Uploaded batch 4 of 35
Uploaded batch 5 of 35
Uploaded batch 6 of 35
Uploaded batch 7 of 35
Uploaded batch 8 of 35
Uploaded batch 9 of 35
Uploaded batch 10 of 35
Uploaded batch 11 of 35
Uploaded batch 12 of 35
Uploaded bat

# DATA RETRIEVAL AND RESPONSE GENERATION

In [35]:
# Load embedding model used to embed retrieval queries in retrieve_and_generate.
# NOTE(review): presumably the stored "tokens" vectors were produced with this same model; confirm,
# since query and index embeddings must come from the same model to be comparable.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Small and efficient

def retrieve_and_generate(report,type,disease,generated_text, top_k=5):
    """Retrieve context from Pinecone and ask the LLM to interpret a lab report.

    Args:
        report: The lab report text inserted into the prompt.
        type: The test type (e.g. ``"CBC"``) inserted into the prompt.
        disease: The disease the patient suspects, inserted into the prompt.
        generated_text: Textual summary of the report; it is embedded to query
            the index and is also inserted into the prompt.
        top_k: Number of nearest matches to retrieve as context (default 5).

    Returns:
        A ``(response, context)`` tuple: the model's answer and the
        newline-joined ``"- <test_name>: <text>"`` lines built from the matches.
    """
    query_vector = embedding_model.encode(generated_text).tolist()

    # Search in Pinecone. Only the metadata is read below, so the stored
    # embedding values are not requested.
    search_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
    )

    # Extract relevant context
    context = "\n".join(
        f"- {match['metadata']['test_name']}: {match['metadata']['text']}"
        for match in search_results["matches"]
    )

    # Construct prompt
    prompt = f"""You are a expert medical lab technician. You have to interpret the medical lab report of the patient. I have provided 
    you the lab report, the test type, the disease which the patient thinks he is suffereing from and some context which may assist you in interpreting the report.
    Interpret the report in layman understandable form in just 2 lines, not more than that and donot write anything else other than the interpretation.
    In case the Context is not benificial, you can ignore it and answer from your own knowledge.
    :
        
    Context: {context}
    Type: {type}
    Disease: {disease}
    Report: {report}
    Random Context (it can be wrong): {generated_text}
    Answer:
    """

    # Call Groq API for response
    chat = ChatGroq(
        api_key=GROQ_API_KEY,
        model_name="llama-3.3-70b-versatile",
    )

    # Pipe the chat model into the output parser and run the prompt through it.
    chain = chat | StrOutputParser()

    response = chain.invoke(prompt)
    return response,context


In [36]:

def retrieve_and_generate_prompt(query,type,disease):
    """Ask the LLM for a short textual summary of a lab report.

    The prompt is filled with the test type, the suspected disease and the raw
    report (``query``); the model's reply is returned as-is. The caller uses
    that reply as the retrieval query for the vector index.
    """
    # Groq-hosted chat model piped into the output parser.
    llm = ChatGroq(api_key=GROQ_API_KEY, model_name="llama-3.3-70b-versatile")
    pipeline = llm | StrOutputParser()

    # Instruction text sent to the model.
    instructions = f"""You are an expert medical lab technician. I have provided the test type, the suspected disease (if mentioned by the patient), and the lab report. Since the lab report primarily consists of numerical values, retrieving relevant context from my RAG system is challenging. Your task is to generate a concise 2-3 line textual summary that best represents this report, incorporating key abnormalities or notable findings. This summary will be used to fetch the most relevant medical context from my RAG, which, along with the report, will help the LLM provide an accurate interpretation.
    For more context, the data i have scrapped is from Testing.com, medlinplus.com and some books.
    Don't write anything else, other than the relavant interpretation.
    :
    
    Type: {type}
    Diesease: {disease}
    Report: {query}
        
    Answer:
    """

    return pipeline.invoke(instructions)

In [37]:
# Demo inputs for the two-step pipeline run at the end of this cell.
# NOTE(review): the global name `type` shadows Python's built-in type() for the rest of the session.
type = "CBC"
disease = "dengue"
query = """Complete Blood Count (CBC) Report
Test	Result	Reference Range	Units	Flag
White Blood Cell Count (WBC)	7.5	4.0 - 11.0	x10³/μL	Normal
Red Blood Cell Count (RBC)	3	4.5 - 5.9	x10⁶/μL	Abnormal
Hemoglobin (Hgb)	15.0	13.5 - 17.5	g/dL	Normal
Hematocrit (Hct)	45	40 - 52	%	Normal
Mean Corpuscular Volume (MCV)	88	80 - 100	fL	Normal
Mean Corpuscular Hemoglobin (MCH)	29	27 - 34	pg	Normal
Mean Corpuscular Hemoglobin Concentration (MCHC)	34	32 - 36	g/dL	Normal
Red Cell Distribution Width (RDW)	13.5	11.5 - 14.5	%	Normal
Platelet Count	250	150 - 450	x10³/μL	Normal
Neutrophils	35	40 - 70	%	Abnormal
Lymphocytes	10	20 - 40	%	Abnormal
Monocytes	6	2 - 10	%	Normal
Eosinophils	3	1 - 4	%	Normal
Basophils	1	0 - 2	%	Normal"""
# Step 1: have the LLM turn the numeric report into a short text summary.
generated_text = retrieve_and_generate_prompt(query,type,disease)
print(generated_text)
# Step 2: use that summary to retrieve context from the index and get the final interpretation.
response, context = retrieve_and_generate(query,type,disease,generated_text)
print(context)
print(response)

Low Red Blood Cell Count and abnormal neutrophil and lymphocyte percentages are noted in this CBC report, which may be relevant in the context of suspected dengue disease. The patient's RBC count is below the reference range, indicating anemia or possible blood loss. Neutrophil and lymphocyte percentages are also outside the normal range, suggesting an abnormal immune response.
- Platelet Count: Test Quest Complete Blood Count Price 29 Type In person Sample Blood Tests for Red blood cells hemoglobin hematocrit mean corpuscular volume mean corpuscular hemoglobin mean corpuscular hemoglobin concentration red cell distribution width platelets mean platelet volume white blood cells neutrophils lymphocytes monocytes eosinophils basophils Results timeline A few business days Quest s Complete Blood Count includes the number of platelets per microliter in your blood sample as well as a mean platelet volume measuring their average size this is related to your physical performance blood clotting