In [35]:
import dotenv
import os
import pandas as pd
from pinecone import Pinecone
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser


dotenv.load_dotenv(".env")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")


In [36]:


pc = Pinecone(api_key=PINECONE_API_KEY)


In [16]:
# Chunk the data using LangChain
chunk_size = 384

# Create or connect to a Pinecone index
index_name = "test-medbot"
index = pc.Index(index_name)



# FOR FILES/BOOKS

In [21]:

CHUNK_SIZE = 384  # Define chunk size

# Function to extract text from PDF and save to txt file
def save_pdf_text(pdf_path, output_folder):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    file_number = len(os.listdir(output_folder)) + 1
    output_path = os.path.join(output_folder, f"{file_number}.txt")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)
    return output_path

# Function to split text into chunks
def create_text_chunks(text, chunk_size):
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Function to extract headings from text
def extract_headings_and_text(text):
    lines = text.split("\n")
    extracted_data = []
    current_heading = "Unknown"
    buffer = []
    for line in lines:
        if line.isupper() and len(line) > 3:  # Heuristic for headings
            if buffer:
                extracted_data.append((current_heading, " ".join(buffer)))
                buffer = []
            current_heading = line.strip()
        else:
            buffer.append(line.strip())
    if buffer:
        extracted_data.append((current_heading, " ".join(buffer)))
    return extracted_data

# Function to create embeddings
def embeddingCreator(text):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model.encode(text).tolist()



In [18]:

# # Process PDF files and save text
# pdf_folder = "../DATA/Books/New folder"
# txt_folder = "../DATA/TextFiles"
# os.makedirs(txt_folder, exist_ok=True)


# for file in os.listdir(pdf_folder):
#     file_path = os.path.join(pdf_folder, file)
#     if os.path.isfile(file_path) and file.endswith(".pdf"):
#         print(f"Processing PDF: {file}")
#         save_pdf_text(file_path, txt_folder)


In [22]:
# Process saved text files
txt_folder = "../DATA/TextFiles"
data_to_upsert = []
txt_files = sorted(os.listdir(txt_folder), key=lambda x: int(x.split(".")[0]))

for txt_file in txt_files:
    txt_path = os.path.join(txt_folder, txt_file)
    with open(txt_path, "r", encoding="utf-8") as f:
        text_data = f.read()
    
    extracted_data = extract_headings_and_text(text_data)
    text_chunks = create_text_chunks(text_data, CHUNK_SIZE)
    
    for chunk_idx, chunk in enumerate(text_chunks):
        tokens = embeddingCreator(chunk)
        heading = "Unknown"
        for heading_text, content in extracted_data:
            if chunk in content:
                heading = heading_text
                break
        
        vector_id = f"{txt_file}_chunk_{chunk_idx}"
        metadata = {"test_name": heading, "source": txt_file, "url": "NaN"}
        data_to_upsert.append({"id": vector_id, "values": tokens, "metadata": metadata})


In [23]:
len(data_to_upsert)

970

In [24]:
# Batch upload to Pinecone
batch_size = 100
for i in range(0, len(data_to_upsert), batch_size):
    batch = data_to_upsert[i:i + batch_size]
    # Convert all values in the chunk to float
    for item in batch:
        item['values'] = [float(x) for x in item['values']]
    index.upsert(vectors=batch, namespace="ns1")
    print(f"Uploaded batch {i // batch_size + 1} of {len(data_to_upsert) // batch_size + 1}")

print("All data uploaded successfully!")

Uploaded batch 1 of 10
Uploaded batch 2 of 10
Uploaded batch 3 of 10
Uploaded batch 4 of 10
Uploaded batch 5 of 10
Uploaded batch 6 of 10
Uploaded batch 7 of 10
Uploaded batch 8 of 10
Uploaded batch 9 of 10
Uploaded batch 10 of 10
All data uploaded successfully!


# FOR DF 

In [26]:

def upsert_data_to_pinecone(df):
    # Ensure the "tokens" column is in the correct format (list of floats)
    df["tokens"] = df["tokens"].apply(eval)  # Convert string representation of list to actual list
    # Prepare data for upserting into Pinecone
    print("Uploading data to Pinecone...")
    data_to_upsert = []
    for idx, row in df.iterrows():
        metadata = {
            "test_name": row["Test Name"],
            "source": row["Source"],
            "url": row["URL"]
        }
        data_to_upsert.append({"id": str(idx), "values": row["tokens"], "metadata": metadata})

    return data_to_upsert


In [27]:
def upload_In_Batches(data_to_upsert, batch_size=100):
    # Batch upload
    for i in range(0, len(data_to_upsert), batch_size):
        batch = data_to_upsert[i:i + batch_size]
        index.upsert(vectors=batch)
        print(f"Uploaded batch {i // batch_size + 1} of {len(data_to_upsert) // batch_size + 1}")
        
    print("Data upload complete!")

In [28]:


# Load your DataFrame
df = pd.read_csv("../DATA/Embedded_Files/medlinePlus_tokens.csv")  # Replace with your file path
df2 = pd.read_csv("../DATA/Embedded_Files/testing_com_tokens.csv")  # Replace with your file path

In [29]:
data_to_upsert = upsert_data_to_pinecone(df)
data_to_upsert2 = upsert_data_to_pinecone(df2)
print("DATA 1",data_to_upsert[0])
print("DATA 2",data_to_upsert2[0])

Uploading data to Pinecone...
Uploading data to Pinecone...
DATA 1 {'id': '0', 'values': [0.06680531799793243, -0.06175839528441429, 0.007783485110849142, 0.0021615491714328527, 0.06139960139989853, -0.019108224660158157, 0.054005976766347885, 0.11581610143184662, 0.005655866116285324, -0.0014369128039106727, -0.035817358642816544, -0.01720498688519001, 0.04893006384372711, 0.08150792121887207, -0.049228064715862274, -0.0016232702182605863, -0.001231823000125587, -0.014281578361988068, -0.09298525005578995, 0.017924031242728233, -0.03283045068383217, 0.022996917366981506, 0.0019434948917478323, 0.072845958173275, -0.07834290713071823, 0.06060261279344559, -0.01609104312956333, -0.009373136796057224, -0.049127303063869476, 0.017370333895087242, 0.08494198322296143, 0.02621404640376568, 0.0038784390781074762, 0.010923248715698719, 0.0061714225448668, -0.0158137995749712, -0.038665831089019775, 0.05852259695529938, 0.032175470143556595, 0.0005945598240941763, -0.03807175159454346, -0.0202

In [30]:
upload_In_Batches(data_to_upsert)
upload_In_Batches(data_to_upsert2)

Uploaded batch 1 of 4
Uploaded batch 2 of 4
Uploaded batch 3 of 4
Data upload complete!
Uploaded batch 1 of 4
Uploaded batch 2 of 4
Uploaded batch 3 of 4
Uploaded batch 4 of 4
Data upload complete!


# DATA RETRIEVAL AND RESPONSE GENERATION

In [37]:
# Load embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Small and efficient

# Load medical test dataset (Ensure your DataFrame has the required columns)
df = pd.read_csv("../DATA/Scraped_Data/medical_tests_interpretation.csv")  # Replace with your actual dataset file
df = df[:4]
# **Step 1: Indexing the Descriptions into Pinecone**
# def index_data():
#     vectors = []
#     for _, row in df.iterrows():
#         text = row["Description"]
#         vector = embedding_model.encode(text).tolist()
#         print("Vector: ",vector)
#         metadata = {
#             "Test Name": row["Test Name"],
#             "Source": row["Source"],
#             "URL": row["URL"]
#         }
#         vectors.append((str(_), vector, metadata))

#     # Upload to Pinecone
#     index.upsert(vectors)
#     print(f"Indexed {len(vectors)} records successfully!")

# **Step 2: Query Pinecone and Generate Response using DeepSeek**
def retrieve_and_generate(query,type,disease):
    query_vector = embedding_model.encode(query).tolist()

    # Search in Pinecone
    search_results = index.query(vector=query_vector, top_k=5, include_metadata=True)
    print("Search Result: ",search_results)

    # Extract relevant context
    context = "\n".join([f"- {match['metadata']['test_name']}: {match['metadata']['url']}" 
                          for match in search_results["matches"]])
    
    print(context)

    # Construct prompt
    prompt = f"""You are a expert medical lab technician. You have to interpret the medical lab report of the patient. I have provided 
    you the lab report, the test type, the disease which the patient thinks he is suffereing from and some context which may assist you in interpreting the report.
    Interpret the report in layman understandable form in just 2 to 3 lines, not more than that and donot write anything else other than the interpretation.
    In case the Context is not benificial, you can ignore it and answer from your own knowledge.
    :
        
    Context:
    {context}
    Type: {type}
    Diesease: {disease}
    Report: {query}
        
    Answer:
    """
    
    # Call Groq API for response
    chat = ChatGroq(
    api_key = GROQ_API_KEY,
    model_name = "mixtral-8x7b-32768"
    )   
    
    chain = chat | StrOutputParser()
    
    response = chain.invoke(prompt)
    return response

# **Run the indexing function once**
# index_data()
type = "CBC"
disease = "dengue"
query = """Complete Blood Count (CBC) Report
Test	Result	Reference Range	Units	Flag
White Blood Cell Count (WBC)	7.5	4.0 - 11.0	x10³/μL	Normal
Red Blood Cell Count (RBC)	3	4.5 - 5.9	x10⁶/μL	Abnormal
Hemoglobin (Hgb)	15.0	13.5 - 17.5	g/dL	Normal
Hematocrit (Hct)	45	40 - 52	%	Normal
Mean Corpuscular Volume (MCV)	88	80 - 100	fL	Normal
Mean Corpuscular Hemoglobin (MCH)	29	27 - 34	pg	Normal
Mean Corpuscular Hemoglobin Concentration (MCHC)	34	32 - 36	g/dL	Normal
Red Cell Distribution Width (RDW)	13.5	11.5 - 14.5	%	Normal
Platelet Count	250	150 - 450	x10³/μL	Normal
Neutrophils	35	40 - 70	%	Abnormal
Lymphocytes	10	20 - 40	%	Abnormal
Monocytes	6	2 - 10	%	Normal
Eosinophils	3	1 - 4	%	Normal
Basophils	1	0 - 2	%	Normal"""
response = retrieve_and_generate(query,type,disease)
print(response)


Search Result:  {'matches': [{'id': '279',
              'metadata': {'source': 'testing.com',
                           'test_name': 'Mch Test',
                           'url': 'https://www.testing.com/tests/mch-test/'},
              'score': 0.636786342,
              'values': []},
             {'id': '318',
              'metadata': {'source': 'testing.com',
                           'test_name': 'Rdw Test',
                           'url': 'https://www.testing.com/tests/rdw-test/'},
              'score': 0.615108132,
              'values': []},
             {'id': '211',
              'metadata': {'source': 'testing.com',
                           'test_name': 'Mchc Test',
                           'url': 'https://www.testing.com/tests/mchc-test/'},
              'score': 0.613560379,
              'values': []},
             {'id': '104',
              'metadata': {'source': 'testing.com',
                           'test_name': 'Mcv Test',
                           'u