In [8]:
import pandas as pd

# Load dataset
df = pd.read_csv("Final_Augmented_dataset_Diseases_and_Symptoms.csv")

# Convert binary symptom columns into a list of symptom names
df["symptom_text"] = df.drop(columns=["diseases"]).apply(
    lambda x: [col for col in x.index if x[col] == 1], axis=1
)

# Merge symptoms for each unique disease while ensuring no duplicates
df_grouped = df.groupby("diseases")["symptom_text"].apply(
    lambda x: list(set(sum(x, [])))  # Flatten list and remove duplicates
).reset_index()

# Convert list back to comma-separated string
df_grouped["symptoms"] = df_grouped["symptom_text"].apply(lambda x: ", ".join(sorted(x)))

# Drop intermediate column
df_grouped = df_grouped.drop(columns=["symptom_text"])

# Save the processed dataset
df_grouped.to_csv("processed_symptom_disease.csv", index=False)

print(df_grouped.head())  # Preview the new format

                    diseases                                                                                                                                                                                                  symptoms
0  abdominal aortic aneurysm                                                                                                  arm swelling, back pain, burning abdominal pain, palpitations, sharp abdominal pain, shortness of breath
1           abdominal hernia  ache all over, groin mass, irregular belly button, lower abdominal pain, regurgitation, regurgitation.1, sharp abdominal pain, swollen abdomen, symptoms of the scrotum and testes, upper abdominal pain
2            abscess of nose                                                                                                         coryza, cough, fever, irritable infant, nasal congestion, sinus congestion, sore throat, vomiting
3        abscess of the lung                                                

In [2]:
import pandas as pd

# Load the dataset (replace 'your_dataset.csv' with the actual file name)
df = pd.read_csv("dataset.csv", header=None)

# Rename the first column to 'disease' and the rest as 'symptoms'
df.rename(columns={0: "disease"}, inplace=True)

# Melt the dataframe to convert symptom columns into rows
df = df.melt(id_vars=["disease"], value_name="symptom").drop(columns=["variable"])

# Drop NaN values in the symptom column
df = df.dropna()

# Group by disease and aggregate symptoms as a unique list
df_grouped = df.groupby("disease")["symptom"].apply(lambda x: ", ".join(sorted(set(x)))).reset_index()

# Save to a new CSV file
df_grouped.to_csv("disease_symptoms.csv", index=False)

print("Conversion complete! Check 'disease_symptoms.csv'.")


Conversion complete! Check 'disease_symptoms.csv'.


In [4]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Load processed dataset
df = pd.read_csv("merged_dataset.csv")

# Load a pre-trained sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")  # Fast and accurate

# Encode symptom descriptions into dense vectors
df["embedding"] = df["symptoms"].apply(lambda x: model.encode(x))

# Save embeddings (optional, to avoid re-encoding every time)
np.save("symptom_embeddings.npy", np.vstack(df["embedding"].values))
df.to_csv("processed_with_embeddings.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def merge_disease_datasets(df1, df2):
    # Convert disease names to lowercase for case-insensitive merging
    df1['diseases'] = df1['diseases'].str.lower()
    df2['diseases'] = df2['diseases'].str.lower()
    
    # Combine the two datasets
    combined = pd.concat([df1, df2], ignore_index=True)
    
    # Merge symptoms by disease
    merged = (
        combined.groupby('diseases')['symptoms']
        .apply(lambda x: ', '.join(sorted(set(', '.join(x).split(', ')))))
        .reset_index()
    )
    
    return merged

df1 = pd.read_csv("disease_symptoms.csv")
df2 = pd.read_csv("processed_symptom_disease.csv")

merged_df = merge_disease_datasets(df1, df2)
merged_df.to_csv("merged_dataset.csv", index=False)

In [5]:
import faiss
import numpy as np

# Load symptom embeddings
embeddings = np.load("symptom_embeddings.npy")

# Create a FAISS index (L2 distance)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Save FAISS index
faiss.write_index(index, "symptom_faiss.index")

In [6]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss

model = SentenceTransformer("all-MiniLM-L6-v2")  # Fast and accurate

df = pd.read_csv("merged_dataset.csv")

def retrieve_diseases(user_input, model, df, index, top_k=10):
    # Encode user symptoms
    user_embedding = model.encode(user_input).reshape(1, -1)

    # Search FAISS index
    distances, indices = index.search(user_embedding, top_k)

    # Get top-matching diseases
    results = df.iloc[indices[0]][["diseases", "symptoms"]]
    return results

# Load FAISS index
index = faiss.read_index("symptom_faiss.index")

# Example: User input symptoms
user_symptoms = "tiredness, fever, chills, sore throat"
matching_diseases = retrieve_diseases(user_symptoms, model, df, index)

print(matching_diseases)


                     diseases  \
438                  leukemia   
577               pharyngitis   
700              strep throat   
51            aplastic anemia   
742                tracheitis   
670                    sepsis   
260                       flu   
5      abscess of the pharynx   
467                meningitis   
794  white blood cell disease   

                                              symptoms  
438  fatigue, fever, mouth pain, muscle stiffness o...  
577  cough, fever, hoarse voice, sore throat, wheezing  
700  ache all over, chills, cough, decreased appeti...  
51   fatigue, fever, lack of growth, nausea, slurri...  
742  cough, fever, hoarse voice, nasal congestion, ...  
670  chills, cough, decreased appetite, difficulty ...  
260  ache all over, chills, coryza, cough, diarrhea...  
5    cough, difficulty in swallowing, fever, headac...  
467  ache all over, cough, fever, headache, nausea,...  
794  chills, cough, diarrhea, fatigue, fever, heart...  


In [1]:
from langgraph.graph import START, StateGraph, END
from pydantic import BaseModel, Field
from typing import Optional, List, Dict, Sequence, Any, Literal
from typing_extensions import TypedDict
from langchain_openai import ChatOpenAI
from langgraph.types import Command
from enum import Enum

from sentence_transformers import SentenceTransformer
import pandas as pd
import faiss
from langchain_core.prompts import ChatPromptTemplate

import sys
import os, getpass
from dotenv import load_dotenv
load_dotenv()

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "langchain-academy"
_set_env("OPENAI_API_KEY")
_set_env("LANGCHAIN_API_KEY")

model = SentenceTransformer("all-MiniLM-L6-v2")  # Fast and accurate

df = pd.read_csv("processed_symptom_disease.csv")

index = faiss.read_index("symptom_faiss.index")

def retrieve_diseases(user_input, model, df, index, top_k=10):
    # Encode user symptoms
    user_embedding = model.encode(user_input).reshape(1, -1)

    # Search FAISS index
    distances, indices = index.search(user_embedding, top_k)

    # Get top-matching diseases
    results = df.iloc[indices[0]][["diseases", "symptoms"]]
    return results

class Input(TypedDict):
    user_query: str
    symptoms: Optional[List[str]] = None
    final_answer: Optional[str] = None
    diseases: Optional[str] = None

class Source(str, Enum):
    Medical_Query = "Medical Query"
    Generic = "Generic"

base_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def supervisor(input : Input)-> Command[Literal["Analysis", "Help Desk"]]:
    class LLMOutput(TypedDict):
        category: Source
    system_msg = """You are a supervisor routing user query. You have to analyze the provided user query and decide where to route the user query by deciding the category of the query, keeping the following instructions in mind:
                 1. If the user query is requesting for medical assistance or diagnosis related to their symptoms, then assign the category as Medical Query.
                 2. If the user query is of any type other than specified above, then assign the category as Generic.
                 Return as output the category of the user query, which is one of [Medical Query, Generic]
                 """
    messages = [
        ("system", system_msg),
        ("user", input["user_query"])
    ]
    response = base_model.with_structured_output(LLMOutput).invoke(messages)
    print(response)
    if response["category"] == "Medical Query":
        return Command(goto="Analysis", update={
        "user_query":  input["user_query"]
       })
    return Command(goto="Help Desk", update={
        "user_query":  input["user_query"]
    })

def help_desk(input : Input):
    system_msg = """You are an expert medical examiner. You have been provided a generic user query. You have to return a short and brief response explaining to the user that their query can't be answered, and that they should inquire about medical diagnosis instead"""
    messages = [
        ("system", system_msg),
        ("user", input["user_query"])
    ]
    response  = base_model.invoke(ChatPromptTemplate.from_messages(messages).invoke({}))
    return {
        "final_answer": response.content
    }

def analysis(input : Input):
    class LLMOutput(TypedDict):
        symptoms = List[str]
    system_msg = """You are an expert medical examiner.
    You have been provided a user query for medical diagnosis. You have to analyze the query and 
    determine the primary symptoms being experienced by the user.
    Return a list of symptoms being experienced by the user."""
    messages = [
        ("system", system_msg),
        ("user", input["user_query"])
    ]
    response = base_model.with_structured_output(LLMOutput).invoke(messages)
    print(response['symptoms'])
    return{
        "symptoms": response['symptoms']
    }

def diagnosis(input : Input):
    symptoms = ",".join(input["symptoms"])
    diseases = retrieve_diseases(symptoms, model, df, index)
    system_msg = """You are an expert medical examiner.
    You have been provided a user query for medical diagnosis, as well as a list of diseases which might be likely affecting the patient according to their displayed symptoms.
    User query: {query}
    Diseases: {diseases}
    Aanalyze the user query and diseases and provide a detailed possible diagnosis to the user"""
    messages = [
        ("system", system_msg)
    ]
    response  = base_model.invoke(ChatPromptTemplate.from_messages(messages).invoke({
                "query": input["user_query"],
                "diseases": diseases
    }))
    return{
        "diseases": diseases,
        "final_answer": response.content
    }

builder = StateGraph(Input)

builder.add_node("Analysis", analysis)
builder.add_node("Diagnosis", diagnosis)
builder.add_node("Help Desk", help_desk)
builder.add_node("Supervisor", supervisor)

builder.add_edge(START, "Supervisor")
builder.add_edge("Help Desk", END)
builder.add_edge("Analysis", "Diagnosis")
builder.add_edge("Diagnosis", END)

MedBot = builder.compile()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
pu = MedBot.invoke(
    {
        "user_query": "Feeling really tired lately, and I have a fever with chills. Also, my throat is sore."
    }
)

{'category': 'Medical Query'}
['tiredness', 'fever', 'chills', 'sore throat']


In [7]:
print(pu["final_answer"])

Based on the symptoms you've described—feeling really tired, having a fever with chills, and a sore throat—there are several potential diagnoses to consider. Let's analyze the symptoms in relation to the diseases listed:

1. **Leukemia**: This condition can cause fatigue and fever, but it typically presents with additional symptoms such as unexplained bruising, frequent infections, or bleeding. While your symptoms could align with leukemia, they are not specific enough to suggest this diagnosis without further investigation.

2. **Pharyngitis**: This is an inflammation of the throat that can cause a sore throat, fever, and sometimes cough. It is a common cause of sore throat and could explain your symptoms.

3. **Strep Throat**: This bacterial infection is characterized by a severe sore throat, fever, chills, and often body aches. It is a strong contender given your sore throat and fever.

4. **Aplastic Anemia**: This condition can cause fatigue and fever, but it usually presents with 

In [5]:
diseases = pu.get("diseases", None)


None
