In [1]:
import pandas as pd
import pickle
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the data from the pickled file
file_path = '/content/drive/My Drive/synthetic_dataset/goal_set.p'
with open(file_path, 'rb') as file:
    consultation_data = pickle.load(file)

# Extract 'train' data
train_data = consultation_data['train']

# Lists to hold disease tags and symptoms
disease_tags = []
symptoms = []

# Extract disease tag and symptoms from each consultation in the 'train' data
for item in train_data:
    # Extract disease tag
    disease_tags.append(item['disease_tag'])

    # Extract symptoms
    explicit_symptoms = list(item['goal']['explicit_inform_slots'].keys())
    implicit_symptoms = list(item['goal']['implicit_inform_slots'].keys())
    all_symptoms = explicit_symptoms + implicit_symptoms
    symptoms.append(all_symptoms)

# Create a DataFrame
df = pd.DataFrame({
    'Disease Tag': disease_tags,
    'Symptoms': symptoms
})

# Dictionary to hold symptoms and connected diseases
symptom_to_diseases = {}
symptom_counts = {}

# Populate the symptom to diseases mapping and count symptoms
for index, row in df.iterrows():
    disease_tag = row['Disease Tag']
    all_symptoms = row['Symptoms']

    for symptom in all_symptoms:
        if symptom not in symptom_to_diseases:
            symptom_to_diseases[symptom] = set()
            symptom_counts[symptom] = 0
        symptom_to_diseases[symptom].add(disease_tag)
        symptom_counts[symptom] += 1

# Convert the dictionary to a DataFrame
symptom_to_diseases_df = pd.DataFrame([
    {'Symptom': symptom, 'Connected Diseases': list(diseases), 'Count': symptom_counts[symptom]}
    for symptom, diseases in symptom_to_diseases.items()
])

# Sort the DataFrame by the frequency of symptoms in ascending order
symptom_to_diseases_df = symptom_to_diseases_df.sort_values(by='Count', ascending=True)

# Display the DataFrame
print(symptom_to_diseases_df)


Mounted at /content/drive
                              Symptom  \
263                      Feeling cold   
265                      Pus in urine   
261   Hand or finger cramps or spasms   
244       Back stiffness or tightness   
228                     Pus in sputum   
..                                ...   
62   Depressive or psychotic symptoms   
1                   Diminished vision   
5                           Back pain   
3                         Pain in eye   
21                          Skin rash   

                                    Connected Diseases  Count  
263                           [Chronic kidney disease]      1  
265                            [Dissociative disorder]      2  
261                   [Diabetic peripheral neuropathy]      3  
244                                [Chronic back pain]      3  
228                          [Endometrial hyperplasia]      3  
..                                                 ...    ...  
62   [Eating disorder, Anxiety, F

In [2]:
df


Unnamed: 0,Disease Tag,Symptoms
0,Central retinal artery or vein occlusion,"[Spots or clouds in vision, Diminished vision,..."
1,Degenerative disc disease,"[Shoulder pain, Back pain, Low back pain, Neck..."
2,Diabetic retinopathy,[Foreign body sensation in eye]
3,Chronic back pain,"[Low back pain, Back pain, Side pain]"
4,Air embolism,"[Wrist pain, Pain in eye, Shoulder cramps or s..."
...,...,...
23995,Concussion,"[Neck pain, Headache, Nausea]"
23996,Cushing syndrome,[Weight gain]
23997,Fibromyalgia,"[Low back pain, Back pain, Ache all over, Shou..."
23998,Dengue fever,"[Fever, Sore throat, Wrist pain, Pain during p..."


In [3]:
# Iterate over each row in the DataFrame
for index, row in symptom_to_diseases_df.iterrows():
    symptom = row['Symptom']
    connected_diseases = row['Connected Diseases']

    # Print the symptom
    print(f"Symptom: {symptom}")

    # Print each associated disease
    print("Associated diseases:")
    for disease in connected_diseases:
        print("- " + disease)

    print()  # Print a blank line for separation


Symptom: Feeling cold
Associated diseases:
- Chronic kidney disease

Symptom: Pus in urine
Associated diseases:
- Dissociative disorder

Symptom: Hand or finger cramps or spasms
Associated diseases:
- Diabetic peripheral neuropathy

Symptom: Back stiffness or tightness
Associated diseases:
- Chronic back pain

Symptom: Pus in sputum
Associated diseases:
- Endometrial hyperplasia

Symptom: Incontinence of stool
Associated diseases:
- Decubitus ulcer

Symptom: Itching of scrotum
Associated diseases:
- Cushing syndrome

Symptom: Back weakness
Associated diseases:
- Endometrial hyperplasia
- Dissociative disorder

Symptom: Frequent menstruation
Associated diseases:
- Fibrocystic breast disease

Symptom: Nose deformity
Associated diseases:
- Acne

Symptom: Abnormal size or shape of ear
Associated diseases:
- Endometrial hyperplasia

Symptom: Scanty menstrual flow
Associated diseases:
- Female infertility of unknown cause

Symptom: Disturbance of smell or taste
Associated diseases:
- Decubit

# Print symptoms with numbers for user selection
print("Please select the symptom you have by entering its corresponding number:")
for i, symptom in enumerate(matching_symptoms, start=1):
    print(f"{i}. {symptom}")

# Ask the user to select a symptom by its corresponding number
selected_index = input("Enter the number corresponding to the symptom you have: ")

# Validate the user input
try:
    selected_index = int(selected_index)
    if 1 <= selected_index <= len(matching_symptoms):
        selected_symptom = matching_symptoms[selected_index - 1]
        print(f"You selected: {selected_symptom}")

        # Display associated diseases for the selected symptom
        connected_diseases = symptom_to_diseases_df.loc[symptom_to_diseases_df['Symptom'] == selected_symptom, 'Connected Diseases'].iloc[0]
        if connected_diseases:
            print(f"The symptom '{selected_symptom}' is associated with the following diseases:")
            for disease in connected_diseases:
                print("  - " + disease)

                # Find symptoms associated with the current disease
                symptoms_for_disease = symptom_to_diseases_df[symptom_to_diseases_df['Connected Diseases'].apply(lambda x: disease in x)]['Symptom']
                # Display associated symptoms for the current disease
                if not symptoms_for_disease.empty:
                    print("    Associated symptoms:")
                    for symptom in symptoms_for_disease:
                        print("    - " + symptom)
                else:
                    print("    No symptoms found associated with this disease.")
        else:
            print(f"No diseases are associated with the symptom '{selected_symptom}'.")
    else:
        print("Invalid input. Please enter a valid number.")
except ValueError:
    print("Invalid input. Please enter a valid number.")


In [4]:
# Initialize a dictionary to store the count of each associated symptom
symptom_counts = {}

# Iterate over all connected diseases
for connected_diseases in symptom_to_diseases_df['Connected Diseases']:
    # Iterate over symptoms associated with the current disease
    for symptom in connected_diseases:
        if symptom in symptom_counts:
            symptom_counts[symptom] += 1
        else:
            symptom_counts[symptom] = 1

# Sort the symptom_counts dictionary by value (count) in ascending order
sorted_symptom_counts = sorted(symptom_counts.items(), key=lambda x: x[1])

# Print the symptoms in ascending order of count
print("Symptoms from smallest count to highest count:")
for symptom, count in sorted_symptom_counts:
    print(f"- {symptom}: {count}")


Symptoms from smallest count to highest count:
- Chronic kidney disease: 6
- Decubitus ulcer: 7
- Aphakia: 8
- Fibrocystic breast disease: 9
- Chondromalacia of the patella: 9
- Female infertility of unknown cause: 10
- Cystic Fibrosis: 10
- Central retinal artery or vein occlusion: 10
- De Quervain disease: 10
- Dissociative disorder: 11
- Drug abuse cocaine: 11
- Adhesive capsulitis of the shoulder: 11
- Acute glaucoma: 11
- Diabetic peripheral neuropathy: 12
- Chronic back pain: 12
- Endometrial hyperplasia: 12
- Cushing syndrome: 12
- Acne: 12
- Fracture of the rib: 12
- Acariasis: 12
- Diabetes insipidus: 12
- Adjustment reaction: 12
- Carpal tunnel syndrome: 12
- Endometrial cancer: 12
- Erectile dysfunction: 12
- Contact dermatitis: 12
- Alzheimer disease: 12
- Graves disease: 12
- Anxiety: 12
- Cyst of the eyelid: 12
- Diabetic retinopathy: 12
- Epididymitis: 12
- Diaper rash: 12
- Conductive hearing loss: 12
- Eating disorder: 12
- Amyloidosis: 12
- Epidural hemorrhage: 12
- D

In [8]:
# Initialize a list to store user's input symptoms
user_symptoms = []

while True:
    # Ask the user for a symptom
    user_input = input("Please enter a symptom: ").strip().lower()

    # Find matching symptoms in the dataset
    matching_symptoms = [symptom for symptom in symptom_to_diseases_df['Symptom'] if user_input in symptom.lower()]

    if not matching_symptoms:
        print(f"No symptoms matching '{user_input}' found. Please try again.")
        continue

    # Display matching symptoms for user to choose from
    print(f"Symptoms matching '{user_input}':")
    for idx, symptom in enumerate(matching_symptoms, 1):
        print(f"{idx}. {symptom}")

    # Ask the user which symptom they have
    selected_index = input("Enter the number corresponding to the symptom you have: ")

    # Validate user selection
    try:
        selected_index = int(selected_index)
        if 1 <= selected_index <= len(matching_symptoms):
            chosen_symptom = matching_symptoms[selected_index - 1]
            if chosen_symptom not in user_symptoms:
                user_symptoms.append(chosen_symptom)
                print(f"You selected: {chosen_symptom}")
            else:
                print(f"You have already selected the symptom: {chosen_symptom}. Please choose another symptom.")
                continue
        else:
            print("Invalid selection. Please enter a number corresponding to the symptom.")
            continue
    except ValueError:
        print("Invalid selection. Please enter a number corresponding to the symptom.")
        continue

    # Find diseases associated with all user's input symptoms
    all_symptoms_diseases = [set(symptom_to_diseases_df.loc[symptom_to_diseases_df['Symptom'] == symptom]['Connected Diseases'].explode()) for symptom in user_symptoms]
    common_diseases = set.intersection(*all_symptoms_diseases)

    if len(common_diseases) == 1:
        # If only one disease is found, display it and exit
        print("Only one disease found associated with all symptoms entered:")
        print("- " + common_diseases.pop())
        break

    # If no diseases are associated with all the symptoms entered, prompt the user to enter more symptoms
    if not common_diseases:
        print("No diseases found associated with all the symptoms entered. Please enter more symptoms.")
        continue

    # Show possible symptoms associated with the common diseases
    possible_symptoms = set()
    for disease in common_diseases:
        # Filter symptoms associated with the common diseases and user's input symptoms
        disease_symptoms = set(symptom_to_diseases_df[symptom_to_diseases_df['Connected Diseases'].apply(lambda x: disease in x)]['Symptom'])
        possible_symptoms.update(disease_symptoms)

    possible_symptoms.difference_update(user_symptoms)  # Exclude already entered symptoms

    # Print the possible symptoms
    if possible_symptoms:
        print("\nPossible symptoms:")
        for i, symptom in enumerate(possible_symptoms, start=1):
            print(f"{i}. {symptom}")

        # Ask the user to select a symptom by its corresponding number
        selected_number = input("Please enter the number corresponding to the symptom you want to choose: ")
        try:
            selected_index = int(selected_number)
            if 1 <= selected_index <= len(possible_symptoms):
                user_input = list(possible_symptoms)[selected_index - 1]
                if user_input not in user_symptoms:
                    user_symptoms.append(user_input)  # Append the user's input to the list of symptoms
                else:
                    print(f"You have already selected the symptom: {user_input}. Please choose another symptom.")
            else:
                print("Invalid input. Please enter a valid number.")
        except ValueError:
            print("Invalid input. Please enter a valid number.")
    else:
        print("No additional symptoms found. Please enter more symptoms or verify your inputs.")


Please enter a symptom: back
Symptoms matching 'back':
1. Back stiffness or tightness
2. Back weakness
3. Back mass or lump
4. Back cramps or spasms
5. Low back weakness
6. Low back pain
7. Back pain
Enter the number corresponding to the symptom you have: 7
You selected: Back pain

Possible symptoms:
1. Difficulty speaking
2. Back mass or lump
3. Involuntary urination
4. Skin on leg or foot looks infected
5. Neck pain
6. Headache
7. Stiffness all over
8. Elbow pain
9. Muscle weakness
10. Hip pain
11. Fatigue
12. Hurts to breath
13. Back stiffness or tightness
14. Lower body pain
15. Abnormal involuntary movements
16. Excessive growth
17. Muscle pain
18. Dizziness
19. Shortness of breath
20. Emotional symptoms
21. Delusions or hallucinations
22. Sharp chest pain
23. Diminished hearing
24. Elbow weakness
25. Retention of urine
26. Back cramps or spasms
27. Lower abdominal pain
28. Weakness
29. Ache all over
30. Joint pain
31. Acne or pimples
32. Facial pain
33. Blood in urine
34. Dry or 

KeyboardInterrupt: Interrupted by user

In [6]:
# Initialize a list to store user's input symptoms
user_symptoms = []

while True:
    # Ask the user for a symptom
    user_input = input("Please enter a symptom: ").strip().lower()

    # Find matching symptoms in the dataset
    matching_symptoms = [symptom for symptom in symptom_to_diseases_df['Symptom'] if user_input in symptom.lower()]

    if not matching_symptoms:
        print(f"No symptoms matching '{user_input}' found. Please try again.")
        continue

    # Display matching symptoms for user to choose from
    print(f"Symptoms matching '{user_input}':")
    for idx, symptom in enumerate(matching_symptoms, 1):
        print(f"{idx}. {symptom}")

    # Ask the user which symptom they have
    selected_index = input("Enter the number corresponding to the symptom you have: ")

    # Validate user selection
    try:
        selected_index = int(selected_index)
        if 1 <= selected_index <= len(matching_symptoms):
            chosen_symptom = matching_symptoms[selected_index - 1]
            if chosen_symptom not in user_symptoms:
                user_symptoms.append(chosen_symptom)
                print(f"You selected: {chosen_symptom}")
            else:
                print(f"You have already selected the symptom: {chosen_symptom}. Please choose another symptom.")
                continue
        else:
            print("Invalid selection. Please enter a number corresponding to the symptom.")
            continue
    except ValueError:
        print("Invalid selection. Please enter a number corresponding to the symptom.")
        continue

    # Find diseases associated with all user's input symptoms
    all_symptoms_diseases = [set(symptom_to_diseases_df.loc[symptom_to_diseases_df['Symptom'] == symptom]['Connected Diseases'].explode()) for symptom in user_symptoms]
    common_diseases = set.intersection(*all_symptoms_diseases)

    if len(common_diseases) == 1:
        # If only one disease is found, display it and exit
        print("Only one disease found associated with all symptoms entered:")
        print("- " + common_diseases.pop())
        break

    # If no diseases are associated with all the symptoms entered, prompt the user to enter more symptoms
    if not common_diseases:
        print("No diseases found associated with all the symptoms entered. Please enter more symptoms.")
        continue

    # Show possible symptoms associated with the common diseases
    possible_symptoms = set()
    for disease in common_diseases:
        # Filter symptoms associated with the common diseases
        disease_symptoms = set(symptom_to_diseases_df[symptom_to_diseases_df['Connected Diseases'].apply(lambda x: disease in x)]['Symptom'])
        possible_symptoms.update(disease_symptoms)

    # Keep only those symptoms that are connected to the user's chosen symptoms
    possible_symptoms = possible_symptoms.intersection(*[
        set(symptom_to_diseases_df.loc[symptom_to_diseases_df['Symptom'] == symptom]['Connected Diseases'].explode()) for symptom in user_symptoms
    ])

    possible_symptoms.difference_update(user_symptoms)  # Exclude already entered symptoms

    # Print the possible symptoms
    if possible_symptoms:
        print("\nPossible symptoms:")
        for i, symptom in enumerate(possible_symptoms, start=1):
            print(f"{i}. {symptom}")

        # Ask the user to select a symptom by its corresponding number
        selected_number = input("Please enter the number corresponding to the symptom you want to choose: ")
        try:
            selected_index = int(selected_number)
            if 1 <= selected_index <= len(possible_symptoms):
                user_input = list(possible_symptoms)[selected_index - 1]
                if user_input not in user_symptoms:
                    user_symptoms.append(user_input)  # Append the user's input to the list of symptoms
                else:
                    print(f"You have already selected the symptom: {user_input}. Please choose another symptom.")
            else:
                print("Invalid input. Please enter a valid number.")
        except ValueError:
            print("Invalid input. Please enter a valid number.")
    else:
        print("No additional symptoms found. Please enter more symptoms or verify your inputs.")


Please enter a symptom: head
Symptoms matching 'head':
1. Frontal headache
2. Headache
Enter the number corresponding to the symptom you have: 1
You selected: Frontal headache
Only one disease found associated with all symptoms entered:
- Carbon monoxide poisoning


In [7]:
import pandas as pd
import pickle
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the data from the pickled file
file_path = '/content/drive/My Drive/synthetic_dataset/goal_set.p'
with open(file_path, 'rb') as file:
    consultation_data = pickle.load(file)

# Lists to hold symptoms and diseases
symptoms = []
diseases = []

# Extract symptoms and diseases from each consultation in the 'train' data
for item in consultation_data['train']:
    # Extract disease tag
    disease_tag = item['disease_tag']

    # Extract symptoms
    explicit_symptoms = list(item['goal']['explicit_inform_slots'].keys())
    implicit_symptoms = list(item['goal']['implicit_inform_slots'].keys())
    all_symptoms = explicit_symptoms + implicit_symptoms

    # Append the extracted symptoms and disease to the respective lists
    symptoms.extend(all_symptoms)
    diseases.extend([disease_tag] * len(all_symptoms))

# Create a DataFrame from the lists of symptoms and diseases
df = pd.DataFrame({
    'Disease Tag': diseases,
    'Symptom': symptoms
})

# Ask the user for a symptom
symptom_input = input("Enter a symptom: ").strip().lower()

# Find the symptom in the DataFrame
matching_symptoms = df[df['Symptom'].str.lower().str.contains(symptom_input)]

if matching_symptoms.empty:
    print(f"No matching diseases found for the symptom '{symptom_input}'.")
else:
    # Extract and display the matching diseases
    print(f"Matching diseases for the symptom '{symptom_input}':")
    for disease in matching_symptoms['Disease Tag'].unique():
        print("- " + disease)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enter a symptom: Frontal headache
Matching diseases for the symptom 'frontal headache':
- Carbon monoxide poisoning
