In [None]:
import pandas as pd
import pickle
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the data from the pickled file
file_path = '/content/drive/My Drive/synthetic_dataset/goal_set.p'
with open(file_path, 'rb') as file:
    consultation_data = pickle.load(file)

# Extract 'train' data
train_data = consultation_data['train']

# Lists to hold disease tags and symptoms
disease_tags = []
symptoms = []

# Extract disease tag and symptoms from each consultation in the 'train' data
for item in train_data:
    # Extract disease tag
    disease_tags.append(item['disease_tag'])

    # Extract symptoms
    explicit_symptoms = list(item['goal']['explicit_inform_slots'].keys())
    implicit_symptoms = list(item['goal']['implicit_inform_slots'].keys())
    all_symptoms = explicit_symptoms + implicit_symptoms
    symptoms.append(all_symptoms)

# Create a DataFrame
df = pd.DataFrame({
    'Disease Tag': disease_tags,
    'Symptoms': symptoms
})

# Display the DataFrame
print(df)


Mounted at /content/drive
                                    Disease Tag  \
0      Central retinal artery or vein occlusion   
1                     Degenerative disc disease   
2                          Diabetic retinopathy   
3                             Chronic back pain   
4                                  Air embolism   
...                                         ...   
23995                                Concussion   
23996                          Cushing syndrome   
23997                              Fibromyalgia   
23998                              Dengue fever   
23999                                 Flat feet   

                                                Symptoms  
0      [Spots or clouds in vision, Diminished vision,...  
1      [Shoulder pain, Back pain, Low back pain, Neck...  
2                        [Foreign body sensation in eye]  
3                  [Low back pain, Back pain, Side pain]  
4      [Wrist pain, Pain in eye, Shoulder cramps or s...  
...    

In [None]:
df

Unnamed: 0,Disease Tag,Symptoms
0,Central retinal artery or vein occlusion,"[Spots or clouds in vision, Diminished vision,..."
1,Degenerative disc disease,"[Shoulder pain, Back pain, Low back pain, Neck..."
2,Diabetic retinopathy,[Foreign body sensation in eye]
3,Chronic back pain,"[Low back pain, Back pain, Side pain]"
4,Air embolism,"[Wrist pain, Pain in eye, Shoulder cramps or s..."
...,...,...
23995,Concussion,"[Neck pain, Headache, Nausea]"
23996,Cushing syndrome,[Weight gain]
23997,Fibromyalgia,"[Low back pain, Back pain, Ache all over, Shou..."
23998,Dengue fever,"[Fever, Sore throat, Wrist pain, Pain during p..."


In [None]:
# Dictionary to hold symptoms and the diseases they appear in
symptom_disease_map = {}

# Iterate through each consultation in the 'train' data
for item in train_data:
    disease_tag = item['disease_tag']
    explicit_symptoms = list(item['goal']['explicit_inform_slots'].keys())
    implicit_symptoms = list(item['goal']['implicit_inform_slots'].keys())
    all_symptoms = explicit_symptoms + implicit_symptoms

    # Update the symptom-disease mapping
    for symptom in all_symptoms:
        if symptom not in symptom_disease_map:
            symptom_disease_map[symptom] = set()
        symptom_disease_map[symptom].add(disease_tag)

# List to hold repeating symptoms
repeating_symptoms = []

# Iterate through the symptom-disease mapping to find repeating symptoms
for symptom, diseases in symptom_disease_map.items():
    if len(diseases) > 1:
        repeating_symptoms.append((symptom, len(diseases)))

# Print the repeating symptoms along with the number of diseases they appear in
print("Repeating Symptoms and Number of Diseases:")
for idx, (symptom, num_diseases) in enumerate(repeating_symptoms, start=1):
    print(f"{idx}. {symptom}: {num_diseases} diseases")


Repeating Symptoms and Number of Diseases:
1. Spots or clouds in vision: 7 diseases
2. Diminished vision: 12 diseases
3. Symptoms of eye: 11 diseases
4. Pain in eye: 18 diseases
5. Shoulder pain: 7 diseases
6. Back pain: 13 diseases
7. Low back pain: 6 diseases
8. Neck pain: 9 diseases
9. Hip pain: 7 diseases
10. Ache all over: 11 diseases
11. Foreign body sensation in eye: 7 diseases
12. Side pain: 6 diseases
13. Wrist pain: 10 diseases
14. Shoulder cramps or spasms: 6 diseases
15. Facial pain: 8 diseases
16. Ankle pain: 9 diseases
17. Pain during pregnancy: 6 diseases
18. Joint stiffness or tightness: 7 diseases
19. Pain or soreness of breast: 8 diseases
20. Knee lump or mass: 9 diseases
21. Excessive urination at night: 5 diseases
22. Skin rash: 15 diseases
23. Skin dryness, peeling, scaliness, or roughness: 6 diseases
24. Diarrhea: 3 diseases
25. Excessive anger: 10 diseases
26. Fatigue: 14 diseases
27. Depression: 10 diseases
28. Sweating: 2 diseases
29. Abnormal involuntary movem

In [None]:
# Dictionary to hold the number of symptoms for each disease
disease_symptom_count = {}

# Iterate through each consultation in the 'train' data
for item in train_data:
    disease_tag = item['disease_tag']
    explicit_symptoms = list(item['goal']['explicit_inform_slots'].keys())
    implicit_symptoms = list(item['goal']['implicit_inform_slots'].keys())
    all_symptoms = explicit_symptoms + implicit_symptoms

    # Update the symptom count for the disease
    if disease_tag not in disease_symptom_count:
        disease_symptom_count[disease_tag] = len(set(all_symptoms))
    else:
        disease_symptom_count[disease_tag] += len(set(all_symptoms))

# Print the number of symptoms for each disease
print("Number of Symptoms for Each Disease:")
for disease, symptom_count in disease_symptom_count.items():
    print(f"{disease}: {symptom_count}")


Number of Symptoms for Each Disease:
Central retinal artery or vein occlusion: 763
Degenerative disc disease: 1181
Diabetic retinopathy: 667
Chronic back pain: 753
Air embolism: 2594
Dyshidrosis: 989
Diaper rash: 676
Conversion disorder: 1230
Anxiety: 944
Conductive hearing loss: 909
Alcohol intoxication: 1204
Conjunctivitis due to allergy: 1307
Drug reaction: 1087
Endometriosis: 877
Granuloma inguinale: 981
Extrapyramidal effect of drugs: 968
Corneal disorder: 937
Encephalitis: 1054
Eczema: 958
Actinic keratosis: 894
Endometrial cancer: 529
Adjustment reaction: 743
Drug abuse cocaine: 805
Chondromalacia of the patella: 553
Chalazion: 959
Alzheimer disease: 565
Acariasis: 985
Epidural hemorrhage: 956
Guillain Barre syndrome: 1291
Acute kidney injury: 840
Cerebral edema: 1227
Diabetic peripheral neuropathy: 668
Cushing syndrome: 379
Contact dermatitis: 699
Epididymitis: 946
Chickenpox: 1032
Flat feet: 922
Dissociative disorder: 895
Decubitus ulcer: 311
Amyotrophic lateral sclerosis ALS:

In [None]:
import pandas as pd

# Convert the dictionary to a DataFrame
df_symptom_count = pd.DataFrame.from_dict(disease_symptom_count, orient='index', columns=['Number of Symptoms'])

# Sort the DataFrame by the number of symptoms in descending order
df_symptom_count = df_symptom_count.sort_values(by='Number of Symptoms', ascending=False)

# Display the DataFrame
print("Number of Symptoms for Each Disease:")
print(df_symptom_count)


Number of Symptoms for Each Disease:
                                     Number of Symptoms
Fat embolism                                       3021
Air embolism                                       2594
Cat scratch disease                                1751
Dengue fever                                       1476
Ectropion                                          1382
...                                                 ...
Graves disease                                      431
Female infertility of unknown cause                 388
Cushing syndrome                                    379
Chronic kidney disease                              344
Decubitus ulcer                                     311

[90 rows x 1 columns]


In [None]:
# Dictionary to hold the number of symptoms (both explicit and implicit) for each disease
disease_total_symptom_count = {}

# Iterate through each consultation in the 'train' data
for item in train_data:
    disease_tag = item['disease_tag']
    explicit_symptoms = list(item['goal']['explicit_inform_slots'].keys())
    implicit_symptoms = list(item['goal']['implicit_inform_slots'].keys())
    all_symptoms = explicit_symptoms + implicit_symptoms

    # Update the symptom count for the disease
    if disease_tag not in disease_total_symptom_count:
        disease_total_symptom_count[disease_tag] = len(set(all_symptoms))
    else:
        disease_total_symptom_count[disease_tag] += len(set(all_symptoms))

# Convert the dictionary to a DataFrame
df_total_symptom_count = pd.DataFrame.from_dict(disease_total_symptom_count, orient='index', columns=['Total Number of Symptoms'])

# Sort the DataFrame by the total number of symptoms in descending order
df_total_symptom_count = df_total_symptom_count.sort_values(by='Total Number of Symptoms', ascending=False)

# Display the DataFrame
print("Total Number of Symptoms (Explicit + Implicit) for Each Disease:")
print(df_total_symptom_count)


Total Number of Symptoms (Explicit + Implicit) for Each Disease:
                                     Total Number of Symptoms
Fat embolism                                             3021
Air embolism                                             2594
Cat scratch disease                                      1751
Dengue fever                                             1476
Ectropion                                                1382
...                                                       ...
Graves disease                                            431
Female infertility of unknown cause                       388
Cushing syndrome                                          379
Chronic kidney disease                                    344
Decubitus ulcer                                           311

[90 rows x 1 columns]


In [None]:
# Calculate the total number of diseases
total_diseases = len(associated_diseases)

# Assign probabilities to each repeating symptom based on the number of diseases they appear in
repeating_symptoms_with_prob = [(symptom, num_diseases, (num_diseases / total_diseases) * 100)
                                for symptom, num_diseases in repeating_symptoms]

# Sort the repeating symptoms based on their probabilities in descending order
repeating_symptoms_with_prob.sort(key=lambda x: x[2], reverse=True)

# Initialize a dictionary to hold the probability of each symptom
symptom_prob = {}

# Print the repeating symptoms along with their probabilities
print("Repeating Symptoms, Number of Diseases, and Probability (Sorted by Probability):")
for idx, (symptom, num_diseases, probability) in enumerate(repeating_symptoms_with_prob, start=1):
    symptom_prob[symptom] = probability  # Save the probability of the symptom
    print(f"{idx}. {symptom}: {num_diseases} diseases, Probability: {probability:.2f}%")


Repeating Symptoms, Number of Diseases, and Probability (Sorted by Probability):
1. Pain in eye: 18 diseases, Probability: 900.00%
2. Skin rash: 15 diseases, Probability: 750.00%
3. Fatigue: 14 diseases, Probability: 700.00%
4. Leg pain: 14 diseases, Probability: 700.00%
5. Problems with movement: 14 diseases, Probability: 700.00%
6. Back pain: 13 diseases, Probability: 650.00%
7. Dizziness: 13 diseases, Probability: 650.00%
8. Skin lesion: 13 diseases, Probability: 650.00%
9. Headache: 13 diseases, Probability: 650.00%
10. Diminished vision: 12 diseases, Probability: 600.00%
11. Itching of skin: 12 diseases, Probability: 600.00%
12. Depressive or psychotic symptoms: 12 diseases, Probability: 600.00%
13. Symptoms of eye: 11 diseases, Probability: 550.00%
14. Ache all over: 11 diseases, Probability: 550.00%
15. Sharp abdominal pain: 11 diseases, Probability: 550.00%
16. Loss of sensation: 11 diseases, Probability: 550.00%
17. Itchiness of eye: 11 diseases, Probability: 550.00%
18. Pares

In [None]:
# Ask the user for input
user_input = input("Enter a word to match symptoms: ").strip().lower()

# List to hold matching symptoms
matching_symptoms = []

# Iterate through the symptom-disease mapping
for symptom in symptom_disease_map.keys():
    if user_input in symptom.lower():
        matching_symptoms.append(symptom)

# Print matching symptoms
if matching_symptoms:
    print(f"Symptoms matching '{user_input}':")
    for idx, symptom in enumerate(matching_symptoms, 1):
        print(f"{idx}. {symptom}")

    # Ask the user which symptom they have
    selection = input("Enter the number corresponding to the symptom you have: ")

    # Validate user selection
    if selection.isdigit() and 1 <= int(selection) <= len(matching_symptoms):
        user_input_symptom = matching_symptoms[int(selection) - 1]
        print(f"You selected: {user_input_symptom}")
    else:
        print("Invalid selection. Please enter a number corresponding to the symptom.")
else:
    print(f"No symptoms matching '{user_input}' found.")


Enter a word to match symptoms: back
Symptoms matching 'back':
1. Back pain
2. Low back pain
3. Back weakness
4. Low back weakness
5. Back stiffness or tightness
6. Back cramps or spasms
7. Back mass or lump
Enter the number corresponding to the symptom you have: 3
You selected: Back weakness


In [None]:
# Define a variable to track whether diseases are found
diseases_found = False

# Initialize a set to hold encountered diseases
encountered_diseases = set()

# Initialize a list to hold confirmed symptoms
confirmed_symptoms = []

# Iterate over sorted symptoms in ascending order of probability
for symptom, probability in sorted_symptoms:
    # Check if the symptom has already been confirmed
    if symptom in confirmed_symptoms:
        continue

    response = input(f"Do you have {symptom}? (yes/no): ").lower().strip()
    if response == 'yes':
        confirmed_symptoms.append(symptom)
        # Find diseases associated with confirmed symptoms
        matching_diseases = set()
        for disease, symptoms in symptom_disease_map.items():
            if all(symptom in confirmed_symptoms for symptom in symptoms):
                matching_diseases.add(disease)

        # Print diseases containing confirmed symptoms
        if matching_diseases:
            print("Predicted Diseases:")
            for disease in matching_diseases:
                print(disease)
            diseases_found = True
            break
        else:
            print("No diseases found containing the confirmed symptoms.")
    elif response == 'no':
        continue
    else:
        print("Invalid response. Please enter 'yes' or 'no'.")

# If diseases are found, calculate NDCG score
if diseases_found:
    # Calculate NDCG score (NDCG calculation code goes here)
    ndcg_score = 0.0  # Placeholder for actual NDCG calculation
    print("NDCG Score:", ndcg_score)
else:
    print("No diseases predicted.")



Do you have Groin pain? (yes/no): yes
No diseases found containing the confirmed symptoms.
Do you have Headache? (yes/no): yes
No diseases found containing the confirmed symptoms.
Do you have Knee pain? (yes/no): yes
No diseases found containing the confirmed symptoms.
No diseases predicted.


In [None]:
# Define a variable to track whether diseases are found
diseases_found = False

# Initialize a set to hold encountered diseases
encountered_diseases = set()

# Initialize a set to hold all symptoms found in associated diseases
other_symptoms = set()

# Iterate over sorted symptoms in ascending order of probability
for symptom, probability in sorted_symptoms:
    # Check if the symptom has already been confirmed
    if symptom == user_input_symptom:
        continue

    response = input(f"Do you have {symptom}? (yes/no): ").lower().strip()
    if response == 'yes':
        # Combine the confirmed symptom with the user input symptom
        combined_symptoms = [user_input_symptom, symptom]

        # Find diseases associated with both symptoms
        combined_diseases = []
        for index, row in df.iterrows():
            disease_tag = row['Disease Tag']
            symptoms = row['Symptoms']
            if 'Wrist pain' in symptoms and 'Pain in eye' in symptoms:
                if disease_tag not in encountered_diseases:
                    combined_diseases.append(disease_tag)
                    encountered_diseases.add(disease_tag)
                # Add other symptoms of the disease to the set
                other_symptoms.update(symptoms)

        # Print diseases containing both symptoms
        if combined_diseases:
            print(f"Diseases containing both '{user_input_symptom}' and '{symptom}':")
            for idx, disease in enumerate(combined_diseases, 1):
                print(f"{idx}. {disease}")
            diseases_found = True
            break
        else:
            print(f"No diseases found containing both '{user_input_symptom}' and '{symptom}'.")
    elif response == 'no':
        continue
    else:
        print("Invalid response. Please enter 'yes' or 'no'.")

# If diseases are found, display other symptoms in associated diseases
if diseases_found:
    print("\nOther Symptoms in Associated Diseases:")
    for idx, symptom in enumerate(other_symptoms, 1):
        print(f"{idx}. {symptom}")


Do you have Sweating? (yes/no): yes
Diseases containing both 'Back weakness' and 'Sweating':
1. Air embolism
2. Gas gangrene
3. Fat embolism
4. Chancroid
5. Chagas disease
6. Dengue fever

Other Symptoms in Associated Diseases:
1. Excessive urination at night
2. Fever
3. Ankle pain
4. Sore throat
5. Fatigue
6. Sharp abdominal pain
7. Pain in eye
8. Pain during pregnancy
9. Joint stiffness or tightness
10. Pain or soreness of breast
11. Knee lump or mass
12. Excessive anger
13. Facial pain
14. Shoulder cramps or spasms
15. Wrist pain


In [None]:
###############################################