## This module finds top 10 possible diseases based on each ML model for given symptoms

In [1]:
# Filter & ignore warnings for clear output visualization

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import all necessary packages

import os
import math
import joblib
import pandas as pd
from statistics import mean

### Functions to find co-occuring symptoms with some threshold & process ML results to get list of diseases

In [3]:
# Function to generate top 10 diseases from the ML results
# Pass mean_score as another argument if you need probabilities too
def ProcessResultAndGenerateDiseases(top10_list, mean_score, cooccuring_symptoms, user_symptoms_len):
    
    global df_independent, all_symptoms, all_diseases
    top10_dict = {}

    # Checks for each disease, the matched symptoms & generates probability of having that disease
    for (idx, disease_id) in enumerate(top10_list):
        matched_symptoms = set()
        top10 = df_independent.loc[df_independent['Disease_Name'] == all_diseases[disease_id]].values.tolist()
        
        # Obtains the disease name which is at the top of the dataframe
        disease = top10[0].pop(0)

        # Each row contains 0s & 1s indicating whether a disease is associated with a particular symptom or not
        for (idx, value) in enumerate(top10[0]):
            if value != 0:
                matched_symptoms.add(all_symptoms[idx])
                
        probability = (len(matched_symptoms.intersection(set(cooccuring_symptoms))) + 1) / (user_symptoms_len + 1)
        top10_dict[disease] = round(probability * mean_score * 100, 2)
    
    top10_sorted_dict = dict(sorted(top10_dict.items(), key=lambda kv: kv[1], reverse=True))
    return top10_sorted_dict  
        

In [4]:
# Function to display the results from the dictionary
def PrintDictionary(top10_sorted_dict):
    for (key, value) in top10_sorted_dict.items():
        print(key, "\t", value, "%")


In [5]:
# Function to find co-occuring symptoms with all the symptoms user chosen
# We use a threshold to check for a 95% match with the given symptoms

def FindCooccuringSymptomsWithThreshold(user_symptoms):
    
    global df_independent, all_symptoms
    threshold = math.ceil(len(user_symptoms) * 0.95)

    # Get all unique possible diseases with the given symptoms
    unique_diseases = set()
    for symptom in user_symptoms:
        possible_diseases_for_symptom = list(df_independent[df_independent[symptom] == 1]['Disease_Name'])
        for disease in possible_diseases_for_symptom:
            unique_diseases.add(disease)
        
    # Get all unique diseases & sort them
    unique_diseases = sorted(list(unique_diseases))
    
    #print(unique_diseases)

    # Obtain co-occuring symptoms with 95% threshold
    # cooccuring_symptoms must have all given symptoms by default
    cooccuring_symptoms = set(user_symptoms)   
    for disease in unique_diseases:
        
        # First, obtain all symptoms associated with each disease in unique diseases obtained
        symptoms_of_disease = df_independent.loc[df_independent['Disease_Name'] == disease].values.tolist().pop(0)

        # Maintain a temporary set of symptoms of the disease & add them only when they meet threshold requirements
        temp_symptoms = set()
        count, add_symptoms = 0, False
        for idx in range(len(symptoms_of_disease)):
            
            # Symptoms of a disease will have 1 in their respective symptom columns
            if symptoms_of_disease[idx] == 1:
                temp_symptoms.add(all_symptoms[idx])
                count = count + 1

                # Our threshold is set to 95% of original symptoms
                if count > threshold:
                    add_symptoms = True

        # Adds temporary symptoms to cooccuring symptoms only if they meet threshold requirements
        if add_symptoms == True:
            for symp in temp_symptoms:
                cooccuring_symptoms.add(symp)

    cooccuring_symptoms = sorted(list(cooccuring_symptoms))
    return cooccuring_symptoms
    

### Prepares data to be compatible with the dataset to make predictions

In [6]:
# Load datasets for all possible combinations & for individual disease's respective symptoms
current_directory = os.getcwd()
data_path = current_directory + "/Datasets-CSV"

df_combination = pd.read_csv(data_path + "/Disease_Symptom_Dataset_For_All_Symptom_Subsets.csv") 
df_independent = pd.read_csv(data_path + "/Disease_Symptom_Dataset_For_Respective_Symptoms.csv") 

X_combination = df_combination.iloc[:, 1:]
Y_combination = df_combination.iloc[:, 0:1]

X_independent = df_independent.iloc[:, 1:]
Y_independent = df_independent.iloc[:, 0:1]

# List of all possible symptoms
all_symptoms = list(X_independent.columns)
all_diseases = list(set(Y_independent['Disease_Name']))
all_diseases.sort()

# We obtain top 10 possible diseases
no_of_diseases = 10


In [7]:
# Will be obtained from the front-end
user_symptoms = ["weakness fatigue", "headache", "prolonged cough", "fever", "tiredness", "shortness breath", "breathlessness"]
user_symptoms_len = len(set(user_symptoms))

# Obtains all possible cooccuring symptoms including given symptoms
cooccuring_symptoms = FindCooccuringSymptomsWithThreshold(user_symptoms)
print(cooccuring_symptoms)

# Process obtained symptoms to create rows compatible with the dataset
processed_symptoms = [0 for x in range(0, len(all_symptoms))]
for symptom in cooccuring_symptoms:
    processed_symptoms[all_symptoms.index(symptom)] = 1
    

['absent near absent urine output', 'aching', 'anxiety', 'asthenopia', 'bacterial infection', 'bloating', 'breathlessness', 'bruising', 'change skin color red black', 'chest tightness', 'coma', 'confused thinking', 'constipation', 'coolness', 'crawl', 'decreased ability feel pain', 'decreased ability turn', 'depends location runny nose', 'dermatitis herpetiformis', 'diarrhea mixed blood', 'diarrhoea may bloody', 'dry eye', 'expanding area redness site tick bite', 'feel need check thing repeatedly', 'feeling faint upon standing', 'fever', 'fever start low increase daily', 'firm', 'half ring finger', 'hallucination usually hearing voice', 'headache', 'hearing impairment loss', 'heat intolerance', 'inability gain maintain erection', 'increased fat', 'increased hunger', 'internal bleeding', 'irritation', 'limited critical thinking', 'loss bladder bowel control', 'low energy', 'lower abdominal pain', 'malabsorption', 'mental change', 'multiple painful joint', 'muscle cramp', 'nausea vomitin

### Uses 4 Machine Learning models to obtain possible predictions

In [8]:
# Get sav file path
sav_path = current_directory + "/Model-Weights/"


In [9]:
print("Processing with Logistic Regression...")
lr_cls = joblib.load(sav_path + "log_reg.sav")
lr_mean_score = joblib.load(sav_path + "log_reg_cv.sav")
lr_result = lr_cls.predict_proba([processed_symptoms])
print("Done")


Processing with Logistic Regression...
Done


In [10]:
print("Processing with KNN Classifier...")
knn_cls = joblib.load(sav_path + "knn.sav")
knn_mean_score = joblib.load(sav_path + "knn_cv.sav")
knn_result = knn_cls.predict_proba([processed_symptoms])
print("Done")


Processing with KNN Classifier...
Done


In [11]:
print("Processing with Multinomial Naive Bayes...")
mnb_cls = joblib.load(sav_path + "mnb.sav")
mnb_mean_score = joblib.load(sav_path + "mnb_cv.sav")
mnb_result = mnb_cls.predict_proba([processed_symptoms])
print("Done")


Processing with Multinomial Naive Bayes...
Done


In [12]:
print("Processing with Random Forest Classifier...")
rf_cls = joblib.load(sav_path + "rand_forest.sav")
rf_mean_score = joblib.load(sav_path + "rand_forest_cv.sav")
rf_result = rf_cls.predict_proba([processed_symptoms])
print("Done")


Processing with Random Forest Classifier...
Done


### Generate top 10 diseases associated with the symptoms from ML models' prediction results

In [13]:
# Logistic Regression result
print("---------- LOGISTIC REGRESSION: ----------\n")

lr_top10 = lr_result[0].argsort()[-no_of_diseases:][::-1]
lr_dict = ProcessResultAndGenerateDiseases(lr_top10, lr_mean_score, cooccuring_symptoms, user_symptoms_len)
PrintDictionary(lr_dict)

# Random Forest result
print("\n---------- RANDOM FOREST: ----------\n")

rf_top10 = rf_result[0].argsort()[-no_of_diseases:][::-1]
rf_dict = ProcessResultAndGenerateDiseases(rf_top10, rf_mean_score, cooccuring_symptoms, user_symptoms_len)
PrintDictionary(rf_dict)

# Knn Result
print("\n---------- KNN CLASSIFIER: ----------\n")

knn_top10 = knn_result[0].argsort()[-no_of_diseases:][::-1]
knn_dict = ProcessResultAndGenerateDiseases(knn_top10, knn_mean_score, cooccuring_symptoms, user_symptoms_len)
PrintDictionary(knn_dict)


# Multinomial Naive bayes Result
print("\n---------- MULTINOMIAL NAIVE BAYES: ----------\n")

mnb_top10 = mnb_result[0].argsort()[-no_of_diseases:][::-1]
mnb_dict = ProcessResultAndGenerateDiseases(mnb_top10, mnb_mean_score, cooccuring_symptoms, user_symptoms_len)
PrintDictionary(mnb_dict)


---------- LOGISTIC REGRESSION: ----------

Lyme disease 	 58.63 %
Coronavirus disease 2019 (covid-19) 	 58.63 %
Celiacs disease 	 46.9 %
Schizophrenia 	 46.9 %
Rabies 	 46.9 %
Brain tumour 	 46.9 %
Leukemia 	 46.9 %
Anthrax 	 46.9 %
Mucormycosis 	 46.9 %
Porphyria 	 46.9 %

---------- RANDOM FOREST: ----------

Coronavirus disease 2019 (covid-19) 	 57.64 %
Porphyria 	 46.11 %
Legionellosis 	 46.11 %
Celiacs disease 	 46.11 %
Typhoid / enteric fever 	 34.58 %
Sepsis 	 34.58 %
Asthma 	 34.58 %
Erectile dysfunctions 	 23.05 %
Exposure keratopathy 	 23.05 %
Fibromyalgia 	 11.53 %

---------- KNN CLASSIFIER: ----------

Coronavirus disease 2019 (covid-19) 	 57.68 %
Lyme disease 	 57.68 %
Rabies 	 46.14 %
Schizophrenia 	 46.14 %
Aseptic meningitis 	 46.14 %
Anthrax 	 46.14 %
Crimean congo haemorrhagic fever (cchf) 	 46.14 %
Irritable bowel syndrome 	 23.07 %
Fibromyalgia 	 11.54 %
Endometriosis 	 11.54 %

---------- MULTINOMIAL NAIVE BAYES: ----------

Lyme disease 	 56.21 %
Schizophrenia 	

### Process the obtained results to be suitable for UI

In [14]:
# We use joint probabilities for the final dictionary & probabilities
final_dict = {}

# For Logistic Regression
for (key, val) in lr_dict.items():
    if key not in final_dict:
        final_dict[key] = [lr_dict[key], 1]
    else:
        prob, count = final_dict[key]
        final_dict[key] = [lr_dict[key] + prob, 1 + count]
        
# For Random Forest
for (key, val) in rf_dict.items():
    if key not in final_dict:
        final_dict[key] = [rf_dict[key], 1]
    else:
        prob, count = final_dict[key]
        final_dict[key] = [rf_dict[key] + prob, 1 + count]

# For KNN Classifier
for (key, val) in knn_dict.items():
    if key not in final_dict:
        final_dict[key] = [knn_dict[key], 1]
    else:
        prob, count = final_dict[key]
        final_dict[key] = [knn_dict[key] + prob, 1 + count]
        
# For Multinomial Naive Bayes
for (key, val) in mnb_dict.items():
    if key not in final_dict:
        final_dict[key] = [mnb_dict[key], 1]
    else:
        prob, count = final_dict[key]
        final_dict[key] = [mnb_dict[key] + prob, 1 + count]


In [15]:
# Obtain probability over max.count possible
processed_dict = {}
max_prob = 0
for (key, val) in final_dict.items():
    processed_dict[key] = round(final_dict[key][0] / 4, 2)
    if processed_dict[key] > max_prob:
        max_prob = processed_dict[key]
    #print(key, "...", processed_dict[key], "...", final_dict[key][1])

# Obtain likeliness range
prob_100 = round(max_prob, 2)
prob_50 = round(prob_100 / 2, 2)
prob_25 = round(prob_50 / 2, 2)
prob_75 = round(prob_50 + prob_25)

# Visualize the probability ranges
print("100% ", prob_100, "\t75% ", prob_75, "\t50% ", prob_50, "\t25% ", prob_25, "\n")

# Sort dictionary by probabilities & leave off the less possible ones
final_dict = dict(sorted(processed_dict.items(), key=lambda item: item[1], reverse=True)[:15])
#PrintDictionary(final_dict)

# Set count values by range
for key in final_dict.keys():
    prob, count = final_dict[key], 0
    if prob <= prob_100 and prob > prob_75:
        count = 4
    elif prob <= prob_75 and prob > prob_50:
        count = 3
    elif prob <= prob_50 and prob > prob_25:
        count = 2
    else:
        count = 1
    final_dict[key] = "count" + str(count)
    print(key, ":\t", final_dict[key])

# Pass the final_dict to the UI --> We call this in views.py
#return render(request, "index.html", {"final_dict": final_dict, 'disable': True, 'show': False, 'back': True})


100%  43.49 	75%  33 	50%  21.74 	25%  10.87 

Coronavirus disease 2019 (covid-19) :	 count4
Lyme disease :	 count4
Schizophrenia :	 count4
Rabies :	 count3
Anthrax :	 count3
Celiacs disease :	 count3
Porphyria :	 count3
Aseptic meningitis :	 count3
Brain tumour :	 count2
Leukemia :	 count2
Mucormycosis :	 count2
Crimean congo haemorrhagic fever (cchf) :	 count2
Legionellosis :	 count2
Malaria :	 count2
Black death :	 count2
