## This module finds top 10 possible diseases based on each ML model for given symptoms

In [1]:
# Filter & ignore warnings for clear output visualization

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import all necessary packages

import os
import math
import joblib
import pandas as pd
from statistics import mean
from itertools import combinations

### Functions to find co-occuring symptoms with some threshold & process ML results to get list of diseases

In [3]:
# Function to generate top 10 diseases from the ML results
# Pass mean_score as another argument if you need probabilities too
def ProcessResultAndGenerateDiseases(top10_list, mean_score, cooccuring_symptoms, user_symptoms_len):
    
    global df_independent, all_symptoms, all_diseases
    top10_dict = {}

    # Checks for each disease, the matched symptoms & generates probability of having that disease
    for (idx, disease_id) in enumerate(top10_list):
        matched_symptoms = set()
        top10 = df_independent.loc[df_independent['Disease_Name'] == all_diseases[disease_id]].values.tolist()
        
        # Obtains the disease name which is at the top of the dataframe
        disease = top10[0].pop(0)

        # Each row contains 0s & 1s indicating whether a disease is associated with a particular symptom or not
        for (idx, value) in enumerate(top10[0]):
            if value != 0:
                matched_symptoms.add(all_symptoms[idx])
                
        #print("\n", matched_symptoms)
        probability = (len(matched_symptoms.intersection(set(cooccuring_symptoms))) + 1) / (user_symptoms_len + 1)
        top10_dict[disease] = round(probability * mean_score * 100, 2)
    
    top10_sorted_dict = dict(sorted(top10_dict.items(), key=lambda kv: kv[1], reverse=True))
    return top10_sorted_dict  
        

In [4]:
# Function to display the results from the dictionary
def PrintDictionary(top10_sorted_dict):
    for (key, value) in top10_sorted_dict.items():
        print(key, "\t", value, "%")


In [5]:
# Function to generate subsets
def GetPossibleSubsets(user_symptoms):
    
    global all_symptoms
    processed_symptoms = []
    user_symptoms_len = len(user_symptoms)
    minSubsetLength = math.floor(user_symptoms_len * 0.8)
    
    # Form possible subsets with minSubsetLength
    for combination in range(minSubsetLength, user_symptoms_len + 1):
        for subset in combinations(user_symptoms, combination):
            temp_processed_symptoms = [0 for x in range(0, len(all_symptoms))]
            for symptom in subset:
                temp_processed_symptoms[all_symptoms.index(symptom)] = 1
            processed_symptoms.append(temp_processed_symptoms)
    
    return processed_symptoms
    

In [6]:
# Function to get predictions for possible subsets from given symptoms
def GetTop10BySubsets(model, mean_score, user_symptoms, processed_symptoms):
    
    model_dict_res, res_dict = {}, {}
    user_symptoms_len = len(user_symptoms)
    subsets = 0
    
    for proc_sym in processed_symptoms:
        subsets += 1
        model_result = model.predict_proba([proc_sym])
        model_top10 = model_result[0].argsort()[-10:][::-1]
        model_dict = ProcessResultAndGenerateDiseases(model_top10, mean_score, user_symptoms, user_symptoms_len)

        for (key, value) in model_dict.items():
            if key not in model_dict_res.keys():
                model_dict_res[key] = [value, 1]
            else:
                model_dict_res[key] = [model_dict_res[key][0] + value, model_dict_res[key][1] + 1]
        #print(model_dict_res, "\n")
    
    print("Total no. of subsets considered: ", subsets)
    for (key, value) in model_dict_res.items():
        res_dict[key] = round(value[0] / value[1], 2)
        
    res_dict = dict(sorted(res_dict.items(), key=lambda item: item[1], reverse=True)[:10])
    return res_dict


In [7]:
# Function to find co-occuring symptoms with all the symptoms user chosen
# We use a threshold to check for a 90% match with the given symptoms

def FindCooccuringSymptomsWithThreshold(user_symptoms):
    
    global df_independent, all_symptoms
    threshold = math.floor(len(user_symptoms) * 0.90)

    # Get all unique possible diseases with the given symptoms
    unique_diseases = set()
    for symptom in user_symptoms:
        possible_diseases_for_symptom = list(df_independent[df_independent[symptom] == 1]['Disease_Name'])
        for disease in possible_diseases_for_symptom:
            unique_diseases.add(disease)
        
    # Get all unique diseases & sort them
    unique_diseases = sorted(list(unique_diseases))
    
    #print(unique_diseases)

    # Obtain co-occuring symptoms with 90% threshold
    # cooccuring_symptoms must have all given symptoms by default
    cooccuring_symptoms = set(user_symptoms)   
    for disease in unique_diseases:
        
        # First, obtain all symptoms associated with each disease in unique diseases obtained
        symptoms_of_disease = df_independent.loc[df_independent['Disease_Name'] == disease].values.tolist().pop(0)

        # Maintain a temporary set of symptoms of the disease & add them only when they meet threshold requirements
        temp_symptoms = set()
        count, add_symptoms = 0, False
        for idx in range(len(symptoms_of_disease)):
            
            # Symptoms of a disease will have 1 in their respective symptom columns
            if symptoms_of_disease[idx] == 1:
                temp_symptoms.add(all_symptoms[idx-1])
                count = count + 1

                # Our threshold is set to 90% of original symptoms
                if count > threshold:
                    add_symptoms = True

        # Adds temporary symptoms to cooccuring symptoms only if they meet threshold requirements
        if add_symptoms == True:
            for symp in temp_symptoms:
                cooccuring_symptoms.add(symp)

    cooccuring_symptoms = sorted(list(cooccuring_symptoms))
    return cooccuring_symptoms
    

### Prepares data to be compatible with the dataset to make predictions

In [8]:
# Load datasets for all possible combinations & for individual disease's respective symptoms
current_directory = os.getcwd()
data_path = current_directory + "/Datasets-CSV"

df_combination = pd.read_csv(data_path + "/Disease_Symptom_Dataset_For_All_Symptom_Subsets.csv") 
df_independent = pd.read_csv(data_path + "/Disease_Symptom_Dataset_For_Respective_Symptoms.csv") 

X_combination = df_combination.iloc[:, 1:]
Y_combination = df_combination.iloc[:, 0:1]

X_independent = df_independent.iloc[:, 1:]
Y_independent = df_independent.iloc[:, 0:1]

# List of all possible symptoms
all_symptoms = list(X_independent.columns)
all_diseases = list(set(Y_independent['Disease_Name']))
all_diseases.sort()

# We obtain top 10 possible diseases
no_of_diseases = 10


In [9]:
# Will be obtained from the front-end
#user_symptoms = ['back', 'shortness breath', 'stomach pain', 'chest pain', 'cold sweat', 'feeling faint', 'feeling tired']
user_symptoms = ['headache', 'light sensitivity', 'sound', 'nausea', 'irritation', 'muscle joint pain']
user_symptoms_len = len(set(user_symptoms))

# Get possible subsets with minimum 80% count
processed_symptoms = GetPossibleSubsets(user_symptoms)

# Obtains all possible cooccuring symptoms including given symptoms
cooccuring_symptoms = FindCooccuringSymptomsWithThreshold(user_symptoms)
processed_symptoms2 = [0 for x in range(0, len(all_symptoms))]
for symptom in cooccuring_symptoms:
    processed_symptoms2[all_symptoms.index(symptom)] = 1

processed_symptoms.append(processed_symptoms2)
#print(processed_symptoms)


### Uses 4 Machine Learning models to obtain possible predictions

In [10]:
# Get sav file path
sav_path = current_directory + "/Model-Weights/"


In [11]:
print("Processing with Logistic Regression...")
lr_cls = joblib.load(sav_path + "log_reg.sav")
lr_mean_score = joblib.load(sav_path + "log_reg_cv.sav")
lr_dict = GetTop10BySubsets(lr_cls, lr_mean_score, user_symptoms, processed_symptoms)
print("Done\n")
PrintDictionary(lr_dict)


Processing with Logistic Regression...
Total no. of subsets considered:  23
Done

Migraine 	 66.83 %
Zika virus disease 	 40.1 %
Dengue 	 40.1 %
Glaucoma 	 26.73 %
Lactose intolerance 	 26.73 %
Kidney stone disease 	 26.73 %
Iritis 	 26.73 %
Keratoconus 	 26.73 %
Hepatitis a 	 26.73 %
Tetanus 	 26.73 %


In [12]:
print("Processing with Random Forest Classifier...")
rf_cls = joblib.load(sav_path + "rand_forest.sav")
rf_mean_score = joblib.load(sav_path + "rand_forest_cv.sav")
rf_dict = GetTop10BySubsets(rf_cls, rf_mean_score, user_symptoms, processed_symptoms)
print("Done\n")
PrintDictionary(rf_dict)


Processing with Random Forest Classifier...
Total no. of subsets considered:  23
Done

Migraine 	 65.73 %
Zika virus disease 	 39.44 %
Dengue 	 39.44 %
Glaucoma 	 26.29 %
Exposure keratopathy 	 26.29 %
Factitious keratoconjunctivitis 	 26.29 %
Japanese encephalitis 	 26.29 %
Listeriosis 	 26.29 %
Iritis 	 26.29 %
Astigmatism 	 26.29 %


In [13]:
print("Processing with KNN Classifier...")
knn_cls = joblib.load(sav_path + "knn.sav")
knn_mean_score = joblib.load(sav_path + "knn_cv.sav")
knn_dict = GetTop10BySubsets(knn_cls, knn_mean_score, user_symptoms, processed_symptoms)
print("Done\n")
PrintDictionary(knn_dict)


Processing with KNN Classifier...
Total no. of subsets considered:  23
Done

Migraine 	 65.78 %
Zika virus disease 	 39.47 %
Dengue 	 39.47 %
Glaucoma 	 26.31 %
Exposure keratopathy 	 26.31 %
Factitious keratoconjunctivitis 	 26.31 %
Rocky mountain spotted fever 	 26.31 %
Mumps 	 26.31 %
Lyme disease 	 26.31 %
Leptospirosis 	 26.31 %


In [14]:
print("Processing with Multinomial Naive Bayes...")
mnb_cls = joblib.load(sav_path + "mnb.sav")
mnb_mean_score = joblib.load(sav_path + "mnb_cv.sav")
mnb_dict = GetTop10BySubsets(mnb_cls, mnb_mean_score, user_symptoms, processed_symptoms)
print("Done\n")
PrintDictionary(mnb_dict)


Processing with Multinomial Naive Bayes...
Total no. of subsets considered:  23
Done

Migraine 	 64.52 %
Zika virus disease 	 38.71 %
Dengue 	 38.71 %
Hepatitis a 	 25.81 %
Mucormycosis 	 25.81 %
Tetanus 	 25.81 %
Flu 	 25.81 %
Crimean congo haemorrhagic fever (cchf) 	 25.81 %
Ebola 	 25.81 %
Chickenpox 	 25.81 %


### Process the obtained results to be suitable for UI

In [15]:
# We use joint probabilities for the final dictionary & probabilities
final_dict = {}

# For Logistic Regression
for (key, val) in lr_dict.items():
    if key not in final_dict:
        final_dict[key] = [lr_dict[key], 1]
    else:
        prob, count = final_dict[key]
        final_dict[key] = [lr_dict[key] + prob, 1 + count]
        
# For Random Forest
for (key, val) in rf_dict.items():
    if key not in final_dict:
        final_dict[key] = [rf_dict[key], 1]
    else:
        prob, count = final_dict[key]
        final_dict[key] = [rf_dict[key] + prob, 1 + count]

# For KNN Classifier
for (key, val) in knn_dict.items():
    if key not in final_dict:
        final_dict[key] = [knn_dict[key], 1]
    else:
        prob, count = final_dict[key]
        final_dict[key] = [knn_dict[key] + prob, 1 + count]
        
# For Multinomial Naive Bayes
for (key, val) in mnb_dict.items():
    if key not in final_dict:
        final_dict[key] = [mnb_dict[key], 1]
    else:
        prob, count = final_dict[key]
        final_dict[key] = [mnb_dict[key] + prob, 1 + count]


In [16]:
# Obtain probability over max.count possible
processed_dict = {}
max_prob = 0
for (key, val) in final_dict.items():
    processed_dict[key] = round(final_dict[key][0] / 4, 2)
    if processed_dict[key] > max_prob:
        max_prob = processed_dict[key]
    #print(key, "...", processed_dict[key], "...", final_dict[key][1])

# Obtain likeliness range
prob_100 = round(max_prob, 2)
prob_50 = round(prob_100 / 2, 2)
prob_25 = round(prob_50 / 2, 2)
prob_75 = round(prob_50 + prob_25)

# Visualize the probability ranges
print("100% ", prob_100, "\t75% ", prob_75, "\t50% ", prob_50, "\t25% ", prob_25, "\n")

# Sort dictionary by probabilities & leave off the less possible ones
final_dict = dict(sorted(processed_dict.items(), key=lambda item: item[1], reverse=True)[:10])
#PrintDictionary(final_dict)

# Set count values by range
for key in final_dict.keys():
    prob, count = final_dict[key], 0
    if prob <= prob_100 and prob > prob_75:
        count = 4
    elif prob <= prob_75 and prob > prob_50:
        count = 3
    elif prob <= prob_50 and prob > prob_25:
        count = 2
    else:
        count = 1
    final_dict[key] = "count" + str(count)
    print(key, ":\t", final_dict[key])

# Pass the final_dict to the UI --> We call this in views.py
#return render(request, "index.html", {"final_dict": final_dict, 'disable': True, 'show': False, 'back': True})


100%  65.72 	75%  49 	50%  32.86 	25%  16.43 

Migraine :	 count4
Zika virus disease :	 count3
Dengue :	 count3
Glaucoma :	 count2
Iritis :	 count1
Exposure keratopathy :	 count1
Factitious keratoconjunctivitis :	 count1
Hepatitis a :	 count1
Tetanus :	 count1
Lactose intolerance :	 count1
