## This module finds top 10 possible diseases based on each ML model for given symptoms

In [1]:
# Filter & ignore warnings for clear output visualization

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import all necessary packages

import pandas as pd
from statistics import mean
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

### Functions to find co-occuring symptoms with some threshold & process ML results to get list of diseases

In [3]:
# Function to generate top 10 diseases from the ML results
# Pass mean_score as another argument if you need probabilities too
def ProcessResultAndGenerateDiseases(top10_list):
    
    global df_independent, all_symptoms, all_diseases, processed_symptoms
    top10_diseases = []
    #top10_dict = {}

    # Checks for each disease, the matched symptoms & generates probability of having that disease
    for (idx, disease_id) in enumerate(top10_list):
        matched_symptoms = set()
        top10 = df_independent.loc[df_independent['Disease_Name'] == all_diseases[disease_id]].values.tolist()
        
        # Obtains the disease name which is at the top of the dataframe
        disease = top10[0].pop(0)

        # Each row contains 0s & 1s indicating whether a disease is associated with a particular symptom or not
        for (idx, value) in enumerate(top10[0]):
            if value != 0:
                matched_symptoms.add(all_symptoms[idx])
                
        #probability = (len(matched_symptoms.intersection(set(processed_symptoms))) + 1) / (len(set(processed_symptoms)) + 1)
        #top10_dict[disease] = round(probability * mean_score * 100, 2)
        top10_diseases.append(disease)
    
    #top10_sorted = dict(sorted(top10_dict.items(), key=lambda kv: kv[1], reverse=True))
    return sorted(top10_diseases)    #top10_sorted
        

In [4]:
# Function to display the results from the dictionary
def PrintResults(top10_sorted_dict):
    for (key, value) in top10_sorted_dict.items():
        print(key, "\t", value, "%")


# Function to print list contents
def printList(list_data):
    for item in list_data:
        print(item)
        

In [5]:
# Function to find co-occuring symptoms with all the symptoms user chosen
# We use a threshold to check for a 80% match with the given symptoms

def FindCooccuringSymptomsWithThreshold(user_symptoms):
    
    global df_independent, all_symptoms
    threshold = len(user_symptoms) * 0.80

    # Get all unique possible diseases with the given symptoms
    unique_diseases = set()
    for symptom in user_symptoms:
        possible_diseases_for_symptom = list(df_independent[df_independent[symptom] == 1]['Disease_Name'])
        for disease in possible_diseases_for_symptom:
            unique_diseases.add(disease)
        
    # Get all unique diseases & sort them
    unique_diseases = sorted(list(unique_diseases))
    
    #print(unique_diseases)

    # Obtain co-occuring symptoms with 80% threshold
    # cooccuring_symptoms must have all given symptoms by default
    cooccuring_symptoms = set(user_symptoms)   
    for disease in unique_diseases:
        
        # First, obtain all symptoms associated with each disease in unique diseases obtained
        symptoms_of_disease = df_independent.loc[df_independent['Disease_Name'] == disease].values.tolist().pop(0)

        # Maintain a temporary set of symptoms of the disease & add them only when they meet threshold requirements
        temp_symptoms = set()
        count, add_symptoms = 0, False
        for idx in range(len(symptoms_of_disease)):
            
            # Symptoms of a disease will have 1 in their respective symptom columns
            if symptoms_of_disease[idx] == 1:
                temp_symptoms.add(all_symptoms[idx])
                count = count + 1

                # Our threshold is set to 80% of original symptoms
                if count > threshold:
                    add_symptoms = True

        # Adds temporary symptoms to cooccuring symptoms only if they meet threshold requirements
        if add_symptoms == True:
            for symp in temp_symptoms:
                cooccuring_symptoms.add(symp)

    cooccuring_symptoms = sorted(list(cooccuring_symptoms))
    return cooccuring_symptoms
    

### Prepares data to be compatible with the dataset to make predictions

In [6]:
# Load datasets for all possible combinations & for individual disease's respective symptoms
df_combination = pd.read_csv("./Disease_Symptom_Dataset_For_All_Symptom_Subsets.csv") 
df_independent = pd.read_csv("./Disease_Symptom_Dataset_For_Respective_Symptoms.csv") 

X_combination = df_combination.iloc[:, 1:]
Y_combination = df_combination.iloc[:, 0:1]

X_independent = df_independent.iloc[:, 1:]
Y_independent = df_independent.iloc[:, 0:1]

# List of all possible symptoms
all_symptoms = list(X_independent.columns)
all_diseases = list(set(Y_independent['Disease_Name']))
all_diseases.sort()

# We obtain top 10 possible diseases
no_of_diseases = 10


In [7]:
# Will be obtained from the front-end
user_symptoms = ["unexplained weight loss", "headache", "prolonged cough", "fever", "confusion", "tiredness", "weakness",
                 "feel need check thing repeatedly", "certain thought repeatedly", "perform certain routine repeatedly"]

# Obtains all possible cooccuring symptoms including given symptoms
cooccuring_symptoms = FindCooccuringSymptomsWithThreshold(user_symptoms)
print(cooccuring_symptoms)

# Process obtained symptoms to create rows compatible with the dataset
processed_symptoms = [0 for x in range(0, len(all_symptoms))]
for symptom in cooccuring_symptoms:
    processed_symptoms[all_symptoms.index(symptom)] = 1
    

['absent near absent urine output', 'anxiety', 'better sitting worse lying', 'blood stool', 'blood urine', 'bruising', 'certain thought repeatedly', 'change skin color red black', 'chest tightness', 'confusion', 'decreased ability turn', 'dermatitis herpetiformis', 'diarrhea mixed blood', 'diarrhoea may bloody', 'dry eye', 'feel need check thing repeatedly', 'fever', 'fever start low increase daily', 'firm', 'headache', 'hearing impairment loss', 'heat intolerance', 'irritation', 'limited critical thinking', 'loss bladder bowel control', 'malabsorption', 'muscle cramp', 'nausea vomiting', 'nausea vomiting weight loss dehydration occur', 'perform certain routine repeatedly', 'pinkish', 'post nasal drip', 'prolonged cough', 'sign ageing', 'small face', 'stroke', 'swell pain near tumor', 'symptom dehydration', 'thick skin crack', 'tiredness', 'tremor', 'unexplained weight loss', 'unpleasant smell present breath', 'usage resulting problem', 'weakness', 'weakness limb', 'wet', 'yellowish co

### Uses 4 Machine Learning models to obtain possible predictions (Takes 2 mins)

In [8]:
# Create Logistic Regression Classifier & fit the data to it
print("Processing with Logistic Regression...")

lr_classifier = LogisticRegression()
lr_classifier = lr_classifier.fit(X_combination, Y_combination)

# Obtain cross-validation scores & make predictions
#lr_scores = cross_val_score(lr_classifier, X_combination, Y_combination, cv=5)
#lr_mean_score = mean(lr_scores)
lr_result = lr_classifier.predict_proba([processed_symptoms])

print("Done")


Processing with Logistic Regression...
Done


In [9]:
# Create Random Forest Classifier & fit the data to it
print("Processing with Random Forest Classifier...")

rf_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy')
rf_classifier = rf_classifier.fit(X_combination, Y_combination)

# Obtain cross-validation scores & make predictions
#rf_scores = cross_val_score(rf_classifier, X_combination, Y_combination, cv=5)
#rf_mean_score = mean(rf_scores)
rf_result = rf_classifier.predict_proba([processed_symptoms])

print("Done")


Processing with Random Forest Classifier...
Done


In [10]:
# Create KNN Classifier & fit the data to it
print("Processing with KNN Classifier...")

knn_classifier = KNeighborsClassifier(n_neighbors=7, weights='distance', n_jobs=4)
knn_classifier = knn_classifier.fit(X_combination, Y_combination)

# Obtain cross-validation scores & make predictions
#knn_scores = cross_val_score(knn_classifier, X_combination, Y_combination, cv=5)
#knn_mean_score = mean(knn_scores)
knn_result = knn_classifier.predict_proba([processed_symptoms])
    
print("Done")


Processing with KNN Classifier...
Done


In [11]:
# Create Multinomial Naive Bayes Classifier & fit the data to it
print("Processing with Multinomial Naive Bayes...")

mnb_classifier = MultinomialNB()
mnb_classifier = mnb_classifier.fit(X_combination, Y_combination)

# Obtain cross-validation scores & make predictions
#mnb_scores = cross_val_score(mnb_classifier, X_combination, Y_combination, cv=5)
#mnb_mean_score = mean(mnb_scores)
mnb_result = mnb_classifier.predict_proba([processed_symptoms])

print("Done")


Processing with Multinomial Naive Bayes...
Done


### Generate top 10 diseases associated with the symptoms from ML models' prediction results

In [12]:
# Logistic Regression result
print("---------- LOGISTIC REGRESSION: ----------\n")

lr_top10 = lr_result[0].argsort()[-no_of_diseases:][::-1]
lr_list = ProcessResultAndGenerateDiseases(lr_top10)
printList(lr_list)

# Random Forest result
print("\n---------- RANDOM FOREST: ----------\n")

rf_top10 = rf_result[0].argsort()[-no_of_diseases:][::-1]
rf_list = ProcessResultAndGenerateDiseases(rf_top10)
printList(rf_list)

# Knn Result
print("\n---------- KNN CLASSIFIER: ----------\n")

knn_top10 = knn_result[0].argsort()[-no_of_diseases:][::-1]
knn_list = ProcessResultAndGenerateDiseases(knn_top10)
printList(knn_list)

# Multinomial Naive bayes Result
print("\n---------- MULTINOMIAL NAIVE BAYES: ----------\n")

mnb_top10 = mnb_result[0].argsort()[-no_of_diseases:][::-1]
mnb_list = ProcessResultAndGenerateDiseases(mnb_top10)
printList(mnb_list)

# lr_list, rf_list, knn_list and mnb_list must be stored in DB & processed to be useful in the UI


---------- LOGISTIC REGRESSION: ----------

Acute encephalitis syndrome
Anaemia during pregnancy (maternal anemia)
Carbon monoxide poisoning
Celiacs disease
Chickenpox
Colitis
Inflammatory bowel disease
Japanese encephalitis
Obsessive compulsive disorder
Sepsis

---------- RANDOM FOREST: ----------

Acute encephalitis syndrome
Anaemia
Colorectal cancer
Erectile dysfunctions
Exposure keratopathy
Factitious keratoconjunctivitis
Plague
Sepsis
Stomach ulcers
Typhoid / enteric fever

---------- KNN CLASSIFIER: ----------

Acute encephalitis syndrome
Carbon monoxide poisoning
Crimean congo haemorrhagic fever (cchf)
Erectile dysfunctions
Exposure keratopathy
Factitious keratoconjunctivitis
Fibroids
Fibromyalgia
Filamentary keratitis
Japanese encephalitis

---------- MULTINOMIAL NAIVE BAYES: ----------

Acute encephalitis syndrome
Black death
Colitis
Inflammatory bowel disease
Japanese encephalitis
Lyme disease
Malaria
Obsessive compulsive disorder
Paratyphoid fever
Plague
