In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, fileNames in os.walk('/kaggle/input'):
    for fileName in fileNames:
        fullPath = os.path.join(folder, fileName)
        print(fullPath)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Step 1 (optional): Data Preparation
Load the dataset and symptoms to generate training data in the working directory. (Warning: this may take a few minutes). The resulting data frame contains a column with the prognosis and a column for each possible symptom. If a symptom belongs to a disease, the corresponding cell value is set to 1. The result is saved as "training.csv" in the working directory. To avoid having to run this method every time the notebook is started, I uploaded the resulting files to the input directory.


**The resulting dataset looks like this:**
![image.png](attachment:dbee55ef-fd7f-41de-a558-fd0f0362be17.png)

In [None]:
# Set to True to rebuild training.csv from the raw dataset instead of relying on
# the copy that was uploaded to the input directory.
REGENERATE_TRAINING_DATA = False

# Read the raw dataset from the same input directory as the other data files
# (a bare 'dataset.csv' would be resolved against the current working directory).
dataset = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/dataset.csv',delimiter=",")
severity = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/Symptom-severity.csv',delimiter=",",)


def buildTrainingData(diseaseTable):
    """Turn the raw disease/symptom table into a one-column-per-symptom frame.

    Every row of ``diseaseTable`` holds a "Disease" plus its symptoms spread
    over the remaining columns. The result has one column per distinct symptom
    value (set to "1" where the row lists that symptom, NaN otherwise) and a
    final "Prognosis" column holding the row's disease.
    """
    symptomCells = diseaseTable.drop(columns=["Disease"])
    # Collect one dict per row and build the frame once; growing a DataFrame
    # row by row is quadratic and DataFrame.append no longer exists in pandas 2.
    rows = [
        dict.fromkeys(values.dropna().unique(), "1")
        for _, values in symptomCells.iterrows()
    ]
    trainingData = pd.DataFrame(rows)
    trainingData["Prognosis"] = diseaseTable["Disease"].values
    return trainingData


if REGENERATE_TRAINING_DATA:
    buildTrainingData(dataset).to_csv('/kaggle/working/training.csv')

# Step 2: Generating the training and test data
Now we generate the training and test data for our model. First, the training data is split into a training and a test data set after replacing NaN with 0. By setting random_state (default=None) to an integer value, we get consistent output data. Later, we can use the test data to validate the model quality after we have performed the training.

Attention: The data frame must have a correct index column for the model to be trained properly.

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Load the prepared training table (first CSV column is the index) and replace NaN with 0.
training = pd.read_csv('/kaggle/input/training/training.csv', index_col=0, delimiter=",").fillna(0)

# Every column except the last one is used as a feature; the label is 'Prognosis'.
cols = training.columns[:-1]
x = training.loc[:, cols]
y = training.loc[:, 'Prognosis']

# Fixed random_state keeps the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.33)


# Step 3: Generate model 1 - DecisionTreeClassifier
Now we generate our first model: a decision tree classifier. To estimate the accuracy of our model, we use a utility function for cross-validation and output the mean accuracy and standard deviation over the test data. For training and validating the model, we use the data we generated in step 2. Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression (https://scikit-learn.org/stable/modules/tree.html).


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# A fixed seed makes the fitted tree reproducible, so everything that later
# inspects the tree structure gives the same answers on every run.
classifier = DecisionTreeClassifier(random_state=42)
classifierModel = classifier.fit(x_train.values, y_train.values)

# cross_val_score clones the estimator and refits it on folds of the data it is
# given, so these scores describe fresh trees trained on parts of the test set.
scores = cross_val_score(classifierModel, x_test, y_test, cv=3)
print("%0.2f accuracy for decision tree classifier with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Accuracy of the model that was actually fitted above, on data it has not seen.
print("%0.2f accuracy for the fitted decision tree on the held-out test data" % (classifierModel.score(x_test.values, y_test.values)))


# Optional: Method for plotting the tree
The following method plots the tree up to a depth of 2. Unfortunately, the graphics are still hard to read.

In [None]:
from sklearn import tree
features = x_train.columns
# Pass plain Python lists: NOTE(review) some scikit-learn releases validate
# feature_names as a list and reject a pandas Index - confirm against the
# installed version. The trailing ';' hides the long list of annotation reprs.
tree.plot_tree(classifierModel, feature_names=list(features), class_names=sorted(classifierModel.classes_), max_depth=2);


# Optional: Generate model 2: C-Support Vector Classification
Now we generate a second model using C-Support Vector Classification. Interestingly, the accuracy for this model is higher.
These models can also be used for regression and classification problems: https://scikit-learn.org/stable/modules/svm.html#svm-classification.

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier on the training split (fit returns the estimator itself).
svcModel = SVC().fit(x_train.values, y_train.values)

# Mean accuracy on the held-out test split.
svcAccuracy = svcModel.score(x_test.values, y_test.values)
print("%0.2f accuracy for svm" % svcAccuracy)


# Step 4: Implement the Chatbot
# Prepare global variables and data
Now we can finally implement the chatbot. Therefore, we load all necessary data into several data frames.
First, we initialize the symptoms severities, descriptions and precautions.
The weight of the symptoms corresponds with the effect of the symptom on the human body within 2 days.
To give the user more information about their symptoms and to provide some recommendations, we also load the corresponding data into two data frames and set the index to the disease column.

Last, the model tree, feature names and a helper dataframe are created.

In [None]:
# Internal tree structure of the fitted decision tree classifier.
modelTree = classifierModel.tree_

# One row per prognosis holding the column-wise maximum over its training rows.
groupedData = training.groupby(training['Prognosis']).max()

# Reference tables shipped with the dataset.
severities = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/Symptom-severity.csv',delimiter=",")
diseasesData = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/dataset.csv',delimiter=",")

# Descriptions and precautions are looked up by disease name, so index them by "Disease".
descriptions = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/symptom_Description.csv',delimiter=",").set_index("Disease")
precautions = pd.read_csv('/kaggle/input/disease-symptom-description-dataset/symptom_precaution.csv',delimiter=",").set_index("Disease")



# Retrieving additional symptoms and possible disease
The following takes the first symptom which has been entered by the user and walks the tree until the node with the corresponding name is found. The threshold value is used by the classification algorithm to determine whether a value belongs to class A or B. Since every feature is binary (0 or 1), the split threshold is always 0.5.
Once a node with the corresponding symptom is found, the disease associated with the node is returned together with the symptoms.

In [None]:
def walkTree(node, userInput):
    """Follow the decision tree for a single symptom and describe the leaf reached.

    Starting at ``node``, every split is answered with 1 if the split's feature
    name equals ``userInput`` (both compared after stripping whitespace) and
    with 0 otherwise, until a leaf is reached.

    Uses the notebook globals ``training``, ``modelTree``, ``classifierModel``
    and ``groupedData``.

    Returns a tuple ``(symptomSuggestions, possibleDisease)``:
    ``possibleDisease`` is the first class with a non-zero entry in the leaf's
    value row, and ``symptomSuggestions`` is an array with the names of all
    ``groupedData`` columns that are non-zero for that disease.
    """
    names = training.columns[:-1]
    wanted = userInput.strip()

    # A node is a leaf when both child pointers are equal (they hold the same
    # "no child" marker); this avoids importing the private sklearn.tree._tree
    # module. Looping instead of recursing also avoids rebuilding the list of
    # feature names on every level.
    while modelTree.children_left[node] != modelTree.children_right[node]:
        name = names[modelTree.feature[node]]
        val = 1 if name.strip() == wanted else 0
        if val <= modelTree.threshold[node]:
            node = modelTree.children_left[node]
        else:
            node = modelTree.children_right[node]

    # The classifier is fitted on the label strings directly, so its classes_
    # array maps the leaf's class index back to the disease name.
    classValues = modelTree.value[node][0]
    classIndex = classValues.nonzero()[0][0]
    possibleDisease = classifierModel.classes_[classIndex]

    symptomIndices = groupedData.loc[possibleDisease].values.nonzero()[0]
    symptomSuggestions = groupedData.columns.values[symptomIndices]
    return (symptomSuggestions, possibleDisease)
    

*This is a test method for walkTree*

In [None]:
# Smoke test: walk the tree from node 0 for the symptom "itching" and show
# the suggested symptoms together with the disease that walkTree returns.
(symptomSuggestions, possibleDisease) = walkTree(0,"itching")
print(symptomSuggestions, possibleDisease)

# Make prediction with symptom list
Now we finally get to use our machine learning model: based on the symptoms which have been inputted by the program user, we are now making a prediction with our classification model.
First, a dictionary is created which holds the name and the index of each symptom within the training data set. Then, an input vector is created, which is set to 0 except for the positions of the symptoms, which are set to 1.

In [None]:
def makePrediction(experiencedSymptoms):
    """Predict a disease from a list of symptom names with the decision tree.

    Builds a vector with one entry per feature column of the global
    ``training`` frame (all columns except the last), sets the entries of the
    given symptoms to 1 and passes it to ``classifierModel.predict``.

    A symptom may be given exactly as the column is named, or in a looser form
    with surrounding whitespace and spaces instead of underscores.

    Raises:
        KeyError: if a symptom does not correspond to any feature column.
    """
    # Only the column labels are needed; no need to copy the whole frame.
    symptomColumns = training.columns[:-1]
    symptomsDict = {}
    for index, symptom in enumerate(symptomColumns):
        symptomsDict[symptom] = index
        # Also register the trimmed name, without overriding an exact column name.
        symptomsDict.setdefault(symptom.strip(), index)

    inputVector = np.zeros(len(symptomColumns))
    for s in experiencedSymptoms:
        key = s if s in symptomsDict else s.strip().replace(" ", "_")
        if key not in symptomsDict:
            raise KeyError(f"Unknown symptom: {s!r}")
        inputVector[symptomsDict[key]] = 1
    return classifierModel.predict([inputVector])[0]

*This is a test method for makePrediction*

In [None]:
# Smoke test: predict a disease from the single symptom "itching".
makePrediction(["itching"])

# Get first user input
This method looks into the symptom data frame and returns all matching symptoms after the user has entered the first symptom. This method could also use an ML model, but in this simple example, a plain regular expression search is used.

In [None]:
import re
def getRelatedSymptoms(userInput):
    """Find all known symptoms whose name contains the user's text.

    Spaces in ``userInput`` are replaced by underscores before searching the
    "Symptom" column of the global ``severities`` table; matches are returned
    with underscores turned back into spaces.

    Returns a tuple ``(found, matches)`` where ``found`` is True when at least
    one symptom matched and ``matches`` is the list of matching names.
    Blank input yields ``(False, [])``.
    """
    query = userInput.strip().replace(' ', '_')
    if not query:
        # An empty pattern would match every symptom, which is not a useful answer.
        return (False, [])
    # The text comes straight from the user: escape it so characters such as
    # '(' or '+' are searched literally instead of raising re.error.
    regexp = re.compile(re.escape(query), re.IGNORECASE)
    predictionList = [item.replace('_', ' ') for item in severities['Symptom'] if regexp.search(item)]
    return (len(predictionList) > 0, predictionList)

*This is a test method for getRelatedSymptoms*

In [None]:
# Smoke test: list every known symptom whose name contains the letter "e".
getRelatedSymptoms("e")

# Implement the Chatbot - Greeting
Let's say hi to our program user!

# Get related symptoms based on first symptom
Get related symptoms based on the first user input. Based on the related symptom, the application evaluates whether a second option for a disease exists.

In [None]:
def getRelatedSymptomsFromUser():
    """Prompt for a symptom and return getRelatedSymptoms() for the typed text."""
    print("\nEnter the symptom you are experiencing \t\t", end="->")
    return getRelatedSymptoms(input(""))

# Get first symptom from user input
This is the main symptom that the user enters when they start interacting with the application.

In [None]:
def getSymptomsChoiceFromUser(relatedSymptoms):
    """Let the user pick one entry from a non-empty list of symptoms.

    The candidates are printed with their index. If there is only one
    candidate it is returned without asking; otherwise the user types an
    index. Anything that is not a number within the offered range falls back
    to the first candidate.

    Raises:
        ValueError: if ``relatedSymptoms`` is empty.
    """
    if len(relatedSymptoms) == 0:
        raise ValueError("relatedSymptoms must contain at least one symptom")

    print("searches related to input: ")
    for num, it in enumerate(relatedSymptoms):
        print(num,")",it)

    i = 0
    if num != 0:
        print(f"Select the one you meant (0 - {num}): ", end="")
        try:
            i = int(input(""))
        except (ValueError, EOFError):
            # Unreadable answer: keep the best-effort default of the first entry.
            i = 0
        # Numbers outside the offered range (including negatives, which would
        # silently index from the end) also fall back to the first entry.
        if not 0 <= i <= num:
            i = 0
    return relatedSymptoms[i]

# Get more symptoms from user
Get more symptoms which the user is experiencing based on the previous symptom.

In [None]:
def getAdditionalSymptomsFromUser(symptomSuggestions):
    """Ask the user about each suggested symptom and collect the confirmed ones.

    For every entry of ``symptomSuggestions`` the user is asked until the
    answer is "yes" or "no" (surrounding whitespace and letter case are
    ignored). Returns the list of symptoms answered with "yes", in the order
    they were asked.
    """
    result = []
    print("Are you experiencing any ")
    for s in symptomSuggestions:
        print(s,"? : ",end='')
        while True:
            inp = input("").strip().lower()
            if inp in ("yes", "no"):
                break
            print("provide proper answers i.e. (yes/no) : ",end="")
        # Record the answer once the question has been settled; doing this
        # inside the loop after the break would never run.
        if inp == "yes":
            result.append(s)
    return result

# Print description and recommendation
Provide information on the disease and the corresponding precautions.

In [None]:
def printDiseaseRecommendation(disease):
    """Print the description and the precautions stored for ``disease``.

    Looks the disease up in the global ``descriptions`` and ``precautions``
    tables (both indexed by disease name). Empty precaution cells are skipped,
    and a disease that is missing from a table produces a short notice instead
    of an exception.
    """
    print("You may have ", disease)

    if disease in descriptions.index:
        print(descriptions.loc[disease].values[0])
    else:
        # NOTE(review): disease names may be spelled differently across the
        # CSV files - confirm against the data.
        print("No description available for", disease)

    if disease not in precautions.index:
        print("No precautions available for", disease)
        return

    print("Take the following precautions: ")
    # Selecting with a list always yields a 2-D block, which is flattened so the
    # loop sees single cells whether or not the index is unique.
    for p in np.ravel(precautions.loc[[disease]].values):
        if pd.isna(p):
            # Missing cell: would otherwise be printed as "- nan".
            continue
        print("- " + str(p))
     

*This is a test method for printDiseaseRecommendation*

In [None]:
# Smoke test: print the description and precautions stored for "hepatitis A".
printDiseaseRecommendation("hepatitis A")

# The main method: run the Healthcare chat bot!!!!
This method starts the application and retrieves the first symptom from the user. Based on the first symptom, the application makes a guess for a disease using the decision tree and provides more symptoms related to the first symptom.
From the second list, the user picks the symptoms which he experiences additionally to the current symptom.
Then, a second guess for a disease is made and corresponding advice is printed for the user.

Have fun!!!

In [None]:
print("-----------------------------------HealthCare ChatBot-----------------------------------")
print("\nWhat's your name? \t\t\t\t",end="->")
# The question is already printed above, so read the answer without a second prompt.
userName = input("")
print("Hello,",userName,"! Nice to see you!")

# One conversation: ask for a symptom, narrow it down, ask follow-up questions
# and print the recommendation(s).
(result, relatedSymptoms) = getRelatedSymptomsFromUser()
if result:
    currentSymptom = getSymptomsChoiceFromUser(relatedSymptoms)
    symptomKey = currentSymptom.replace(" ","_")

    # First guess: walk the decision tree with the chosen symptom only.
    (symptomSuggestions, possibleDisease) = walkTree(0, symptomKey)

    # The user already reported the chosen symptom, so do not ask about it again.
    followUpSymptoms = [s for s in symptomSuggestions if s.strip() != symptomKey]
    additionalSymptoms = getAdditionalSymptomsFromUser(followUpSymptoms)

    # Second guess: the prediction has to include the symptom the user started
    # with, not only the follow-up answers. The name is mapped onto the exact
    # training column label that makePrediction expects.
    reportedSymptoms = [c for c in training.columns[:-1] if c.strip() == symptomKey]
    otherPossibleDisease = makePrediction(reportedSymptoms + list(additionalSymptoms))

    printDiseaseRecommendation(possibleDisease)
    if possibleDisease != otherPossibleDisease:
        print("You may also have: \n")
        printDiseaseRecommendation(otherPossibleDisease)
        print("\n")
else:
    print("Sorry, no symptom matching your input was found.")
print("Bye!")