In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, plot_confusion_matrix, f1_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Inspect Data
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 10000)
df = pd.read_csv('../data/dataset.csv')

In [3]:
#Model does not accept nan values, so we need to change them
df.fillna('No Symptom', inplace=True)
df = df.applymap(lambda row: row.strip())

In [4]:
#Create unique symptoms list in order to encode them
symptoms_list = []
for i in range(1, 18):
    symptoms = df.iloc[:, i].unique()
    for symptom in symptoms:
        symptoms_list.append(symptom)

In [5]:
#Encoding label
label_encoder = LabelEncoder().fit(np.array(symptoms_list))
for i in range(1, 18):
    df.iloc[:, i] = label_encoder.transform(df.iloc[:, i])

In [6]:
#Create feature set(x) and target(y)
x = df.iloc[:, 1:]
y = df['Disease']

#Train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
#Random Forest for classification
disease_forest = RandomForestClassifier().fit(x_train, y_train)
scores = cross_validate(disease_forest, x, y, cv=10, scoring=['precision_macro', 'recall_macro', 'f1_macro'])
print(scores)

{'fit_time': array([0.33173728, 0.32425404, 0.32218814, 0.31573534, 0.34482551,
       0.32005405, 0.31689191, 0.32276201, 0.35834408, 0.31894159]), 'score_time': array([0.03492212, 0.03383517, 0.03340793, 0.03331709, 0.03476954,
       0.03193069, 0.03194952, 0.0364306 , 0.03263187, 0.03188586]), 'test_precision_macro': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 'test_recall_macro': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 'test_f1_macro': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}


# Average f1 score, precision and recall are equall to 100%

# Using our model to predict illness based on chosen symptoms

In [8]:
#User input prediction
user_input = ['cough', 'headache', 'sweating']
while len(user_input) < 17:
    user_input.append('No Symptom')

#Encode user input and predict top 5 possible diagnoses
user_input = label_encoder.transform(np.array(user_input)).reshape(-1, 17)
diagnose = disease_forest.predict_proba(user_input)
index_of_maximum_proba = diagnose.argsort()[0][-5:][::-1]
predicted_classes = disease_forest.classes_[index_of_maximum_proba]

#Get description df
descriptions_df = pd.read_csv('../data/symptom_Description.csv')

#Get precautions df
precautions_df = pd.read_csv('../data/symptom_precaution.csv')

#Precaution and
precaution_description_df = pd.merge(descriptions_df, precautions_df, how='outer').set_index('Disease').loc[predicted_classes]
precaution_description_df['Probability'] = pd.Series(data = diagnose[0][index_of_maximum_proba], index = predicted_classes).apply(lambda probability: str(round(probability*100, 1)) + '%')

precaution_description_dict = precaution_description_df.T.to_dict()
for disease, descriptions_precaution_probability in precaution_description_dict.items():
    print(f'The disease: {disease}\n')
    for k, v in descriptions_precaution_probability.items():
        print(f'\t{k}: {v}')

The disease: Allergy

	Description: An allergy is an immune system response to a foreign substance that's not typically harmful to your body.They can include certain foods, pollen, or pet dander. Your immune system's job is to keep you healthy by fighting harmful pathogens.
	Precaution_1: apply calamine
	Precaution_2: cover area with bandage
	Precaution_3: nan
	Precaution_4: use ice to compress itching
	Probability: 54.0%
The disease: Bronchial Asthma

	Description: Bronchial asthma is a medical condition which causes the airway path of the lungs to swell and narrow. Due to this swelling, the air path produces excess mucus making it hard to breathe, which results in coughing, short breath, and wheezing. The disease is chronic and interferes with daily working.
	Precaution_1: switch to loose cloothing
	Precaution_2: take deep breaths
	Precaution_3: get away from trigger
	Precaution_4: seek help
	Probability: 8.0%
The disease: Dimorphic hemmorhoids(piles)

	Description: nan
	Precaution_1