# Random Forest for Disease Diagnosis from Symptoms

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Load the disease/symptom table; the path is relative to the notebook's working directory.
data = pd.read_csv('DataSetSymptoms.csv')

In [3]:
# Preview the first rows to check that the file loaded as expected.
data.head()

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdomilnal pain,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,...,vomiting,weakness,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,HIV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Pneumocystis carinii pneumonia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,accident cerebrovascular,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,acquired immuno-deficiency syndrome,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# List the column labels: 'Disease' first, followed by the symptom names.
data.columns

Index(['Disease', 'Heberden's node', 'Murphy's sign', 'Stahli's line',
       'abdomen acute', 'abdomilnal pain', 'abdominal bloating',
       'abdominal tenderness', 'abnormal sensation',
       'abnormally hard consistency',
       ...
       'vomiting', 'weakness', 'weepiness', 'weight gain', 'welt',
       'wheelchair bound', 'wheezing', 'withdraw', 'worry', 'yellow sputum'],
      dtype='object', length=411)

In [5]:
# Number of columns in the table (second entry of the frame's shape).
data.shape[1]

411

In [6]:
# Count the distinct diseases; if this equals the number of rows,
# no disease is listed more than once.
len(data['Disease'].unique())

150

In [7]:
# `data` already behaves as a DataFrame (see .head() / .columns above); this
# wraps it under the shorter name `df`, which the rest of the notebook uses.
df = pd.DataFrame(data)

In [8]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdomilnal pain,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,...,vomiting,weakness,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,HIV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Pneumocystis carinii pneumonia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,accident cerebrovascular,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,acquired immuno-deficiency syndrome,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Number of rows in the table (first entry of the frame's shape).
df.shape[0]

150

In [10]:
# All column labels of the table; the first one is the 'Disease' target.
cols = df.columns

In [11]:
# Keep every column except the target, leaving only the symptom columns.
# Derived from df.columns by name (not by slicing the previous value of
# `cols`), so re-running this cell cannot silently drop one more symptom each
# time, and the result does not depend on 'Disease' being the first column.
cols = df.columns.drop('Disease')

In [12]:
# Display the symptom column labels that remain once the target is excluded.
cols

Index(['Heberden's node', 'Murphy's sign', 'Stahli's line', 'abdomen acute',
       'abdomilnal pain', 'abdominal bloating', 'abdominal tenderness',
       'abnormal sensation', 'abnormally hard consistency', 'abortion',
       ...
       'vomiting', 'weakness', 'weepiness', 'weight gain', 'welt',
       'wheelchair bound', 'wheezing', 'withdraw', 'worry', 'yellow sputum'],
      dtype='object', length=410)

In [13]:
# Number of symptom columns.
cols.size

410

In [14]:
# Split the table into model inputs and the label to predict:
#   X -> every column listed in `cols` (the symptoms)
#   y -> the 'Disease' column
X = df.loc[:, cols]
y = df.loc[:, 'Disease']

In [15]:
# Fix the seed so the forest -- and therefore the prediction and the feature
# importances computed further down -- is identical on every Restart & Run All.
RANDOM_STATE = 42

# 100 trees, fitted on the whole table (no rows are held out).
clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
# fit() returns the estimator it was called on, so `clf_rf` and `clf` name the
# same fitted model.
clf_rf = clf.fit(X, y)

In [16]:
# One patient encoded as 0/1 flags, one flag per symptom column, in the same
# order as X.columns (presumably 1 marks a symptom as present, as in the
# training table).
patient_symptoms = [[
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
    0, 0, 0, 0, 0,
]]

# Wrap the flags in a DataFrame that carries the training column names. The
# constructor raises if the vector does not hold exactly one value per symptom
# column, and predict() receives the same feature names the forest was fitted
# with instead of an unlabeled list.
patient = pd.DataFrame(patient_symptoms, columns=X.columns)
prediction = clf_rf.predict(patient)

In [17]:
# predict() returns one label per input row and a single patient was passed in,
# so pull that one label out. Concatenating a str with the whole array printed
# a list repr and only works when the array has object dtype.
print(f'You may have {prediction[0]} Disease')

['You may have malaria Disease']


In [None]:
# Score the forest on the rows it was fitted on. `prediction` holds the label
# of one hand-built patient, so it cannot be compared with every label in `y`.
# NOTE: this is training accuracy -- the notebook holds out no rows, so the
# figure says nothing about how the model behaves on unseen patients.
train_predictions = clf_rf.predict(X)
print("Accuracy:", accuracy_score(y, train_predictions))

In [None]:
# Pair every symptom column with the importance the fitted forest assigned to
# it, most important first. The fitted model is `clf_rf`; no variable called
# `classifier` is defined in this notebook.
feature_importances_df = pd.DataFrame(
    {"Symptoms": list(X.columns), "importance": clf_rf.feature_importances_}
).sort_values("importance", ascending=False)

In [None]:
# Display the symptom/importance table, ordered by importance (highest first).
feature_importances_df

In [None]:
import seaborn as sns

# Draw only the highest-ranked symptoms: one bar per symptom column would
# leave the rotated tick labels unreadable.
TOP_N_SYMPTOMS = 20
top_symptoms = feature_importances_df.sort_values(
    "importance", ascending=False
).head(TOP_N_SYMPTOMS)

# Plot and label in a single cell: the inline backend renders and closes the
# figure when a cell finishes, so labels set in a later cell would end up on a
# new, empty figure.
ax = sns.barplot(x=top_symptoms.Symptoms, y=top_symptoms.importance)
# Symptoms run along x and their scores along y, so the axis labels match.
ax.set_xlabel("Symptoms")
ax.set_ylabel("Symptom Importance Score")
ax.set_title("Visualizing Important Symptoms")
plt.xticks(
    rotation=45, horizontalalignment="right", fontweight="light", fontsize="x-large"
)
plt.show()

In [None]:
# No train/test split is created anywhere in this notebook (the forest is
# fitted on all of X and y), so X_train / y_train do not exist. Report the
# size of the data the model was actually trained on.
len(X), len(y)