# Random Forest for health disease prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('DataSetSymptoms.csv')

In [3]:
data.head()

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdomilnal pain,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,...,vomiting,weakness,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,HIV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Pneumocystis carinii pneumonia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,accident cerebrovascular,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,acquired immuno-deficiency syndrome,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data.columns

Index(['Disease', 'Heberden's node', 'Murphy's sign', 'Stahli's line',
       'abdomen acute', 'abdomilnal pain', 'abdominal bloating',
       'abdominal tenderness', 'abnormal sensation',
       'abnormally hard consistency',
       ...
       'vomiting', 'weakness', 'weepiness', 'weight gain', 'welt',
       'wheelchair bound', 'wheezing', 'withdraw', 'worry', 'yellow sputum'],
      dtype='object', length=411)

In [5]:
len(data.columns)

411

In [6]:
#Ensure we don't have duplicated symptoms
len(data['Disease'].unique())

150

In [7]:
#DataFrame
df = pd.DataFrame(data)

In [25]:
df.head(10)

Unnamed: 0,Disease,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdomilnal pain,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,...,vomiting,weakness,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,HIV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Pneumocystis carinii pneumonia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,accident cerebrovascular,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,acquired immuno-deficiency syndrome,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,adenocarcinoma,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,adhesion,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,affect labile,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,anemia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,anxiety state,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [9]:
len(df)

150

In [10]:
cols = df.columns

In [11]:
#Extracting the symptoms excluding the target disease
cols = cols[1:]

In [12]:
#Displaying the symptoms
cols

Index(['Heberden's node', 'Murphy's sign', 'Stahli's line', 'abdomen acute',
       'abdomilnal pain', 'abdominal bloating', 'abdominal tenderness',
       'abnormal sensation', 'abnormally hard consistency', 'abortion',
       ...
       'vomiting', 'weakness', 'weepiness', 'weight gain', 'welt',
       'wheelchair bound', 'wheezing', 'withdraw', 'worry', 'yellow sputum'],
      dtype='object', length=410)

In [13]:
len(cols)

410

In [14]:
#Divide attributes and target X = attributes, y = targets
X = df[cols]
y = df['Disease']

In [15]:
from sklearn.model_selection import train_test_split
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=66) # 70% training and 30% test

In [30]:
from sklearn import metrics
clf = RandomForestClassifier(n_estimators=100)
clf_rf = clf.fit(X, y)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, clf.predict(X_test)))

Accuracy: 0.9555555555555556


In [17]:
prediction = clf_rf.predict([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 
                              0, 0, 0, 0, 0]])

In [18]:
print('You may have '+ prediction+ ' Disease')

['You may have malaria Disease']


In [19]:
y_pred=clf.predict(X_test)

In [20]:
y_pred

array(['carcinoma of lung', 'hypercholesterolemia', 'encephalopathy',
       'candidiasis', 'failure heart', 'malaria', "Alzheimer's disease",
       'confusion', 'bacteremia', 'psychotic disorder', 'pancreatitis',
       'thrombocytopaenia', 'diverticulosis', 'suicide attempt',
       'thrombus', 'anemia', 'hypertensive disease', 'hernia',
       'neoplasm metastasis', 'myocardial infarction', 'HIV',
       'hernia hiatal', 'gastroenteritis',
       'primary carcinoma of the liver cells', 'gout', 'neutropenia',
       'carcinoma', 'delirium', 'systemic infection', 'typhoid fever',
       'cellulitis', 'adenocarcinoma', 'migraine disorders',
       'depressive disorder', 'incontinence', 'cholecystitis',
       'fibroid tumor', 'diverticulitis', 'hyperglycemia',
       'malignant neoplasms', 'carcinoma colon', 'embolism pulmonary',
       'mitral valve insufficiency', 'hypothyroidism', 'epilepsy'],
      dtype=object)

In [32]:
len(X_train), len(y_train)

(105, 105)