# Disease Prediction

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

## Data gathering

In [7]:
# Read the training and testing tables from CSV into DataFrames.
# The two reads are independent of each other.
train_data = pd.read_csv('/content/sample_data/Training.csv')
test_data = pd.read_csv('/content/sample_data/Testing.csv')

## Data cleaning

*   Handling missing values
*   Scaling and Normalisation
*   Inconsistent data




In [8]:
# Names of every training column that contains at least one null value,
# in their original column order.
has_null = train_data.isnull().any()
cols_with_missing = train_data.columns[has_null].tolist()
cols_with_missing

['Unnamed: 133']

In [9]:
# Inspect only the columns that were flagged as containing nulls.
train_data.loc[:, cols_with_missing]

Unnamed: 0,Unnamed: 133
0,
1,
2,
3,
4,
...,...
4915,
4916,
4917,
4918,


In [10]:
# Remove the columns flagged as containing missing values, printing the
# column index before and after so the effect of the drop is visible.
print(train_data.columns)
# errors='ignore' keeps this cell safe to re-run: train_data is reassigned
# here, so on a second execution the flagged columns are already gone and a
# plain drop would raise KeyError.
train_data = train_data.drop(columns=cols_with_missing, errors='ignore')
print(train_data.columns)

Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'scurring', 'skin_peeling', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails', 'blister',
       'red_sore_around_nose', 'yellow_crust_ooze', 'prognosis',
       'Unnamed: 133'],
      dtype='object', length=134)
Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'blackheads', 'scurring', 'skin_peeling', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails', 'blister',
       'red_sore_around_nose', 'yellow_crust_ooze', 'prognosis'],
      dtype='object', length=133)


In [11]:
# Count of missing values per column
missing_values_count = train_data.isnull().sum()

# Total number of cells = rows * columns.
# np.prod is used because the np.product alias was deprecated in NumPy 1.25
# and removed in NumPy 2.0.
total_cells = np.prod(train_data.shape)
total_missing = missing_values_count.sum()

# Percent of all cells in the training data that are missing
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

0.0


## Model building

In [12]:
# Layout used for both tables: every column except the last is a model
# input, and the last column is the value to predict.

# Training table -> (inputs, target)
train_features, train_output = train_data.columns[:-1], train_data.columns[-1]
train_x, train_y = train_data[train_features], train_data[train_output]

# Testing table -> (inputs, target), split by the same positional rule
test_features, test_output = test_data.columns[:-1], test_data.columns[-1]
test_x, test_y = test_data[test_features], test_data[test_output]

In [13]:
# Decision-tree classifier used as the disease-prediction model.
# random_state is pinned to 0 so the estimator's randomness is seeded with a
# fixed value across runs.
clf = DecisionTreeClassifier(random_state=0)

In [14]:
# Fit the classifier on the training inputs and their labels.
# The call is left as the cell's last expression so the fitted estimator's
# repr is displayed.
clf.fit(X=train_x, y=train_y)

DecisionTreeClassifier(random_state=0)

In [15]:
# Predict a label for every row of the test inputs with the fitted model.
val_predictions = clf.predict(X=test_x)

In [16]:
# Show predicted labels on the first line and the true labels on the second
# for a side-by-side visual comparison.
print(val_predictions.tolist(), test_y.tolist(), sep="\n")

['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis', 'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ', 'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine', 'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice', 'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia', 'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins', 'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia', 'Osteoarthristis', 'Arthritis', '(vertigo) Paroymsal  Positional Vertigo', 'Acne', 'Urinary tract infection', 'Psoriasis', 'Impetigo', 'Psoriasis']
['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis', 'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ', 'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine', 'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice', 'Malaria', 'Chicken pox'

In [17]:
# calculating accuracy
# Uses scikit-learn's accuracy_score (imported in the first cell) instead of a
# hand-rolled counting loop. Besides being shorter, it raises a clear error
# when the two label sequences differ in length rather than failing with an
# IndexError or silently ignoring extra predictions.
expected = test_y.tolist()
actual = val_predictions.tolist()
length = len(expected)
# normalize=False returns the number of exactly matching labels
correct = int(accuracy_score(expected, actual, normalize=False))
# fraction of test rows whose predicted label equals the true label
accuracy = correct/length
print(accuracy)

0.9761904761904762
