# Decision Tree Classifier

In [2]:
import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier


## About the dataset
Imagine that you are a medical researcher compiling data for a study. You have collected data about a set of patients, all of whom suffered from the same illness. During their course of treatment, each patient responded to one of 5 medications: Drug A, Drug B, Drug C, Drug X, and Drug Y.

Part of your job is to build a model to find out which drug might be appropriate for a future patient with the same illness. The features of this dataset are the Age, Sex, Blood Pressure (BP), Cholesterol, and Na_to_K values of the patients, and the target is the drug that each patient responded to.

This is an example of a multiclass classifier (there are five possible drugs): you can use the training part of the dataset to build a decision tree, and then use it to predict the class of an unknown patient, or to prescribe a drug to a new patient.

In [3]:
import os
import wget

DATA_URL = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/drug200.csv'
DATA_FILE = 'drug200.csv'

# Download only when the file is missing. wget.download never overwrites an
# existing file: it saves a renamed copy (e.g. "drug200 (1).csv"), so
# re-running this cell would hit the network again and leave stray duplicates
# while the notebook keeps reading the original 'drug200.csv'.
if not os.path.exists(DATA_FILE):
    wget.download(DATA_URL, out=DATA_FILE)

# Last expression of the cell: display the local file name.
DATA_FILE

'drug200.csv'

In [5]:
# Load the downloaded CSV from the working directory into a DataFrame.
df = pd.read_csv('drug200.csv') 

In [6]:
# Preview the first five rows to check the column names and value formats.
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [20]:
# Feature matrix as a NumPy array. The selected columns mix numbers and
# strings, so the resulting array has dtype=object.
X = df[
    ["Age", "Sex", "BP", "Cholesterol", "Na_to_K"]
].to_numpy()

In [21]:
# Peek at the first five feature rows; Sex, BP and Cholesterol are still strings here.
X[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.113999999999999],
       [28, 'F', 'NORMAL', 'HIGH', 7.797999999999999],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [22]:
from sklearn import preprocessing

In [23]:
# Encoder that will turn the Sex strings into integer codes.
le_sex = preprocessing.LabelEncoder()

In [24]:
# Fit on the two known labels; LabelEncoder sorts its classes, so F -> 0 and M -> 1.
le_sex.fit(['F','M'])

LabelEncoder()

In [25]:
# Encode Sex from the untouched DataFrame column instead of from X itself.
# This cell overwrites X[:, 1]; reading that same column back would make a
# second run fail with "previously unseen labels", because it would then hold
# the integer codes rather than 'F'/'M'. df["Sex"] has the same rows in the
# same order as X, so the first-run result is unchanged.
X[:,1] = le_sex.transform(df["Sex"])

In [26]:
# Encoder that will turn the blood-pressure strings into integer codes.
le_BP=preprocessing.LabelEncoder()

In [27]:
# LabelEncoder sorts its classes alphabetically, so the codes are
# HIGH -> 0, LOW -> 1, NORMAL -> 2 (not ordered by severity).
le_BP.fit(['LOW','NORMAL','HIGH'])
# Transform the original DataFrame column rather than X[:, 2]: the cell
# overwrites X[:, 2], so reading it back would raise on a re-run once the
# column already holds integer codes.
X[:,2] = le_BP.transform(df["BP"])

In [29]:
# Encode Cholesterol as integer codes; classes are sorted, so HIGH -> 0 and NORMAL -> 1.
le_Chol = preprocessing.LabelEncoder()
le_Chol.fit(['NORMAL','HIGH'])
# Transform the original DataFrame column rather than X[:, 3]: the cell
# overwrites X[:, 3], so reading it back would raise on a re-run once the
# column already holds integer codes.
X[:,3] = le_Chol.transform(df["Cholesterol"])

In [30]:
# Confirm that Sex, BP and Cholesterol now hold integer codes instead of strings.
X[:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [31]:
# Target vector: the drug each patient responded to.
y=df["Drug"]

In [33]:
# Peek at the first five target labels.
y[:5]

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

## Setting up the Decision Tree

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

In [39]:
# Sanity check: number of training rows and feature columns.
X_train.shape

(140, 5)

In [42]:
# Sanity check: number of held-out rows and feature columns.
X_test.shape

(60, 5)

In [43]:
# Sanity check: the training labels must have one entry per training row.
Y_train.shape

(140,)

In [45]:
# Sanity check: the test labels must have one entry per held-out row.
Y_test.shape

(60,)

## Modelling Decision tree

In [46]:
# Decision tree that splits on entropy, capped at depth 4, with a fixed seed
# so that repeated fits build the same tree.
drugtree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=0,
)

In [47]:
# Display the (still unfitted) estimator and its configured parameters.
drugtree

DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=0)

In [48]:
# Train the tree on the training split only; the test split stays unseen until evaluation.
drugtree.fit(X_train, Y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=0)

## Prediction

In [49]:
# Predict a drug label for every held-out patient.
yhat = drugtree.predict(X_test)

In [50]:
# First five predicted labels, to compare by eye with the true labels in Y_test.
yhat[0:5]

array(['drugY', 'drugY', 'drugY', 'drugY', 'drugC'], dtype=object)

In [51]:
# First five true labels of the held-out set; the index shows the original row numbers.
print(Y_test[:5])

11     drugY
99     drugY
128    drugY
175    drugY
1      drugC
Name: Drug, dtype: object


## Evaluation

In [56]:
from sklearn import metrics
import matplotlib.pyplot as plt  # not used in this cell

# Fraction of held-out patients whose predicted drug matches the recorded one.
accuracy = metrics.accuracy_score(Y_test, yhat)
print("The accuracy of the model is: ", accuracy)

The accuracy of the model is:  0.9666666666666667
