# Decision Tree

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the drug dataset from the CSV file in the working directory
# and preview its first rows.
DATA_PATH = 'drug200.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Split the frame into NumPy arrays: the feature matrix X and the target vector y.
feature_columns = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
X = df[feature_columns].values   # np.ndarray
# Alternative ways to select the target, kept for reference:
#   df[['Drug']]  -> pandas.core.frame.DataFrame
#   df['Drug']    -> pandas.core.series.Series
y = df['Drug'].values            # np.ndarray
type(X)

## Preprocessing

Scikit-learn's decision tree only accepts numeric features; it cannot handle categorical values stored as strings. <br/>
Thus we need to convert these string values to integers. <br/>
We can achieve this with <code>sklearn.preprocessing.LabelEncoder()</code>.

In [None]:
from sklearn import preprocessing

In [None]:
# Replace the string-valued columns of X (Sex, BP, Cholesterol) with integer codes.
#
# The labels are read from `df` rather than from X itself: `df[[...]]` produced a
# separate copy, so `df` still holds the original strings after X is overwritten.
# This keeps the cell idempotent -- re-running it no longer feeds already-encoded
# integers back into `transform`, which would raise on unseen labels.
#
# LabelEncoder assigns codes in sorted label order (e.g. 'F' -> 0, 'M' -> 1).
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['M', 'F'])
X[:, 1] = le_sex.transform(df['Sex'].values)

le_bp = preprocessing.LabelEncoder()
le_bp.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 2] = le_bp.transform(df['BP'].values)

le_cholesterol = preprocessing.LabelEncoder()
le_cholesterol.fit(['NORMAL', 'HIGH'])
X[:, 3] = le_cholesterol.transform(df['Cholesterol'].values)

#### Let's take a look at our polished ;) feature set

In [None]:
# Show the first five rows of the feature matrix.
X[:5]

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing.
# random_state pins the shuffle so the split -- and every score computed from it --
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Sanity check: every sample ended up in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(X)

## Building the model

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Features are randomly permuted at each split, so without a fixed random_state
# the fitted tree can differ between runs even on identical training data.
# All other hyperparameters are left at their defaults.
model = DecisionTreeClassifier(random_state=42)
model           # Display the estimator and its parameters

**Note:** We can also tweak the default parameters of the decision tree, such as changing the *criterion* parameter from **gini** (default) to **entropy**.

In [None]:
# Train the tree on the training partition; the fitted estimator is the cell output.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict a drug label for every sample in the held-out test partition.
y_hat = model.predict(X=X_test)

### Evaluating the model

In [None]:
from sklearn import metrics

In [None]:
# accuracy_score's signature is (y_true, y_pred): pass the ground truth first.
# Accuracy itself is symmetric, but keeping the documented order avoids silently
# wrong results if this line is reused with an asymmetric metric.
accuracy = metrics.accuracy_score(y_test, y_hat)
print("The accuracy of the decision tree is %.9f" % accuracy)

## Visualizing the decision tree

In [None]:
# TODO: visualize the fitted tree (e.g. with sklearn.tree.plot_tree or sklearn.tree.export_graphviz).