## Decision Tree using SKLearn

### Graphviz and dtree library can be used to produce beautiful trees

In [122]:
import pandas as pd
import numpy as np

# We could have gotten the data set from this line as well
from sklearn.datasets import load_iris

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.model_selection import train_test_split, KFold, cross_val_score



In [123]:
iris = load_iris()
df = pd.DataFrame(iris.data, columns = iris.feature_names)
df['target'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [164]:
print(load_iris['class'])

TypeError: 'function' object is not subscriptable

In [124]:
df['target'].value_counts()

0    50
1    50
2    50
Name: target, dtype: int64

In [125]:
df.shape

(150, 5)

In [126]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [127]:
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [128]:
X = df[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)']]
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [129]:
y= df['target']
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int32

In [130]:
## Train test model
## Technically we shoudl split the data here by train set and test set but for convenience we are not doing that here
## we are training the data on the entire train set
## Create an instance of the tree

clf =DecisionTreeClassifier(max_depth=5, random_state = 0)
## Train the model on entire dataset
clf.fit(X,y)

## Predictions
pred = clf.predict(X)

### Measure model performance

In [131]:
accur_score = clf.score(X,y)
print(accur_score)

1.0


#### Accuracy above is 100% that is because we did not split the data to training set and test set. hence, let us do the trian test split now

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 0)

In [133]:
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
137,6.4,3.1,5.5,1.8
84,5.4,3.0,4.5,1.5
27,5.2,3.5,1.5,0.2
127,6.1,3.0,4.9,1.8
132,6.4,2.8,5.6,2.2


In [134]:
X_train.shape

(120, 4)

In [135]:
X_test.shape

(30, 4)

#### Now we again build model and make predictions

In [136]:
clf = DecisionTreeClassifier(max_depth=3, random_state=0)
clf.fit(X_train,y_train)
pred = clf.predict(X_test)

In [137]:
accur_score = clf.score(X_test,y_test)
print(accur_score)

0.9666666666666667


#### K Folds Cross validation method

##### This method will create a cross validation set by splitting train and test data into multiple folds. E.g Fold 5 is test set and fold 1-4 are training set, similarly in iteration 2, fold 4 is test set and fold 1,2,3,5 are combined to form training set

In [138]:
kf = KFold(n_splits=5, shuffle=False)

In [139]:
accuracy = []

n=0
print("-------------Cross Validation for Each fold---------------")
for train_index, test_index in kf.split(X,y):
    clf = DecisionTreeClassifier().fit(X.iloc[train_index], y.iloc[train_index])
    score = clf.score(X.iloc[test_index],y.iloc[test_index])
    accuracy.append(score)
    print('Model: ', n+1)
    print('Accuracy : ',accuracy[n])
    n=n+1



-------------Cross Validation for Each fold---------------
Model:  1
Accuracy :  1.0
Model:  2
Accuracy :  0.9666666666666667
Model:  3
Accuracy :  0.8666666666666667
Model:  4
Accuracy :  0.9333333333333333
Model:  5
Accuracy :  0.7333333333333333


In [140]:
## Mean of accuracy of all folds

print(np.mean(accuracy))

0.9


##### To do the same process as shown above for kfold in one line

In [141]:
cross_val_score(clf,X,y, cv=5, scoring= 'accuracy').mean()

0.9600000000000002

#### Decision Trees are best as they provide us with the Feature importance

In [148]:
importance = pd.DataFrame({'feature': X_train.columns,'importance': np.round(clf.feature_importances_)})
importance = importance.sort_values('importance', ascending=False).set_index('feature')
importance

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
petal length (cm),1.0
sepal length (cm),0.0
sepal width (cm),0.0
petal width (cm),0.0


In [152]:
!pip install six



In [154]:
pip install pydotplus

Collecting pydotplus
  Downloading pydotplus-2.0.2.tar.gz (278 kB)
                                              0.0/278.7 kB ? eta -:--:--
     -------------------------------------  276.5/278.7 kB 5.7 MB/s eta 0:00:01
     -------------------------------------- 278.7/278.7 kB 4.3 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pydotplus
  Building wheel for pydotplus (setup.py): started
  Building wheel for pydotplus (setup.py): finished with status 'done'
  Created wheel for pydotplus: filename=pydotplus-2.0.2-py3-none-any.whl size=24578 sha256=c736e63196128305fa69fe8ae611932af55f3dd6262de22fb6f3fe69d7ccf4de
  Stored in directory: c:\users\sahil patel\appdata\local\pip\cache\wheels\bd\ce\e8\ff9d9c699514922f57caa22fbd55b0a32761114b4c4acc9e03
Successfully built pydotplus
Installing collected packages: pydotplus
Successfully installed pydotplus-2.0.2
Note: you may need to rest

In [155]:
from IPython.display import Image  
from six import StringIO
from sklearn.tree import export_graphviz
import pydotplus, graphviz

In [163]:
dot_data = StringIO()  

export_graphviz(clf, out_file=dot_data, filled=True, rounded=True,
                feature_names=X.columns, 
                class_names=['0',"1"])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

IndexError: list index out of range