In [6]:
# load datasets
>>> from sklearn import datasets
>>> import numpy as np
>>> iris = datasets.load_iris()
>>> X = iris.data[:, [0,1,2,3]]
>>> y = iris.target
>>> print('Class labels:', np.unique(y))

Class labels: [0 1 2]


In [7]:
# splitting and standardizing the data
>>> from sklearn.model_selection import train_test_split
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

>>> from sklearn.preprocessing import StandardScaler
>>> sc = StandardScaler()
>>> sc.fit(X_train)
>>> X_train_std = sc.transform(X_train)
>>> X_test_std = sc.transform(X_test)

In [8]:
# creating decision tree classifier and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier( random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 1.00
test accuracy: 0.98
[[35  0  0]
 [ 0 35  0]
 [ 0  0 35]]
[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]


In [9]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

ModuleNotFoundError: No module named 'pydotplus'

In [12]:
# creating decision tree classifier for gini impurity with depth of 1 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='gini',max_depth=1,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.67
test accuracy: 0.67
[[35  0  0]
 [ 0 35  0]
 [ 0 35  0]]
[[15  0  0]
 [ 0 15  0]
 [ 0 15  0]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [13]:
# creating decision tree classifier for gini impurity with depth of 2 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='gini',max_depth=2,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.95
test accuracy: 0.96
[[35  0  0]
 [ 0 30  5]
 [ 0  0 35]]
[[15  0  0]
 [ 0 14  1]
 [ 0  1 14]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [14]:
# creating decision tree classifier for gini impurity with depth of 3 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='gini',max_depth=3,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.95
test accuracy: 0.98
[[35  0  0]
 [ 0 34  1]
 [ 0  4 31]]
[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [15]:
# creating decision tree classifier for gini impurity with depth of 4 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='gini',max_depth=4,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.97
test accuracy: 0.98
[[35  0  0]
 [ 0 32  3]
 [ 0  0 35]]
[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]


In [11]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

ModuleNotFoundError: No module named 'pydotplus'

In [16]:
# creating decision tree classifier for gini impurity with depth of 5 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='gini',max_depth=5,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.99
test accuracy: 0.98
[[35  0  0]
 [ 0 35  0]
 [ 0  1 34]]
[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [17]:
# creating decision tree classifier for gini impurity with depth of 6 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='gini',max_depth=6,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 1.00
test accuracy: 0.98
[[35  0  0]
 [ 0 35  0]
 [ 0  0 35]]
[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [18]:
# creating decision tree classifier for entropy impurity with depth of 1 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='entropy',max_depth=1,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.67
test accuracy: 0.67
[[35  0  0]
 [ 0 35  0]
 [ 0 35  0]]
[[15  0  0]
 [ 0 15  0]
 [ 0 15  0]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [19]:
# creating decision tree classifier for entropy impurity with depth of 2 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='entropy',max_depth=2,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.95
test accuracy: 0.96
[[35  0  0]
 [ 0 30  5]
 [ 0  0 35]]
[[15  0  0]
 [ 0 14  1]
 [ 0  1 14]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [20]:
# creating decision tree classifier for entropy impurity with depth of 3 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='entropy',max_depth=3,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.95
test accuracy: 0.96
[[35  0  0]
 [ 0 30  5]
 [ 0  0 35]]
[[15  0  0]
 [ 0 14  1]
 [ 0  1 14]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [21]:
# creating decision tree classifier for entropy impurity with depth of 4 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='entropy',max_depth=4,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.95
test accuracy: 0.93
[[35  0  0]
 [ 0 35  0]
 [ 0  5 30]]
[[15  0  0]
 [ 0 15  0]
 [ 0  3 12]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [22]:
# creating decision tree classifier for entropy impurity with depth of 5 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='entropy',max_depth=5,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.97
test accuracy: 0.98
[[35  0  0]
 [ 0 32  3]
 [ 0  0 35]]
[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

In [23]:
# creating decision tree classifier for entropy impurity with depth of 6 and training the data
>>> from sklearn.tree import DecisionTreeClassifier
>>> tree = DecisionTreeClassifier(criterion='entropy',max_depth=6,random_state=1)
>>> tree.fit(X_train_std, y_train)

>>> y_pred_1= tree.predict(X_train_std)
>>> y_pred_2 = tree.predict(X_test_std)
>>> from sklearn.metrics import accuracy_score
>>> print('train accuracy: %.2f' % accuracy_score(y_train, y_pred_1))
>>> print('test accuracy: %.2f' % accuracy_score(y_test,y_pred_2))
>>> from sklearn.metrics import confusion_matrix
>>> CM_1 = confusion_matrix(y_train,y_pred_1)
>>> print(CM_1)
>>> CM_2 = confusion_matrix(y_test,y_pred_2)
>>> print(CM_2)

train accuracy: 0.98
test accuracy: 0.98
[[35  0  0]
 [ 0 33  2]
 [ 0  0 35]]
[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]


In [None]:
# visualizing the decision tree 
>>> from IPython.display import Image 
>>> import pydotplus 

>>> from sklearn.tree import export_graphviz
>>> dot_data = export_graphviz(tree,
...                            filled=True, 
...                            rounded=True,
...                            class_names=['Setosa', 
...                                         'Versicolor',
...                                         'Virginica'],
...                            feature_names=['sepal length','sepal width ','petal length', 
...                                           'petal width'],
...                            out_file=None) 
>>> graph = pydotplus.graph_from_dot_data(dot_data) 
>>> Image(graph.create_png())

# Results

1. The training accuracies of the two impurity criteria do not differ much. On the test data, however, Gini impurity performs better than entropy impurity; as a result, we get more misclassifications in the entropy case.

2. In the case of Gini impurity, the training accuracy keeps increasing as we increase the depth of the decision tree, i.e., the number of misclassifications decreases. However, the test accuracy is high (at least as high as the training accuracy) when the depth of the decision tree is in the range 2 to 4, whereas at depths 5 and 6 it is lower than the training accuracy.

3. So, we can conclude that the model overfits the training data if we increase the depth of the decision tree in the case of Gini impurity.

4. In the case of entropy impurity, the accuracy also increases with depth, but the rate of increase is lower than with Gini impurity. The test accuracy increases overall as we increase the depth of the decision tree (apart from a dip at depth 4).

5. In this case, we can conclude that the model fits the data well if we increase the depth of the decision tree when the impurity criterion is entropy.

6. Computing entropy impurity takes a little longer than computing Gini impurity. However, if the tree becomes complex, entropy performs better than Gini impurity.