# Classification using Decision Trees
### Author: Prof. Sandro Camargo <github.com/sandrocamargo>
### Data Mining Course <https://moodle.unipampa.edu.br/moodle/course/view.php?id=5213>
#### This script demonstrates the basic concepts of decision trees.
##### It uses the Iris dataset: https://archive.ics.uci.edu/dataset/53/iris


In [None]:
# Download and unzip the dataset
!wget -c https://archive.ics.uci.edu/static/public/53/iris.zip
!unzip -u iris.zip

In [None]:
# Load the raw Iris measurements into a labelled DataFrame
import pandas as pd

# iris.data has no header row, so the column names are supplied at read time.
data = pd.read_csv(
    'iris.data',
    header=None,
    names=['Sepal Length','Sepal Width','Petal Length','Petal Width','Species'],
)
data.head() # Show first 5 samples

In [None]:
# Getting to know your data: plot every pair of features against each other,
# colored by species.
import seaborn as sns

sns.pairplot(
    data,
    hue='Species',
    markers=["o", "s", "D"],  # one marker shape per species
)

In [None]:
# split dataset into train and test sets
from sklearn import tree, model_selection
import numpy as np

# Store the inputs in the matrix X and the outputs in the array y
X = data.iloc[:, 0:4]  # the four measurement columns
print(X.describe())

y = data.iloc[:, 4]  # the Species label column
print("\n",y.value_counts(),"\n")

# Class names in ascending order.
# scikit-learn keeps classes sorted, and both
# export_graphviz(class_names=...) and classification_report(target_names=...)
# match names to classes by position. list(set(y)) yields an arbitrary order
# for strings, which can attach the wrong species name to tree nodes and
# report rows, so the names are sorted explicitly.
target_names = sorted(y.unique())

# Shuffled split that keeps the class proportions of y in both parts
# (stratify=y); 76% of the samples go to the training set.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    X, y, train_size=0.76, shuffle=True, stratify=y
)

# Verifying dataset dimensions
print('The training dataset (inputs) dimensions are: ', train_x.shape)
print('The training dataset (outputs) dimensions are: ', train_y.shape)
print('The testing dataset (inputs) dimensions are: ', test_x.shape)
print('The testing dataset (outputs) dimensions are: ', test_y.shape)


In [None]:
# Train a decision tree on the training split and draw it with Graphviz
import graphviz

# min_samples_leaf=2 requires at least two training samples in every leaf.
clf = tree.DecisionTreeClassifier(min_samples_leaf=2)
clf = clf.fit(train_x, train_y)

# export_graphviz matches class_names to classes by ascending position.
# clf.classes_ is the fitted estimator's own class ordering, so the names
# printed in the nodes always correspond to the right class.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=data.columns[0:4],
    class_names=list(clf.classes_),
)
graph = graphviz.Source(dot_data)
graph.render("iris")  # writes the rendered tree to disk
graph  # display the tree inline as the cell output

In [None]:
# Score the tree with 20-fold cross-validation on the training split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# cross_val_score refits a fresh copy of the estimator on each fold and
# returns one score per fold.
result = cross_val_score(
    estimator=clf,
    X=train_x,
    y=train_y,
    verbose=1,
    cv=20,
)
print(result)

In [None]:
# Evaluate the fitted tree on the held-out test split
from sklearn.metrics import classification_report

predicted = clf.predict(test_x)

# The labels are already the species names, so the report rows are named from
# the labels themselves. Passing a separately built target_names list risks
# mislabelled rows, because those names are matched to the sorted labels by
# position only.
print(classification_report(test_y, predicted, labels=clf.classes_))

In [None]:
# Draw the same tree again, this time with filled, rounded nodes
# export_graphviz matches class_names to classes by ascending position, so the
# names are taken from clf.classes_ (the estimator's own class ordering)
# rather than from a list whose order is not guaranteed.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string
    feature_names=data.columns[0:4],
    class_names=list(clf.classes_),
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.render("iris-color")  # writes the rendered tree to disk
graph  # display the tree inline as the cell output

In [None]:
# Petal width against petal length, with color and marker shape set by species
sns.scatterplot(
    data=data,
    x='Petal Width',
    y='Petal Length',
    hue='Species',
    style='Species',
)

In [None]:
# Decision surfaces of decision trees trained on every pair of Iris features
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_iris
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

# Parameters
n_classes = 3
plot_colors = "ryb"  # one matplotlib color letter per class
plot_step = 0.02  # not used in this cell; kept in case other cells read it

# NOTE: X, y and clf are rebound below and replace the objects of the same
# names created by earlier cells. Re-run those cells before reusing them.
y = iris.target  # identical for every feature pair, so set once

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary in the next cell of a 2x3 grid
    ax = plt.subplot(2, 3, pairidx + 1)
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=plt.cm.RdYlBu,
        response_method="predict",
        ax=ax,
        xlabel=iris.feature_names[pair[0]],
        ylabel=iris.feature_names[pair[1]],
    )

    # Plot the training points, one scatter call per class so each class gets
    # its own legend entry
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.flatnonzero(y == i)  # row positions of the samples in class i
        # No cmap here: c is a single color name, so a colormap would be
        # ignored and matplotlib would emit a warning about it.
        plt.scatter(
            X[idx, 0],
            X[idx, 1],
            c=color,
            label=iris.target_names[i],
            edgecolor="black",
            s=15,
        )

plt.suptitle("Decision surface of decision trees trained on pairs of features")
# plt.legend acts on the current (last created) subplot only
plt.legend(loc="lower right", borderpad=0, handletextpad=0)
#_ = plt.axis("tight")