# Classification using Decision Trees
### Author: Prof. Sandro Camargo <github.com/sandrocamargo>
### Data Mining Course <https://moodle.unipampa.edu.br/moodle/course/view.php?id=5213>
#### This script uses the basic concepts of decision trees.
##### This script uses the Iris dataset: https://archive.ics.uci.edu/dataset/53/iris


In [None]:
# Download and unzip the dataset
# The leading `!` hands the rest of the line to the system shell.
# wget -c continues a partially downloaded file instead of starting over.
!wget -c https://archive.ics.uci.edu/static/public/53/iris.zip
# unzip -u extracts files that are not on disk yet or are newer in the archive.
!unzip -u iris.zip

In [None]:
# Load the iris file into a DataFrame and preview it
import pandas as pd

# The file is read with header=None, so the column labels are attached here.
column_labels = ['Sepal Length','Sepal Width','Petal Length','Petal Width','Species']
data = pd.read_csv('iris.data', header=None).set_axis(column_labels, axis=1)
data.head() # Show first 5 samples

In [None]:
# Getting to know your data
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise plots of the columns, coloured by species, with one marker shape
# per species; the figure is then written to a PDF file.
species_markers = ["o", "s", "D"]
sns.pairplot(data, markers=species_markers, hue='Species')
plt.savefig("iris-pairplot.pdf")

In [None]:
# split dataset into train and test sets
from sklearn import tree, model_selection
import random

# Single seed used for every random step in this cell.
SEED = 10

# This only seeds Python's built-in generator. scikit-learn does not draw from
# it, so the seed is also passed explicitly to train_test_split below;
# without that the split changes on every run.
random.seed(SEED)

# Store the inputs in the matrix X and the outputs in the array y
X = data.iloc[:,0:4]
print(X.describe())

y = data.iloc[:,4]
print("\n",y.value_counts(),"\n")

# Class labels in ascending order. A bare set has no defined order, and these
# names are later matched to classes by position, so they must be sorted.
target_names = sorted(set(y))

# Stratified 76 % / 24 % split, reproducible thanks to random_state.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    X, y, train_size=0.76, shuffle=True, stratify=y, random_state=SEED)

# Verifying dataset dimensions
print('The training dataset (inputs) dimensions are: ', train_x.shape)
print('The training dataset (outputs) dimensions are: ', train_y.shape)
print('The testing dataset (inputs) dimensions are: ', test_x.shape)
print('The testing dataset (outputs) dimensions are: ', test_y.shape)


In [None]:
# Training the model
import graphviz

# random_state fixes how ties between equally good splits are broken, so the
# same training data always yields the same tree.
clf = tree.DecisionTreeClassifier(min_samples_leaf=2, random_state=10)
clf = clf.fit(train_x, train_y)

# Plotting the model
# class_names are matched to classes by position, so they are taken from the
# fitted classifier (clf.classes_) rather than from an independently built list.
# feature_names is passed as a plain list instead of a pandas Index.
dot_data = tree.export_graphviz(clf, out_file=None,
                      feature_names=list(data.columns[0:4]),
                      class_names=[str(label) for label in clf.classes_],
                      filled=True, rounded=True,
                      special_characters=True)
graph = graphviz.Source(dot_data)
# Save the rendered tree to disk under the name "iris-color".
graph.render("iris-color")
# Last expression of the cell: displays the tree inline in the notebook.
graph

In [None]:
from matplotlib.patches import Rectangle
# Showing the performance on training set
# Build a separate frame for plotting. Assigning the column on train_x itself
# would add the labels to the model's input frame.
tmpdata = train_x.assign(Species=train_y)

# Plotting the training set
sns.scatterplot(data=tmpdata, x='Petal Width', y='Petal Length', hue='Species', style='Species')
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# Pin the limits so that adding the spans cannot rescale the axes; the
# fractions below are computed from these limits.
ax.set_xlim(xlim)
ax.set_ylim(ylim)

# Plotting the decision surface
# NOTE(review): the thresholds (0.8 / 1.75 on Petal Width, 4.95 on Petal Length)
# are hard-coded and presumably copied from one rendered tree - confirm they
# match the tree currently stored in clf.
# axvspan takes x in data units, but ymin/ymax as fractions (0-1) of the axes
# height, so the Petal Length threshold is converted to a fraction first.
length_cut = (4.95-ylim[0])/(ylim[1]-ylim[0])
plt.axvspan(xlim[0],0.8,0,1, color='blue', alpha=0.2)
plt.axvspan(1.76,xlim[1],0,1, color='yellow', alpha=0.2)
plt.axvspan(0.81,1.75,0,length_cut, color='green', alpha=0.2)
plt.axvspan(0.81,1.75,length_cut,1, color='yellow', alpha=0.2)
plt.title("Iris - Training Data")
plt.savefig("iris-training-decisionsurface.pdf")

In [None]:
from sklearn.metrics import classification_report

# Evaluate the fitted tree on the held-out samples.
predicted = clf.predict(test_x)

# Report names are matched to labels by position, so both are taken from the
# fitted classifier to guarantee that every row carries the right class name.
class_labels = list(clf.classes_)
print(classification_report(test_y, predicted,
                            labels=class_labels,
                            target_names=[str(label) for label in class_labels]))

In [None]:
# Showing the performance on test set
# Build a separate frame for plotting. Assigning the column on test_x itself
# would add the labels to the frame that is passed to clf.predict.
tmpdata = test_x.assign(Species=test_y)

# Plotting the test set
sns.scatterplot(data=tmpdata, x='Petal Width', y='Petal Length', hue='Species', style='Species')
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# Pin the limits so that adding the spans cannot rescale the axes; the
# fractions below are computed from these limits.
ax.set_xlim(xlim)
ax.set_ylim(ylim)

# Plotting the decision surface
# NOTE(review): the thresholds (0.8 / 1.75 on Petal Width, 4.95 on Petal Length)
# are hard-coded and presumably copied from one rendered tree - confirm they
# match the tree currently stored in clf.
# axvspan takes x in data units, but ymin/ymax as fractions (0-1) of the axes
# height, so the Petal Length threshold is converted to a fraction first.
length_cut = (4.95-ylim[0])/(ylim[1]-ylim[0])
plt.axvspan(xlim[0],0.8,0,1, color='blue', alpha=0.2)
plt.axvspan(1.76,xlim[1],0,1, color='yellow', alpha=0.2)
plt.axvspan(0.81,1.75,0,length_cut, color='green', alpha=0.2)
plt.axvspan(0.81,1.75,length_cut,1, color='yellow', alpha=0.2)
plt.title("Iris - Test Data")
plt.savefig("iris-test-decisionsurface.pdf")