In [None]:
# Part 1: Importing the dataset

import pandas as pd

# Location of the raw Iris data file in the UCI Machine Learning Repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# header=None makes pandas treat the first line as data rather than as column
# names; the five column labels are supplied explicitly through `names`.
df = pd.read_csv(
    url,
    header=None,
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
           'class'],
)

# Leaving the frame as the last expression renders it as the cell's output.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [None]:
# Part 2: Splitting the dataset into training and testing sets

from sklearn.model_selection import train_test_split

# train_test_split() is given two aligned inputs:
#   * the features -- df with the 'class' column removed, and
#   * the labels   -- the 'class' column on its own.
#
# test_size=0.1 holds back 10% of the rows for testing (X_test / y_test) and
# leaves the remaining 90% for training (X_train / y_train). The features and
# the labels are split on the same rows, so each sample stays paired with its
# label.
#
# random_state=17 seeds the shuffling so that the same split is produced every
# time this cell is run.

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='class'),
    df['class'],
    test_size=0.1,
    random_state=17,
)

In [None]:
# Part 3: Applying a standard supervised learning algorithm, a decision tree,
# to the training and testing sets

from sklearn.tree import DecisionTreeClassifier

# Create the classifier object 'dtc'. random_state is pinned (to the same seed
# value used for the train/test split) because scikit-learn's tree builder
# randomly permutes the features while searching for splits; without a seed
# the fitted tree, and therefore the reported metrics, may differ between
# runs.
dtc = DecisionTreeClassifier(random_state=17)

# Fit the classifier on the training set (X_train with its labels y_train).
# fit() trains the model in place; the learned decision tree is stored inside
# 'dtc'.
dtc.fit(X_train, y_train)

# Predict a class label for each unlabeled test sample in X_test using the
# fitted model. The predictions are compared against y_test in Part 4.
y_pred = dtc.predict(X_test)

In [None]:
# Part 4: Evaluating the performance of the decision tree with two
# performance measures: accuracy score and confusion matrix

from sklearn.metrics import accuracy_score, confusion_matrix

# accuracy_score() compares the predicted labels with the true test labels
# and summarizes how many of them agree.
print('Accuracy score:', accuracy_score(y_test, y_pred))

# confusion_matrix() tabulates the true labels against the predicted labels.
# The heading and the matrix are printed on separate lines.
print('Confusion matrix:', confusion_matrix(y_test, y_pred), sep='\n')

Accuracy score: 1.0
Confusion matrix:
[[3 0 0]
 [0 5 0]
 [0 0 7]]


In [None]:
# An accuracy score of 1.0 means that every prediction on the test set was
# correct, which indicates the DecisionTreeClassifier performed excellently
# on this particular train/test split.

# The confusion matrix shows the same thing: all 15 test samples (3 + 5 + 7)
# lie on the diagonal and every off-diagonal entry is 0, so there are neither
# false positives nor false negatives for any class. Note that 15 samples is a
# very small test set, so this result alone does not guarantee equally good
# performance on new data.