In [2]:
# import required libraries

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load the Iris dataset bundle; later cells reuse `iris` for the feature names.
iris = load_iris()

# One column per measurement, labelled with the dataset's own feature names.
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Translate the target codes into their species names, stored as a categorical column.
df['species'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)

print("First 5 rows:")
df.head()  # last expression of the cell, so the notebook renders it as a table

First 5 rows:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
# Inspect the structure and completeness of the data

print("\nBasic information:")
df.info()  # prints index range, per-column non-null counts, dtypes and memory usage

print("\nMissing values:")
df.isna().sum()  # per-column count of missing entries (isna is an alias of isnull)


Basic information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   sepal length (cm)  150 non-null    float64 
 1   sepal width (cm)   150 non-null    float64 
 2   petal length (cm)  150 non-null    float64 
 3   petal width (cm)   150 non-null    float64 
 4   species            150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB

Missing values:


sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64

In [None]:
# Convert the species names into integer class labels

from sklearn.preprocessing import LabelEncoder

# Keep the fitted encoder around: `le.classes_` is needed later to name the classes.
le = LabelEncoder()
df["species_encoded"] = le.fit_transform(df["species"])

# An encoded label is the position of its class in `le.classes_`.
print("Label mapping:")
for code, name in enumerate(le.classes_):
    print(f"{name} -> {code}")

# Show the original and encoded labels side by side.
df.loc[:, ["species", "species_encoded"]].head()

Label mapping:
setosa -> 0
versicolor -> 1
virginica -> 2


Unnamed: 0,species,species_encoded
0,setosa,0
1,setosa,0
2,setosa,0
3,setosa,0
4,setosa,0


In [11]:
# Hold out a stratified test set

from sklearn.model_selection import train_test_split

X = df.loc[:, iris.feature_names]  # the measurement columns
y = df["species_encoded"]          # integer class labels

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test set
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y,       # keep class proportions the same in both splits
)

print("Train size:", X_train.shape)
print("Test size: ", X_test.shape)

Train size: (120, 4)
Test size:  (30, 4)


In [12]:
# Fit a decision tree on the training split, then predict the test split

from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and training can be chained.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Predictions:", y_pred[:10])  # peek at the first ten predicted labels

Predictions: [0 2 1 1 0 1 0 0 2 1]


In [13]:
# Evaluate the model's performance on the test split

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
)

# Headline metrics; "macro" is the unweighted mean of the per-class scores.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average="macro")
rec = recall_score(y_test, y_pred, average="macro")

print(f"Accuracy: {acc:.4f}")
print(f"Precision (macro): {prec:.4f}")
print(f"Recall (macro): {rec:.4f}\n")

# Per-class breakdown, with rows labelled by the encoder's class names.
print("Full classification report:\n")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Accuracy: 0.9333
Precision (macro): 0.9333
Recall (macro): 0.9333

Full classification report:

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.90      0.90      0.90        10
   virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30

