In [2]:
# IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, ConfusionMatrixDisplay
import joblib

# 1) Load the Iris dataset and create a pandas DataFrame

In [3]:
# Fetch the bundled Iris data and keep the raw arrays around for later cells.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Build an inspection-friendly frame: one column per measurement, plus a
# categorical `species` column derived from the integer class codes so that
# label encoding can be demonstrated afterwards.
df = pd.DataFrame(X, columns=iris.feature_names).assign(
    species=pd.Categorical.from_codes(y, iris.target_names)
)

# Preview the head of the table.
print("First 5 rows of the dataset:", df.head(), sep="\n")

First 5 rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


# 2) Preprocessing: check and handle missing values, encode labels

In [4]:
# 2.1 Check for missing values
print('\nMissing values per column:')
print(df.isnull().sum())


# Demonstrate mean imputation with SimpleImputer. The Iris dataset has no
# missing values (see the counts printed above), so this leaves the features
# unchanged and only shows the approach.
# NOTE: on data that really contains gaps, fit the imputer on the training
# split only (i.e. after train_test_split) so that test-set statistics do not
# leak into the preprocessing step.
feature_cols = iris.feature_names
imputer = SimpleImputer(strategy='mean')  # replace missing numeric values with column mean
X_imputed = imputer.fit_transform(df[feature_cols])


# 2.2 Encode labels (species -> numeric)
le = LabelEncoder()
y_encoded = le.fit_transform(df['species'])
print('\nLabel classes:', le.classes_)


# Prepare final feature matrix and labels.
# The labels were already encoded by `fit_transform` above, so they are reused
# directly instead of being transformed a second time.
X_final = X_imputed
y_final = y_encoded

# Sanity check: every feature row must have exactly one label.
assert X_final.shape[0] == y_final.shape[0], "feature/label row counts differ"


Missing values per column:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64

Label classes: ['setosa' 'versicolor' 'virginica']


In [None]:
# 3) Train-test split

In [None]:
# Hold out 20% of the samples for testing. The labels are passed as the
# stratification target and the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
    stratify=y_final,
)

# Report how many samples landed in each partition.
print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# 4) Train a Decision Tree classifier

In [None]:
# Hyperparameters are left at their defaults; only the seed is pinned so that
# repeated runs build the same tree.
tree_params = {"random_state": 42}
clf = DecisionTreeClassifier(**tree_params)

# Fit on the training partition (the fitted estimator is the cell's output).
clf.fit(X_train, y_train)

In [None]:
# 5) Evaluate the model using accuracy, precision, and recall

In [None]:
# Predict the held-out samples once and reuse the result for every metric.
y_pred = clf.predict(X_test)


# Precision and recall are macro-averaged over the classes; `zero_division=0`
# is passed to both so the two metrics are configured identically.
macro_kwargs = dict(average='macro', zero_division=0)
accuracy = accuracy_score(y_test, y_pred)
precision_macro = precision_score(y_test, y_pred, **macro_kwargs)
recall_macro = recall_score(y_test, y_pred, **macro_kwargs)


# Headline numbers
print('\nEvaluation metrics on test set:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision (macro): {precision_macro:.4f}')
print(f'Recall (macro): {recall_macro:.4f}')


# Per-class breakdown, labelled with the species names
print('\nFull classification report:')
print(classification_report(y_test, y_pred, target_names=le.classes_))


# Confusion matrix, drawn from the fitted estimator and titled via its axes
disp = ConfusionMatrixDisplay.from_estimator(
    clf,
    X_test,
    y_test,
    display_labels=le.classes_,
    cmap=plt.cm.Blues,
)
disp.ax_.set_title('Confusion Matrix - Decision Tree (Iris)')
plt.show()

# 6) Visualize the trained decision tree (simple plot)

In [None]:
# Draw the fitted tree on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 8))

# `le.classes_` is a NumPy array; it is converted to a plain list because some
# scikit-learn releases (1.3.0) validate `class_names` as list-or-None and
# reject arrays. `feature_cols` is wrapped in list() for the same reason.
plot_tree(
    clf,
    feature_names=list(feature_cols),
    class_names=list(le.classes_),
    filled=True,
    rounded=True,
    ax=ax,
)
ax.set_title('Decision Tree Structure')
plt.show()

# 7) Save the trained model for future use

In [None]:
# Persist the classifier together with the fitted preprocessing objects in a
# single joblib file, keyed by role.
model_filename = 'iris_decision_tree_joblib.pkl'
artifacts = {
    'model': clf,
    'label_encoder': le,
    'imputer': imputer,
}
joblib.dump(artifacts, model_filename)
print(f"Model and preprocessing objects saved to {model_filename}")

# Example predictions on new samples (using the in-memory model and label encoder; the saved bundle can be restored with joblib.load)

In [None]:
# Predict the first five held-out samples with the in-memory classifier and
# map the numeric predictions back to species names via the label encoder.
new_samples = X_test[:5]  # take first 5 test samples
preds = clf.predict(new_samples)
pred_names = le.inverse_transform(preds)

print('\nExample predictions on 5 test samples:')
# Number the samples from 1 for display; the loop body must be indented.
for i, (sample, p) in enumerate(zip(new_samples, pred_names), start=1):
    print(f'Sample {i}: features={np.round(sample,2)} --> predicted species = {p}')