In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import numpy as np

# --- 1. Load the Dataset ---

# NOTE: Point the first read_csv call below at your local copy of 'iris.csv'.
# The rest of this cell expects a column named 'species'; the fallback branch
# labels its columns as:
# 'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'
try:
    # Prefer a local file from the current working directory.
    df = pd.read_csv('iris.csv')
except FileNotFoundError:
    print("Error: 'iris.csv' not found. Loading a publicly available version for demonstration.")
    # Fallback: fetch the copy hosted by the UCI repository. Passing names= supplies
    # the column labels, so the first line of the remote file is read as data.
    column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
    df = pd.read_csv(url, names=column_names)
else:
    # Only reached when the local read raised nothing.
    print("Successfully loaded 'iris.csv'")

# Quick visual sanity check of whatever was loaded.
print("\nFirst 5 rows of the dataset:")
print(df.head(5))

# --- 2. Preprocess the Data ---

# A. Missing values: report the per-column count of null entries.
print("\nChecking for missing values:")
print(df.isna().sum())
# Nothing is imputed or dropped here; this step only reports the counts. For data
# that does contain gaps, apply df.fillna() or df.dropna() before continuing.

# B. Split the frame into the target column (y_species) and the feature matrix (X)
y_species = df['species']         # species labels as loaded
X = df.drop(columns='species')    # every remaining column is used as a feature

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y_species.shape}")

# C. Encode the target
# Species labels are mapped to integer codes. The fitted encoder is kept so the
# codes can be translated back to names when reporting.
le = LabelEncoder().fit(y_species)
y = le.transform(y_species)

# le.classes_[i] is the original label that was encoded as integer i.
target_names = le.classes_
print(f"\nOriginal Species Names: {target_names}")
print(f"Encoded Target Values (y): {np.unique(y)}")

# D. Train/test split
# 80% train / 20% test, stratified on the encoded labels, with a fixed
# random_state so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# --- 3. Train a Decision Tree Classifier ---

# Build the tree with a fixed random_state (for reproducibility) and fit it on the
# training split in one expression; fit() returns the fitted estimator itself.
print("\nTraining Decision Tree Classifier...")
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print("Training complete.")

# --- 4. Evaluate the Model ---

# Predict the encoded species for every held-out sample
y_pred = dt_classifier.predict(X_test)

# A. Calculate Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
# average='weighted' averages the per-class scores using each class's support
# (its number of true samples in y_test) as the weight.
# Note: support-weighted recall is algebraically identical to accuracy, so the
# "Accuracy" and "Recall" figures printed below always match; the per-class rows
# of the classification report are the more informative view.
# zero_division=0 reports 0 (without a warning) for a score whose denominator is zero.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

# B. Print the Results
print("\n--- Model Evaluation Results (Decision Tree) ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print("-------------------------------------------------")

# C. Detailed Classification Report
# Pass the full set of encoded labels explicitly. Without labels=, the report only
# considers classes that occur in y_test or y_pred, and raises a ValueError when
# that count differs from len(target_names) (e.g. a rare class absent from the
# test split). LabelEncoder assigns codes 0..n_classes-1 in the order of
# le.classes_, so position i of target_names corresponds to label i.
all_labels = np.arange(len(target_names))
print("\nDetailed Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=all_labels,
    target_names=target_names,
    zero_division=0,
))

# --- Summary of Results ---
# A decision tree generally scores highly on the Iris dataset (accuracy, precision
# and recall alike), since the species tend to be well separated by the four
# measurements. Treat a single 30-sample test split as a rough estimate only.

Error: 'iris.csv' not found. Loading a publicly available version for demonstration.

First 5 rows of the dataset:
   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

Checking for missing values:
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

Features (X) shape: (150, 4)
Target (y) shape: (150,)

Original Species Names: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
Encoded Target Values (y): [0 1 2]

Training set size: 120 samples
Testing set size: 30 samples

Training Decision Tree Classifier...
Training complete.

--- Model Evaluation Results (Decision Tree) ---
Accu