
# 🌸 Iris Species Classification using Scikit-learn

**Goal:** Train and evaluate a Decision Tree classifier on the classic *Iris dataset* using Scikit-learn.

### Steps Covered
1. Load and explore the Iris dataset  
2. Handle missing values (if any)  
3. Encode categorical labels  
4. Split data into training/testing sets  
5. Train a Decision Tree classifier  
6. Evaluate using accuracy, precision, and recall  
7. Visualize the decision tree  
8. Plot feature importances  


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix


## 1. Load and Explore Dataset

In [None]:

# Load the Iris data with pandas containers (iris.frame holds the feature
# columns plus a 'target' column of integer class codes).
iris = load_iris(as_frame=True)

# Class names as plain Python strings, indexed by integer class code.
target_names = iris.target_names.tolist()

# Integer class codes, and the feature matrix (every column except 'target').
y_numeric = iris.frame['target']
X = iris.frame.drop(columns=['target'])

# Working copy of the features with a readable 'species' name per row.
df = X.assign(species=[target_names[code] for code in y_numeric])

# Last expression of the cell: preview the first rows.
df.head()


## 2. Handle Missing Values

In [None]:

# Report how many values are missing in each column (features and label).
print("Missing values per column:")
print(df.isna().sum())

# Impute if needed
# Work on a separate copy of the feature columns; 'species' is the label.
features = df.drop(columns=['species']).copy()

# Only impute when at least one feature value is actually missing.
if features.isna().any().any():
    # Replace each missing entry with the mean of its column.
    imputer = SimpleImputer(strategy='mean')
    # By default fit_transform returns a bare array, so restore the column
    # names AND the original row index. Without `index=` the rebuilt frame
    # falls back to 0..n-1 and would stop lining up with `df` whenever the
    # index is not the default range (e.g. after filtering rows).
    features = pd.DataFrame(
        imputer.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
else:
    print("\nNo missing values found — proceeding without imputation.")


## 3. Encode Labels

In [None]:

# Convert the species names into integer class codes.
le = LabelEncoder()
y = le.fit_transform(df['species'])

# A class's code is its position in `le.classes_`, so enumerating the classes
# yields the same mapping as le.transform(le.classes_). Building it this way
# keeps plain Python str/int in the dict, so the printout stays readable
# ({'setosa': 0, ...}) instead of showing NumPy scalar reprs such as
# np.int64(0) under NumPy 2.
print("Label mapping:", {str(name): code for code, name in enumerate(le.classes_)})


## 4. Split Dataset

In [None]:

# Hold out 20% of the rows for testing. Stratifying on the labels keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


## 5. Train Decision Tree Classifier

In [None]:

# Build the tree with a fixed random_state and fit it on the training split
# in one step (fit returns the fitted estimator itself).
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print("Decision Tree trained successfully!")


## 6. Evaluate Model

In [None]:

# Predict class codes for the held-out test rows.
y_pred = clf.predict(X_test)

# Overall accuracy, plus macro-averaged precision and recall.
# zero_division=0 scores a metric as 0 where its denominator would be zero.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
prec = precision_score(
    y_true=y_test, y_pred=y_pred, average='macro', zero_division=0
)
rec = recall_score(
    y_true=y_test, y_pred=y_pred, average='macro', zero_division=0
)

# Headline numbers, four decimal places each.
print(f"Accuracy: {acc:.4f}")
print(f"Precision (macro): {prec:.4f}")
print(f"Recall (macro): {rec:.4f}")

# Per-class breakdown, labelled with the species names from the encoder.
print("\nClassification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=le.classes_,
        zero_division=0,
    )
)


### Confusion Matrix Visualization

In [None]:

# Rows are true classes, columns are predicted classes. Passing `labels`
# pins the matrix to one row/column per encoded class (codes 0..k-1), so its
# shape always matches the k tick labels below even if some class happens to
# be absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(le.classes_))))

plt.figure(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cells are integer counts; the default '.2g' shows 100 as 1e+02
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()


## 7. Visualize the Decision Tree

In [None]:

# Draw the fitted tree, labelling splits with feature names and leaves with
# species names.
plt.figure(figsize=(12, 8))
plot_tree(
    clf,
    # Pass plain Python lists: some scikit-learn releases (1.2.x) validate
    # these two arguments as `list` only and reject a pandas Index or a
    # NumPy array with InvalidParameterError. Lists work on every version.
    feature_names=list(features.columns),
    class_names=[str(name) for name in le.classes_],
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title("Decision Tree Visualization")
plt.show()


## 8. Feature Importances

In [None]:

# One importance score per feature column, taken from the fitted tree.
importances = pd.Series(data=clf.feature_importances_, index=features.columns)

# Horizontal bars sorted ascending by score (sort_values' default order);
# `importances` itself is left unsorted.
importances.sort_values().plot(kind='barh', figsize=(8, 4))
plt.xlabel("Importance Score")
plt.title("Feature Importances")
plt.show()
