# 01 - Classification with Scikit-learn

This notebook demonstrates:
- Loading the Iris dataset
- EDA, train/test split
- Pipeline (StandardScaler + RandomForest)
- Evaluation: accuracy, confusion matrix, classification report
- Saving figures and model for the report


In [None]:
# Install dependencies (uncomment if running in a new environment).
# `%pip` installs into the environment of the running kernel.
# %pip install -q scikit-learn matplotlib seaborn pandas joblib

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import joblib
import os

# Create the output folders up front so later cells can write figures and the
# model without failing. Paths are relative to the notebook's working
# directory; adjust them if saving in the repo root.
for output_dir in ("../figures", "../models"):
    os.makedirs(output_dir, exist_ok=True)


In [None]:
# Load the Iris bunch and pull out the feature matrix and the label vector.
iris = load_iris()
X, y = iris.data, iris.target

# Build a labelled DataFrame (one column per feature plus `target`) for a
# quick look at the data.
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)

# Preview the first rows and show how many samples each class has.
display(df.head())
print("Class counts:\n", df["target"].value_counts())


In [None]:
# Hold out 20% of the samples for evaluation. `stratify=y` preserves the class
# proportions of `y` in both splits, so per-class metrics on this small test
# set are not skewed by an uneven shuffle. `random_state` keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling and the classifier are chained in one Pipeline: the scaler is fitted
# on the training split only, and the same transform is applied automatically
# at predict time.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Fit on the training split, then predict labels for the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


In [None]:
# Explicit class ids (0..n_classes-1) so the report rows, the confusion-matrix
# rows/columns and the display labels always line up, even if a class happens
# to be missing from the test split.
class_ids = np.arange(len(iris.target_names))

# Overall accuracy plus per-class precision/recall/F1, labelled with the
# species names rather than the numeric class ids.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(
    y_test, y_pred, labels=class_ids, target_names=iris.target_names
))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=iris.target_names)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Iris")
# bbox_inches="tight" keeps tick labels and the title from being clipped in
# the saved PNG.
plt.savefig("../figures/sklearn_confusion_matrix.png", dpi=150, bbox_inches="tight")
plt.show()


In [None]:
# Pull the fitted random forest out of the pipeline and read its per-feature
# importances; they are paired with the Iris feature names for plotting.
model = pipeline.named_steps["model"]
importances = model.feature_importances_
features = iris.feature_names

# Horizontal bar chart of the importances, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=importances, y=features, ax=ax)
ax.set_title("Feature Importances (RandomForest)")
fig.tight_layout()
fig.savefig("../figures/sklearn_feature_importance.png", dpi=150)
plt.show()

# Persist the whole fitted pipeline (scaler + model), not just the forest.
joblib.dump(pipeline, "../models/iris_randomforest.pkl")
print("Saved model to ../models/iris_randomforest.pkl")


Notes:
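- Figures are saved to `../figures/` (relative to the notebook's working directory).
- The fitted pipeline (scaler + model) is saved to `../models/iris_randomforest.pkl`.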
- Copy accuracy and classification_report outputs into your PDF report.
