# Multilayer Perceptron (MLP)

This notebook demonstrates a **NumPy-only** implementation of a feed-forward neural network classifier:

- `MultilayerPerceptronClassifier` (alias: `MLPClassifier`)
- Hidden layers + nonlinearity (**ReLU** / **tanh**)
- Training via backpropagation with **SGD** or **Adam**
- Multiclass output with **softmax**

We use scikit-learn **datasets only** (no scikit-learn models).


## 1. Setup

In [None]:
from __future__ import annotations

import sys
from pathlib import Path
import inspect

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris, load_breast_cancer

# --- Ensure we can import from src/ ---
# Search the current working directory and then each of its parents, stopping at
# the first directory that contains src/rice_ml; that directory becomes repo_root.
here = Path.cwd()
repo_root = None
for p in [here] + list(here.parents):
    if (p / "src" / "rice_ml").exists():
        repo_root = p
        break

# Fail here with an actionable message rather than at the imports below.
if repo_root is None:
    raise RuntimeError("Could not find 'src/rice_ml'. Run this notebook inside the repo.")

# Index 0 puts <repo_root>/src ahead of every other sys.path entry.
sys.path.insert(0, str(repo_root / "src"))

# Imported only after the sys.path change so they can resolve from <repo_root>/src.
from rice_ml.processing.preprocessing import standardize, train_test_split
from rice_ml.supervised_learning.multilayer_perceptron import MultilayerPerceptronClassifier

# Seed NumPy's global random state for this session.
np.random.seed(42)


## 2. Preprocessing wrapper (fits your repo API)

In [None]:
def _extract_mean_scale(params):
    """Try to extract (mean, scale/std) from a variety of param formats."""
    if isinstance(params, dict):
        mean = params.get("mean", params.get("mu", params.get("center")))
        scale = params.get("std", params.get("sigma", params.get("scale")))
        if scale is None and params.get("var") is not None:
            scale = np.sqrt(params["var"])
        return mean, scale

    if isinstance(params, (tuple, list)) and len(params) == 2:
        return params[0], params[1]

    return None, None


def standardize_fit(X: np.ndarray):
    """Standardize ``X`` and hand back both the result and the fitted params.

    Delegates to ``standardize(X, return_params=True)`` and returns its two
    elements as ``(X_std, params)``.

    Raises
    ------
    RuntimeError
        If ``standardize`` does not return a 2-element tuple.
    """
    result = standardize(X, return_params=True)
    well_formed = isinstance(result, tuple) and len(result) == 2
    if not well_formed:
        raise RuntimeError("Expected standardize(..., return_params=True) to return (X_std, params).")
    return result[0], result[1]


def standardize_apply(X: np.ndarray, params):
    """Transform ``X`` with standardization statistics fitted earlier.

    If ``standardize`` declares a ``params`` parameter, the work is delegated to
    ``standardize(X, params=params)``. Otherwise the mean and scale are extracted
    from ``params`` with ``_extract_mean_scale`` and applied by hand.

    Raises
    ------
    RuntimeError
        If the manual path is needed but no mean/scale could be extracted.
    """
    accepts_params = "params" in inspect.signature(standardize).parameters
    if accepts_params:
        return standardize(X, params=params)

    # standardize() cannot re-apply fitted statistics itself: do it manually.
    center, spread = _extract_mean_scale(params)
    if center is None or spread is None:
        raise RuntimeError(
            "Could not apply standardization: standardize() has no 'params' argument and params format was not recognized."
        )

    spread = np.asarray(spread, dtype=float)
    center = np.asarray(center, dtype=float)
    # A zero scale is swapped for 1 so the division below never divides by zero.
    divisor = np.where(spread == 0.0, 1.0, spread)
    return (X - center) / divisor


## 3. Helper metrics and plotting

In [None]:
def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the fraction of entries where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels. Both are converted with
        ``np.asarray`` and must have identical shapes.

    Returns
    -------
    float
        Mean of the element-wise equality, in ``[0, 1]``. Empty inputs give
        ``nan`` (NumPy's mean of an empty array).

    Raises
    ------
    ValueError
        If the two inputs differ in shape. Without this check, shapes such as
        ``(n,)`` and ``(n, 1)`` would broadcast to an ``(n, n)`` comparison and
        silently produce a meaningless score.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}."
        )
    return float(np.mean(y_true == y_pred))


def confusion_matrix_np(y_true: np.ndarray, y_pred: np.ndarray, labels=None) -> np.ndarray:
    """Tally (true label, predicted label) pairs into a square integer matrix.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels, paired element by element.
    labels : array-like, optional
        Labels in the order used for rows and columns. When omitted, the
        sorted unique values found in ``y_true`` and ``y_pred`` are used.

    Returns
    -------
    np.ndarray
        Matrix whose entry ``[i, j]`` counts samples with true label
        ``labels[i]`` and predicted label ``labels[j]``.

    Raises
    ------
    KeyError
        If a sample carries a label that is not listed in ``labels``.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    if labels is None:
        label_set = np.unique(np.concatenate([actual, predicted]))
    else:
        label_set = np.asarray(labels)

    n_labels = label_set.size
    # Map each label to its row/column index in the matrix.
    position = dict(zip(label_set, range(n_labels)))

    counts = np.zeros((n_labels, n_labels), dtype=int)
    for true_lab, pred_lab in zip(actual, predicted):
        counts[position[true_lab], position[pred_lab]] += 1
    return counts


def plot_decision_regions_2d(model, X: np.ndarray, y: np.ndarray, title: str) -> None:
    """Draw the regions a fitted model predicts over a two-feature dataset.

    A 300 x 300 grid spanning the data range (padded by 0.6 on every side) is
    passed to ``model.predict``; the predictions are shown as filled contours
    with the samples in ``X`` scattered on top, coloured by ``y``.

    Parameters
    ----------
    model : object exposing ``predict(array of shape (n, 2))``.
    X : samples; only columns 0 and 1 are used.
    y : per-sample values used to colour the scatter points.
    title : figure title.
    """
    points = np.asarray(X)
    targets = np.asarray(y)

    pad = 0.6
    first, second = points[:, 0], points[:, 1]
    x_lo, x_hi = first.min() - pad, first.max() + pad
    y_lo, y_hi = second.min() - pad, second.max() + pad

    grid_x, grid_y = np.meshgrid(
        np.linspace(x_lo, x_hi, 300),
        np.linspace(y_lo, y_hi, 300),
    )
    # One row per grid node: (x-coordinate, y-coordinate).
    mesh_points = np.column_stack([grid_x.ravel(), grid_y.ravel()])
    region = model.predict(mesh_points).reshape(grid_x.shape)

    plt.figure()
    plt.contourf(grid_x, grid_y, region, alpha=0.25)
    plt.scatter(first, second, c=targets, edgecolor="k", s=35)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title(title)
    plt.show()


def plot_loss_curve(clf, title: str) -> None:
    """Plot ``clf.loss_curve_`` against a 1-based epoch index.

    If ``clf`` has no ``loss_curve_`` attribute, or the attribute is ``None``
    or empty, a short message is printed and nothing is drawn.

    Parameters
    ----------
    clf : object that may expose a ``loss_curve_`` sequence.
    title : figure title.
    """
    history = getattr(clf, "loss_curve_", None)
    if history is None or len(history) == 0:
        print("No loss curve available.")
        return

    epochs = np.arange(1, len(history) + 1)
    plt.figure()
    plt.plot(epochs, history, marker="o")
    plt.xlabel("epoch")
    plt.ylabel("training loss (cross-entropy)")
    plt.title(title)
    plt.show()


## 4. Part A — XOR (nonlinear separability)

A single linear classifier (e.g., perceptron / logistic regression) cannot solve XOR.
A small MLP with one hidden layer **can** because it learns a nonlinear decision boundary.

We standardize the inputs for training stability (not strictly required for XOR, but good practice).


In [None]:
# XOR dataset
# Four corners of the unit square; the label is 1 exactly when the two inputs differ.
X = np.array([[0.0, 0.0],
              [0.0, 1.0],
              [1.0, 0.0],
              [1.0, 1.0]])
y = np.array([0, 1, 1, 0])

# Standardize all four points; this part has no held-out split.
X_std, params = standardize_fit(X)
# Sanity check: per-column mean and std of the standardized inputs.
print("Train mean (approx):", np.round(X_std.mean(axis=0), 4))
print("Train std  (approx):", np.round(X_std.std(axis=0), 4))

# One hidden layer of 8 tanh units trained with Adam; batch_size=4 matches the
# number of samples. Early stopping is switched off.
# NOTE(review): how tol / n_iter_no_change behave with early_stopping=False depends
# on the classifier implementation — confirm there.
clf_xor = MultilayerPerceptronClassifier(
    hidden_layer_sizes=(8,),
    activation="tanh",
    solver="adam",
    learning_rate=0.05,
    batch_size=4,
    max_iter=800,
    random_state=0,
    tol=1e-8,
    n_iter_no_change=50,
    early_stopping=False,
).fit(X_std, y)

# Evaluate on the same four points the model was fitted on.
pred = clf_xor.predict(X_std)
print("Pred:", pred)
print("Acc :", accuracy(y, pred))

plot_loss_curve(clf_xor, "XOR — training loss curve")


## 5. Part B — Iris (2D) multiclass classification + decision regions

We take two Iris features so we can visualize the decision regions.  
MLPs are **sensitive** to feature scaling, so we standardize using **train-only** statistics,
then apply the same parameters to the test set.


In [None]:
# Load the Iris dataset from scikit-learn (data only; no scikit-learn models are used).
iris = load_iris()
X_all = iris.data
y_all = iris.target
feature_names = iris.feature_names

# Use 2D: petal length, petal width (classic separation)
X2 = X_all[:, [2, 3]]
# Rebinds the notebook-level name `y` to the Iris targets.
y = y_all

print("X2 shape:", X2.shape)
print("y shape :", y.shape)
print("classes :", list(enumerate(iris.target_names)))

# Scatter of the two selected raw (unstandardized) features, coloured by class.
plt.figure()
plt.scatter(X2[:, 0], X2[:, 1], c=y, edgecolor="k", s=35)
plt.xlabel(feature_names[2])
plt.ylabel(feature_names[3])
plt.title("Iris (raw, 2D features)")
plt.show()

# Split
# 25% of the samples are requested for the test set, with shuffling and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y, test_size=0.25, shuffle=True, random_state=42
)

# Standardize using TRAIN only (repo-compatible API)
# The statistics fitted on X_train are re-applied to X_test, so no test-set
# information enters the scaling.
X_train_std, params = standardize_fit(X_train)
X_test_std = standardize_apply(X_test, params)

# The test statistics are computed with the train parameters, so they need not
# match the train statistics exactly.
print("\nTrain mean (approx):", np.round(X_train_std.mean(axis=0), 4))
print("Train std  (approx):", np.round(X_train_std.std(axis=0), 4))
print("Test  mean (approx):", np.round(X_test_std.mean(axis=0), 4))
print("Test  std  (approx):", np.round(X_test_std.std(axis=0), 4))


In [None]:
# Fit baseline MLP (multiclass softmax)
# One hidden layer of 16 tanh units, Adam, mini-batches of 16, early stopping enabled.
# NOTE(review): validation_fraction=0.15 presumably holds out 15% of the training
# data to drive early stopping — confirm in the classifier implementation.
clf_iris = MultilayerPerceptronClassifier(
    hidden_layer_sizes=(16,),
    activation="tanh",
    solver="adam",
    learning_rate=0.01,
    batch_size=16,
    max_iter=600,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.15,
    n_iter_no_change=20,
).fit(X_train_std, y_train)

# Predict on both splits (standardized features) to compare train and test accuracy.
tr_pred = clf_iris.predict(X_train_std)
te_pred = clf_iris.predict(X_test_std)

print("MLPClassifier (Iris 2D)")
print("  train acc:", accuracy(y_train, tr_pred))
print("  test  acc:", accuracy(y_test, te_pred))
print("\nConfusion matrix (test): rows=true, cols=pred, labels=[0,1,2]")
# Labels are passed explicitly so the matrix is 3x3 even if a class is missing
# from the test split or from the predictions.
print(confusion_matrix_np(y_test, te_pred, labels=np.array([0, 1, 2])))

plot_loss_curve(clf_iris, "Iris 2D — training loss curve")
# Decision regions are drawn in standardized feature space with the training points on top.
plot_decision_regions_2d(clf_iris, X_train_std, y_train, "Decision regions (train) — Iris 2D | MLP")


### 5.1 Hyperparameter sweeps (small grid)

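We scan a few reasonable values to show typical effects:

- More hidden units can increase capacity (but may overfit)
- Learning rate controls optimization speed/stability

Note: the "best" configuration reported below is chosen by **test** accuracy purely for illustration. For real model selection, pick hyperparameters on a separate validation split so the test score remains an unbiased estimate.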


In [None]:
# Grid: 4 hidden-layer widths x 3 learning rates = 12 models.
hidden_sizes = [4, 8, 16, 32]
lrs = [0.003, 0.01, 0.03]

# Each entry is (hidden size, learning rate, train accuracy, test accuracy).
results = []
for h in hidden_sizes:
    for lr in lrs:
        # Only the hidden width and learning rate vary; every other setting,
        # including random_state, is the same for all runs.
        clf = MultilayerPerceptronClassifier(
            hidden_layer_sizes=(h,),
            activation="tanh",
            solver="adam",
            learning_rate=lr,
            batch_size=16,
            max_iter=500,
            random_state=0,
            early_stopping=True,
            validation_fraction=0.15,
            n_iter_no_change=15,
        ).fit(X_train_std, y_train)

        tr = accuracy(y_train, clf.predict(X_train_std))
        te = accuracy(y_test, clf.predict(X_test_std))
        results.append((h, lr, tr, te))

# Rank by test accuracy (tuple index 3); on ties max() keeps the earliest entry.
# NOTE: selecting on the test set makes the reported score optimistic; this is
# for illustration only.
best = max(results, key=lambda t: t[3])
print("Best (by test acc): hidden =", best[0], "lr =", best[1], "train acc =", best[2], "test acc =", best[3])

# One line per learning rate: test accuracy as a function of hidden-layer width.
plt.figure()
for lr in lrs:
    xs = [h2 for (h2, lr2, tr, te) in results if lr2 == lr]
    ys = [te for (h2, lr2, tr, te) in results if lr2 == lr]
    plt.plot(xs, ys, marker="o", label=f"lr={lr}")
plt.xlabel("hidden_layer_size")
plt.ylabel("test accuracy")
plt.title("Iris 2D — test accuracy vs hidden size")
plt.legend()
plt.show()


## 6. Part C — Breast Cancer (30D) binary classification

Here we use all 30 features. Standardization is important for stable MLP training.
We also compare **SGD vs Adam** briefly.


In [None]:
# Load the breast cancer dataset; X and y rebind the notebook-level names used earlier.
data = load_breast_cancer()
X = data.data
y = data.target  # 0/1

# Quick inspection: shapes, class names, class balance, and a NaN check.
print("X shape:", X.shape)
print("y shape:", y.shape)
print("classes:", list(enumerate(data.target_names)))
print("class counts:", {int(k): int(v) for k, v in zip(*np.unique(y, return_counts=True))})
print("Any NaN in X?", bool(np.isnan(X).any()))

# Split
# Same split settings as the Iris part: 25% test, shuffled, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=42
)

# Standardize using TRAIN only (repo-compatible API)
X_train_std, params = standardize_fit(X_train)
X_test_std = standardize_apply(X_test, params)

# Only the first five features are printed to keep the output short.
print("\nTrain mean (approx):", np.round(X_train_std.mean(axis=0)[:5], 4))
print("Train std  (approx):", np.round(X_train_std.std(axis=0)[:5], 4))
print("Test  mean (approx):", np.round(X_test_std.mean(axis=0)[:5], 4))
print("Test  std  (approx):", np.round(X_test_std.std(axis=0)[:5], 4))


In [None]:
# Adam
clf_adam = MultilayerPerceptronClassifier(
    hidden_layer_sizes=(32, 16),
    activation="relu",
    solver="adam",
    learning_rate=0.01,
    batch_size=32,
    max_iter=400,
    random_state=42,
    alpha=1e-4,
    early_stopping=True,
    validation_fraction=0.15,
    n_iter_no_change=20,
).fit(X_train_std, y_train)

tr_pred = clf_adam.predict(X_train_std)
te_pred = clf_adam.predict(X_test_std)

print("MLP (Adam) — Breast Cancer")
print("  train acc:", accuracy(y_train, tr_pred))
print("  test  acc:", accuracy(y_test, te_pred))
print("\nConfusion matrix (test): rows=true, cols=pred, labels=[0,1]")
print(confusion_matrix_np(y_test, te_pred, labels=np.array([0, 1])))

plot_loss_curve(clf_adam, "Breast Cancer — training loss curve (Adam)")


In [None]:
# SGD (typically needs smaller learning rate / more epochs)
clf_sgd = MultilayerPerceptronClassifier(
    hidden_layer_sizes=(32, 16),
    activation="relu",
    solver="sgd",
    learning_rate=0.005,
    batch_size=32,
    max_iter=600,
    random_state=42,
    alpha=1e-4,
    early_stopping=True,
    validation_fraction=0.15,
    n_iter_no_change=25,
).fit(X_train_std, y_train)

tr_pred = clf_sgd.predict(X_train_std)
te_pred = clf_sgd.predict(X_test_std)

print("MLP (SGD) — Breast Cancer")
print("  train acc:", accuracy(y_train, tr_pred))
print("  test  acc:", accuracy(y_test, te_pred))

plot_loss_curve(clf_sgd, "Breast Cancer — training loss curve (SGD)")


## 7. Conclusion

- MLPs combine **linear layers + nonlinear activations** to learn complex decision boundaries (e.g., XOR).
- Unlike trees, MLPs are **sensitive to feature scaling**; standardization improves training stability and speed.
- **Adam** generally converges faster and is more forgiving than plain SGD, especially early in training.
- Capacity control matters: hidden size/depth and regularization (e.g., `alpha`, early stopping) help avoid overfitting.

**Implementation note:** this notebook uses a small wrapper (`standardize_fit` / `standardize_apply`) so it works with
your repository's `standardize()` API, which returns `(X_std, params)` when called with `return_params=True` and may or
may not accept a `params=` argument for applying the fitted statistics to test data.
