In [2]:
#!/usr/bin/env python
# coding: utf-8
# author: Bo Tang

from collections import namedtuple
import numpy as np
from scipy import stats
import gurobipy as gp
from gurobipy import GRB
from sklearn import tree
import pandas as pd

def convert_dataframe_to_matrix(data):
    """
    Return the NumPy array behind a pandas DataFrame/Series.

    Any other input (ndarray, list, ...) is handed back unchanged.
    """
    is_pandas = isinstance(data, (pd.DataFrame, pd.Series))
    return data.to_numpy() if is_pandas else data


class optimalDecisionTreeClassifier:
    """
    optimal classification tree

    The tree is trained by solving a mixed-integer program (MIP) with Gurobi.
    Features are min-max scaled to [0, 1] at fit time and the same scaling is
    re-applied at predict time, because the big-M constants of the MIP assume
    that range.

    Nodes are numbered in heap order: the root is 1 and the children of node
    ``t`` are ``2t`` (left) and ``2t + 1`` (right). ``b_index`` holds the branch
    nodes and ``l_index`` the leaves of the full tree of depth ``max_depth``.
    """
    def __init__(self, max_depth=3, min_samples_split=2, alpha=0, warmstart=True, timelimit=600, output=True):
        """
        max_depth: depth of the full tree whose nodes the MIP may use
        min_samples_split: minimum number of samples in every active leaf (constraint 7)
        alpha: objective penalty per split
        warmstart: seed the solver with a CART solution when True
        timelimit: Gurobi time limit
        output: enable Gurobi logging and progress prints when True
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.alpha = alpha
        self.warmstart = warmstart
        self.timelimit = timelimit
        self.output = output
        self.trained = False
        self.optgap = None

        # scaler params (fit-time)
        self.x_min_ = None
        self.x_scale_ = None  # range, with zeros replaced by 1
        self.scales = None    # kept for backward-compat (but now means x_scale_)

        # node index
        self.n_index = [i + 1 for i in range(2 ** (self.max_depth + 1) - 1)]
        self.b_index = self.n_index[:-2 ** self.max_depth]  # branch nodes
        self.l_index = self.n_index[-2 ** self.max_depth:]  # leaf nodes

    def _ensure_2d_float(self, x):
        """Convert x to a 2-D float array; a 1-D input is treated as a single row."""
        x = convert_dataframe_to_matrix(x)
        x = np.asarray(x, dtype=float)
        if x.ndim == 1:
            x = x.reshape(1, -1)
        return x

    def _fit_scaler(self, x):
        """Fit min-max scaler to [0,1] per feature and return x as a 2-D float array."""
        x = self._ensure_2d_float(x)
        self.x_min_ = np.min(x, axis=0)
        x_max = np.max(x, axis=0)
        self.x_scale_ = x_max - self.x_min_
        # avoid division by zero for constant features
        self.x_scale_[self.x_scale_ == 0] = 1.0
        self.scales = self.x_scale_
        return x

    def _transform(self, x):
        """
        Apply fitted min-max scaling.

        Raises AssertionError if the scaler is not fitted and ValueError if
        the number of features differs from the one seen in fit().
        """
        if self.x_min_ is None or self.x_scale_ is None:
            raise AssertionError("Scaler is not fitted. Call fit() first.")
        x = self._ensure_2d_float(x)
        if hasattr(self, "p") and x.shape[1] != self.p:
            raise ValueError(f"Expected {self.p} features, got {x.shape[1]}.")
        return (x - self.x_min_) / self.x_scale_

    def fit(self, x, y):
        """
        fit training data

        Raises ValueError when x and y disagree on the number of samples and
        RuntimeError when the solver stops without any feasible solution.
        """
        # a failed refit must not leave old tree parameters paired with a new scaler
        self.trained = False

        # convert data
        x_raw = self._fit_scaler(x)
        y = convert_dataframe_to_matrix(y)
        y = np.asarray(y).ravel()
        if y.shape[0] != x_raw.shape[0]:
            raise ValueError(
                f"x has {x_raw.shape[0]} samples but y has {y.shape[0]} labels."
            )

        # scale data to [0,1] for MIP correctness
        x_scaled = (x_raw - self.x_min_) / self.x_scale_

        # data size
        self.n, self.p = x_scaled.shape
        if self.output:
            print(f"Training data include {self.n} instances, {self.p} features.")

        # labels
        self.labels = np.unique(y)

        # solve MIP (IMPORTANT: build on scaled data)
        m, a, b, c, d, l = self._buildMIP(x_scaled, y)
        if self.warmstart:
            self._setStart(x_scaled, y, a, c, d, l)  # warmstart consistent with scaling
        m.optimize()

        # variable values only exist when the solver found at least one solution
        if m.SolCount == 0:
            raise RuntimeError(
                f"Gurobi finished with status {m.Status} and no feasible solution; "
                "increase timelimit or relax min_samples_split."
            )
        self.optgap = m.MIPGap

        # get parameters
        self._a = {ind: a[ind].x for ind in a}
        self._b = {ind: b[ind].x for ind in b}
        self._c = {ind: c[ind].x for ind in c}
        self._d = {ind: d[ind].x for ind in d}

        self.trained = True

    def predict(self, x):
        """
        model prediction

        Returns a NumPy array with one label per row of x. Raises
        AssertionError when called before fit().
        """
        if not self.trained:
            raise AssertionError("This optimalDecisionTreeClassifier instance is not fitted yet.")

        # scale input (same transform as training)
        x_scaled = self._transform(x)

        # leaf label: pick the label with the largest c-value (more robust than
        # thresholding); ties resolve to the first label, inactive leaves get none
        labelmap = {}
        for t in self.l_index:
            best_k = max(self.labels, key=lambda k: self._c[k, t], default=None)
            if best_k is not None and self._c[best_k, t] > 1e-6:
                labelmap[t] = best_k

        leaves = set(self.l_index)  # O(1) membership while walking down the tree
        y_pred = []
        for xi in x_scaled:
            t = 1
            while t not in leaves:
                # small slack so values sitting exactly on the threshold go right
                right = (sum(self._a[j, t] * xi[j] for j in range(self.p)) + 1e-9 >= self._b[t])
                t = 2 * t + 1 if right else 2 * t
            y_pred.append(labelmap.get(t, self.labels[0]))  # safe fallback

        return np.array(y_pred)

    def _buildMIP(self, x, y):
        """
        build MIP formulation for Optimal Decision Tree
        NOTE: x is assumed scaled to [0,1].

        Returns the model and the variable containers (m, a, b, c, d, l).
        """
        # create a model
        m = gp.Model('m')

        # output
        m.Params.outputFlag = self.output
        m.Params.LogToConsole = self.output
        # time limit
        m.Params.timelimit = self.timelimit
        # parallel
        m.params.threads = 0

        # model sense
        m.modelSense = GRB.MINIMIZE

        # variables
        a = m.addVars(self.p, self.b_index, vtype=GRB.BINARY, name='a')          # splitting feature
        b = m.addVars(self.b_index, vtype=GRB.CONTINUOUS, name='b')              # splitting threshold (lb=0)
        c = m.addVars(self.labels, self.l_index, vtype=GRB.BINARY, name='c')     # node prediction
        d = m.addVars(self.b_index, vtype=GRB.BINARY, name='d')                  # splitting option
        z = m.addVars(self.n, self.l_index, vtype=GRB.BINARY, name='z')          # leaf node assignment
        l = m.addVars(self.l_index, vtype=GRB.BINARY, name='l')                  # leaf node activation
        L = m.addVars(self.l_index, vtype=GRB.CONTINUOUS, name='L')              # leaf node misclassified
        M = m.addVars(self.labels, self.l_index, vtype=GRB.CONTINUOUS, name='M') # leaf node samples with label
        N = m.addVars(self.l_index, vtype=GRB.CONTINUOUS, name='N')              # leaf node samples

        # calculate baseline accuracy
        baseline = self._calBaseline(y)

        # calculate minimum distance (on SCALED x)
        min_dis = self._calMinDist(x)

        # objective function
        obj = L.sum() / baseline + self.alpha * d.sum()
        m.setObjective(obj)

        # constraints
        # (20)
        m.addConstrs(L[t] >= N[t] - M[k, t] - self.n * (1 - c[k, t])
                     for t in self.l_index for k in self.labels)
        # (21)
        m.addConstrs(L[t] <= N[t] - M[k, t] + self.n * c[k, t]
                     for t in self.l_index for k in self.labels)
        # (17)
        m.addConstrs(gp.quicksum((y[i] == k) * z[i, t] for i in range(self.n)) == M[k, t]
                     for t in self.l_index for k in self.labels)
        # (16)
        m.addConstrs(z.sum('*', t) == N[t] for t in self.l_index)
        # (18)
        m.addConstrs(c.sum('*', t) == l[t] for t in self.l_index)

        # (13) and (14): walk from every leaf up to the root and tie the leaf
        # assignment z to the split taken at each ancestor
        bigM_left = 1.0 + float(np.max(min_dis))  # <= 2 if x is in [0,1]
        for t in self.l_index:
            left = (t % 2 == 0)
            ta = t // 2
            while ta != 0:
                if left:
                    m.addConstrs(
                        gp.quicksum(a[j, ta] * (x[i, j] + min_dis[j]) for j in range(self.p))
                        + bigM_left * (1 - d[ta])
                        <= b[ta] + bigM_left * (1 - z[i, t])
                        for i in range(self.n)
                    )
                else:
                    # big-M = 1 is valid because x,b in [0,1]
                    m.addConstrs(
                        gp.quicksum(a[j, ta] * x[i, j] for j in range(self.p))
                        >= b[ta] - (1 - z[i, t])
                        for i in range(self.n)
                    )
                left = (ta % 2 == 0)
                ta //= 2

        # (8)
        m.addConstrs(z.sum(i, '*') == 1 for i in range(self.n))
        # (6)
        m.addConstrs(z[i, t] <= l[t] for t in self.l_index for i in range(self.n))
        # (7)
        m.addConstrs(z.sum('*', t) >= self.min_samples_split * l[t] for t in self.l_index)
        # (2)
        m.addConstrs(a.sum('*', t) == d[t] for t in self.b_index)
        # (3)  -> forces b in [0,1] when d=1 (since b has lb=0)
        m.addConstrs(b[t] <= d[t] for t in self.b_index)
        # (5)
        m.addConstrs(d[t] <= d[t // 2] for t in self.b_index if t != 1)

        return m, a, b, c, d, l

    @staticmethod
    def _calBaseline(y):
        """
        obtain baseline accuracy by simply predicting the most popular class

        Returns the number of samples carrying the most frequent label.
        """
        # counting with numpy avoids scipy-version differences in stats.mode
        # and also works for non-numeric labels
        _, counts = np.unique(np.asarray(y), return_counts=True)
        return counts.max()

    @staticmethod
    def _calMinDist(x):
        """
        get the smallest positive distance per feature
        x is assumed numeric; if scaled to [0,1], distances are <= 1.

        Returns a list with one float per column; a constant column yields 1.0.
        """
        x = np.asarray(x, dtype=float)
        min_dis = []
        for j in range(x.shape[1]):
            # np.unique returns the distinct values already sorted
            diffs = np.diff(np.unique(x[:, j]))
            diffs = diffs[diffs > 0]
            min_dis.append(float(diffs.min()) if diffs.size else 1.0)
        return min_dis

    def _setStart(self, x, y, a, c, d, l):
        """
        set warm start from CART
        NOTE: x should be scaled consistently with the MIP.

        Start values are written for a, d, l and c only.
        """
        if self.min_samples_split > 1:
            # constraint (7) bounds the size of every active leaf, so CART leaves
            # must respect the same minimum for the start to be feasible
            clf = tree.DecisionTreeClassifier(max_depth=self.max_depth,
                                              min_samples_split=self.min_samples_split,
                                              min_samples_leaf=self.min_samples_split)
        else:
            clf = tree.DecisionTreeClassifier(max_depth=self.max_depth)
        clf.fit(x, y)

        # get splitting rules
        rules = self._getRules(clf)

        # fix branch node
        for t in self.b_index:
            feat = rules[t].feat
            splits = feat is not None and feat != tree._tree.TREE_UNDEFINED
            d[t].start = 1 if splits else 0
            for f in range(self.p):
                a[f, t].start = 1 if splits and f == int(feat) else 0

        # fix leaf nodes
        for t in self.l_index:
            # climb to the closest ancestor that exists in the CART tree; a
            # non-splitting node sends every sample right in the MIP, so the
            # leaf is reachable only if each step of the climb left a right child
            node = t
            only_right = True
            while rules[node].value is None:
                only_right = only_right and node % 2 == 1
                node //= 2

            if only_right:
                l[t].start = 1
                # argmax is a position in clf.classes_, not the label itself
                predicted = clf.classes_[np.argmax(rules[node].value)]
                for k in self.labels:
                    c[k, t].start = 1 if k == predicted else 0
            else:
                l[t].start = 0
                for k in self.labels:
                    c[k, t].start = 0

    def _getRules(self, clf):
        """
        get splitting rules

        Maps every heap-ordered node index to a Rules(feat, threshold, value)
        tuple read from clf.tree_; nodes absent from the CART tree get
        Rules(None, None, None).
        """
        no_node = -1  # marker for "no such node", same value CART uses for missing children
        cart = clf.tree_

        node_map = {1: 0}
        for t in self.b_index:
            parent = node_map[t]
            if parent == no_node:
                # never index the CART arrays with -1: that would silently read the last node
                node_map[2 * t] = no_node
                node_map[2 * t + 1] = no_node
            else:
                node_map[2 * t] = cart.children_left[parent]
                node_map[2 * t + 1] = cart.children_right[parent]

        rule = namedtuple('Rules', ('feat', 'threshold', 'value'))
        rules = {}
        for t in self.b_index:
            i = node_map[t]
            rules[t] = rule(None, None, None) if i == no_node else rule(cart.feature[i], cart.threshold[i], cart.value[i, 0])
        for t in self.l_index:
            i = node_map[t]
            rules[t] = rule(None, None, None) if i == no_node else rule(None, None, cart.value[i, 0])
        return rules


In [5]:
import numpy as np
import pandas as pd
import gurobipy as gp
from gurobipy import GRB

# ---- Adjust this import to your file/module name ----
# from your_module import optimalDecisionTreeClassifier
# -----------------------------------------------------


def make_multiclass_data(n, seed=0):
    """
    Generate a reproducible 3-class toy dataset as (DataFrame, Series).

    Columns: two uniform features, one discrete feature, one pure-noise
    feature, one constant column (value 5.0) and one feature correlated with
    the first two. Labels follow a small rule set on x0/x1/x2 and roughly 8%
    of them are then redrawn at random so that no tree fits perfectly.
    """
    rng = np.random.default_rng(seed)

    # NOTE: the order of the rng draws below defines the dataset for a given seed
    uniform_a = rng.random(n)                                    # [0,1]
    uniform_b = rng.random(n)                                    # [0,1]
    discrete = rng.choice([0.0, 0.5, 1.0], size=n, p=[0.55, 0.30, 0.15])
    noise_feat = rng.random(n)
    jitter = rng.normal(0, 0.03, n)
    correlated = np.clip(0.35 * uniform_a + 0.65 * uniform_b + jitter, 0, 1)
    constant = np.full(n, 5.0)                                   # zero-range column

    # ground truth: class 2 right of x0 = 0.75; otherwise class 1 when
    # x1 > 0.60 or x2 == 1.0, and class 0 in the remaining region
    low_x0 = uniform_a <= 0.75
    y = np.where(low_x0, np.where((uniform_b > 0.60) | (discrete == 1.0), 1, 0), 2)

    # label noise: redraw the selected labels uniformly from {0, 1, 2}
    noisy = rng.random(n) < 0.08
    y[noisy] = rng.integers(0, 3, size=noisy.sum())

    X_df = pd.DataFrame({
        "x0": uniform_a,
        "x1": uniform_b,
        "x2": discrete,
        "x3": noise_feat,
        "x4_const": constant,
        "x5_corr": correlated,
    })
    return X_df, pd.Series(y, name="y")


def affine_transform(X_df):
    """
    Return a copy of X_df with harsh per-column affine maps applied.

    Used to check that training is invariant to feature scale and offset.
    The input frame is not modified and "x4_const" is left untouched so it
    keeps exercising the zero-range case.
    """
    # column -> (scale, offset); new value is column * scale + offset
    maps = {
        "x0": (1e6, 1234.0),        # huge scale + offset
        "x1": (1e-4, -9.0),         # tiny scale + negative offset
        "x2": (10.0, 7.0),          # discrete with offset
        "x3": (-3.0, 2.0),          # noise with negative scaling
        "x5_corr": (100.0, -50.0),  # correlated with scaling
    }
    transformed = X_df.copy()
    for column, (scale, offset) in maps.items():
        transformed[column] = transformed[column] * scale + offset
    return transformed


def build_mip_all(clf, x_scaled, y):
    """
    Rebuild the complete tree MIP and hand back every variable container.

    The model uses the same variables, constraints and node indices as the
    classifier's own formulation, but also returns z, L, M and N so a caller
    can fix the learned (a, b, c, d), solve for the rest and inspect the
    implied leaf statistics.

    Returns (m, a, b, c, d, z, l, L, M, N, labels).
    """
    features = np.asarray(x_scaled, dtype=float)
    target = np.asarray(y).ravel()

    n, p = features.shape
    labels = np.unique(target)
    branches = clf.b_index
    leaves = clf.l_index

    # silent model with a short time limit
    m = gp.Model("check_mip")
    m.Params.OutputFlag = 0
    m.Params.LogToConsole = 0
    m.Params.TimeLimit = 30
    m.Params.Threads = 0
    m.ModelSense = GRB.MINIMIZE

    # variables, created in the same order as the classifier's formulation
    a = m.addVars(p, branches, vtype=GRB.BINARY, name="a")
    b = m.addVars(branches, vtype=GRB.CONTINUOUS, name="b")
    c = m.addVars(labels, leaves, vtype=GRB.BINARY, name="c")
    d = m.addVars(branches, vtype=GRB.BINARY, name="d")
    z = m.addVars(n, leaves, vtype=GRB.BINARY, name="z")
    l = m.addVars(leaves, vtype=GRB.BINARY, name="l")
    L = m.addVars(leaves, vtype=GRB.CONTINUOUS, name="L")
    M = m.addVars(labels, leaves, vtype=GRB.CONTINUOUS, name="M")
    N = m.addVars(leaves, vtype=GRB.CONTINUOUS, name="N")

    baseline = clf._calBaseline(target)
    eps = clf._calMinDist(features)
    big_m = 1.0 + float(np.max(eps))

    # objective: normalised misclassification plus split penalty
    m.setObjective(L.sum() / baseline + clf.alpha * d.sum())

    # (20) lower bound on the leaf loss
    m.addConstrs(
        L[t] >= N[t] - M[k, t] - n * (1 - c[k, t])
        for t in leaves for k in labels
    )
    # (21) upper bound on the leaf loss
    m.addConstrs(
        L[t] <= N[t] - M[k, t] + n * c[k, t]
        for t in leaves for k in labels
    )
    # (17) per-label sample counts in each leaf
    m.addConstrs(
        gp.quicksum((target[i] == k) * z[i, t] for i in range(n)) == M[k, t]
        for t in leaves for k in labels
    )
    # (16) total sample count in each leaf
    m.addConstrs(z.sum("*", t) == N[t] for t in leaves)
    # (18) exactly one label on an active leaf, none on an inactive one
    m.addConstrs(c.sum("*", t) == l[t] for t in leaves)

    # (13) (14) climb from each leaf to the root, constraining every ancestor
    for leaf in leaves:
        child = leaf
        while child > 1:
            parent = child // 2
            if child % 2 == 0:
                # the path goes left at this ancestor
                m.addConstrs(
                    gp.quicksum(a[j, parent] * (features[i, j] + eps[j]) for j in range(p))
                    + big_m * (1 - d[parent])
                    <= b[parent] + big_m * (1 - z[i, leaf])
                    for i in range(n)
                )
            else:
                # the path goes right at this ancestor
                m.addConstrs(
                    gp.quicksum(a[j, parent] * features[i, j] for j in range(p))
                    >= b[parent] - (1 - z[i, leaf])
                    for i in range(n)
                )
            child = parent

    # (8) every sample lands in exactly one leaf
    m.addConstrs(z.sum(i, "*") == 1 for i in range(n))
    # (6) samples may only be assigned to active leaves
    m.addConstrs(z[i, t] <= l[t] for t in leaves for i in range(n))
    # (7) an active leaf holds at least min_samples_split samples
    m.addConstrs(z.sum("*", t) >= clf.min_samples_split * l[t] for t in leaves)

    # (2) a splitting node uses exactly one feature
    m.addConstrs(a.sum("*", t) == d[t] for t in branches)
    # (3) b <= d, with the default lower bound 0 on b
    m.addConstrs(b[t] <= d[t] for t in branches)
    # (5) a node may split only if its parent splits
    m.addConstrs(d[t] <= d[t // 2] for t in branches if t != 1)

    return m, a, b, c, d, z, l, L, M, N, labels


def fix_solution_in_mip(m, a, b, c, d, clf, labels, eps_b=1e-6):
    """
    Pin the rebuilt model to the tree stored on ``clf``.

    d, a and c are fixed to the rounded learned values through equality
    constraints; each threshold b[t] is boxed into [b - eps_b, b + eps_b] to
    absorb small numerical differences.
    """
    for node in clf.b_index:
        # split decision and feature selection
        m.addConstr(d[node] == int(round(clf._d[node])))
        for feat in range(clf.p):
            m.addConstr(a[feat, node] == int(round(clf._a[feat, node])))

        # threshold, with tolerance
        learned_b = float(clf._b[node])
        m.addConstr(b[node] >= learned_b - eps_b)
        m.addConstr(b[node] <= learned_b + eps_b)

    # leaf predictions
    for leaf in clf.l_index:
        for label in labels:
            m.addConstr(c[label, leaf] == int(round(clf._c[label, leaf])))


def assert_close(a, b, tol, msg):
    """
    Raise AssertionError unless ``a`` and ``b`` differ by at most ``tol``.

    The error message has the form "<msg>: <a> vs <b> (tol=<tol>)".
    """
    # written as "not <=" so a NaN difference fails instead of passing silently
    if not abs(a - b) <= tol:
        raise AssertionError(f"{msg}: {a} vs {b} (tol={tol})")


def run_full_test_suite():
    """
    End-to-end checks for optimalDecisionTreeClassifier.

    Sections: (1) train/predict sanity and threshold bounds, (2) accuracy
    under affine feature transforms, (3) feasibility and leaf statistics of
    the learned tree in a rebuilt MIP, (4) min_samples_split enforcement,
    (5) effect of alpha on the number of splits, (6) training without warm
    start. Raises AssertionError on the first failed check.
    """
    def new_clf(**overrides):
        # default configuration shared by every section
        settings = dict(
            max_depth=3,
            min_samples_split=2,
            alpha=0.0,
            warmstart=True,
            timelimit=60,
            output=False,
        )
        settings.update(overrides)
        return optimalDecisionTreeClassifier(**settings)

    def accuracy(model, features, target):
        return (model.predict(features) == np.asarray(target)).mean()

    def solve_with_fixed_tree(model, features, failure_msg):
        # rebuild the MIP on the model's scaled data, pin (a, b, c, d), solve the rest
        scaled = model._transform(features)
        mip, va, vb, vc, vd, vz, vl, vL, vM, vN, labs = build_mip_all(
            model, scaled, np.asarray(y_train)
        )
        fix_solution_in_mip(mip, va, vb, vc, vd, model, labs, eps_b=1e-5)
        mip.optimize()
        if mip.SolCount == 0:
            raise AssertionError(failure_msg)
        return vz, vl, vL, vM, vN, labs

    # --------------------------
    # 1) Train + Predict + Types
    # --------------------------
    X_train, y_train = make_multiclass_data(180, seed=1)
    X_test, y_test = make_multiclass_data(300, seed=2)

    clf = new_clf()
    clf.fit(X_train, y_train)

    # scaling sanity: transform is [0,1] (constant columns become 0 after min-max)
    Xs = clf._transform(X_train)
    if not (np.nanmin(Xs) >= -1e-9 and np.nanmax(Xs) <= 1.0 + 1e-9):
        raise AssertionError("Scaled features are not within [0,1]. Scaling is inconsistent.")

    # accuracy sanity on noisy multiclass
    acc_tr = accuracy(clf, X_train, y_train)
    acc_te = accuracy(clf, X_test, y_test)
    print(f"[1] Train acc: {acc_tr:.3f}, Test acc: {acc_te:.3f}")
    if acc_tr < 0.85 or acc_te < 0.82:
        raise AssertionError("Accuracy unexpectedly low (could indicate scaling/constraints issue).")

    # b bounds when split
    for t in clf.b_index:
        if int(round(clf._d[t])) != 1:
            continue
        bt = float(clf._b[t])
        if bt < -1e-6 or bt > 1.0 + 1e-6:
            raise AssertionError(f"Threshold b[{t}] out of [0,1] while split: {bt}")
    print("[1b] b in [0,1] on split nodes: OK")

    # -------------------------------------------------
    # 2) Affine Transform Invariance (scaling robustness)
    # -------------------------------------------------
    X_train_aff = affine_transform(X_train)
    X_test_aff = affine_transform(X_test)

    clf_aff = new_clf()
    clf_aff.fit(X_train_aff, y_train)
    acc_aff = accuracy(clf_aff, X_test_aff, y_test)
    print(f"[2] Test acc after harsh affine feature transform: {acc_aff:.3f}")
    if acc_aff < 0.82:
        raise AssertionError("Accuracy dropped after affine transform; scaling likely inconsistent.")

    # ---------------------------------------------------------
    # 3) Full Constraint Consistency Check (rebuild + fix a,b,c,d)
    # ---------------------------------------------------------
    z, l, L, M, N, labels = solve_with_fixed_tree(
        clf, X_train,
        "Fixed-solution feasibility check failed: no feasible solution found.",
    )
    print("[3] Fixed-solution feasibility: OK")

    # each sample is assigned to exactly one leaf (constrained, but checked numerically)
    for i in range(clf.n):
        assigned = sum(z[i, t].X for t in clf.l_index)
        assert_close(assigned, 1.0, 1e-5, f"z-sum for sample {i}")

    # min_samples_split for active leaves
    for t in clf.l_index:
        Nt = N[t].X
        if l[t].X > 0.5 and Nt + 1e-6 < clf.min_samples_split:
            raise AssertionError(f"Active leaf {t} violates min_samples_split: N={Nt}")

    # L equals the misclassified count at each leaf (expected to match N - max_k M)
    for t in clf.l_index:
        largest = max(M[k, t].X for k in labels) if labels.size else 0.0
        implied = N[t].X - largest
        if abs(L[t].X - implied) > 1e-4:
            raise AssertionError(f"Leaf {t} L mismatch: L={L[t].X} implied={implied}")
    print("[3b] Leaf statistics consistency (N/M/L): OK")

    # --------------------------------------------
    # 4) min_samples_split stress (small region)
    # --------------------------------------------
    # a large minimum should prevent carving out the tiny x2==1 region
    clf_ms = new_clf(min_samples_split=25)
    clf_ms.fit(X_train, y_train)

    z2, l2, L2, M2, N2, labels2 = solve_with_fixed_tree(
        clf_ms, X_train,
        "min_samples_split feasibility check failed (fixed solution infeasible).",
    )
    for t in clf_ms.l_index:
        if l2[t].X > 0.5 and N2[t].X + 1e-6 < clf_ms.min_samples_split:
            raise AssertionError(f"min_samples_split violated at leaf {t}: N={N2[t].X}")
    print("[4] min_samples_split enforced: OK")

    # --------------------------------------------
    # 5) alpha regularization tends to reduce splits
    # --------------------------------------------
    clf_lo = new_clf(alpha=0.0)
    clf_hi = new_clf(alpha=1.0)
    clf_lo.fit(X_train, y_train)
    clf_hi.fit(X_train, y_train)

    splits_lo = sum(int(round(clf_lo._d[t])) for t in clf_lo.b_index)
    splits_hi = sum(int(round(clf_hi._d[t])) for t in clf_hi.b_index)
    print(f"[5] splits alpha=0.0: {splits_lo}, splits alpha=1.0: {splits_hi}")
    if splits_hi > splits_lo:
        raise AssertionError("Higher alpha produced MORE splits (unexpected).")
    print("[5b] alpha regularization effect: OK")

    # --------------------------------------------
    # 6) Warmstart off should still work
    # --------------------------------------------
    clf_nows = new_clf(warmstart=False)
    clf_nows.fit(X_train, y_train)
    acc_nows = accuracy(clf_nows, X_test, y_test)
    print(f"[6] Warmstart OFF test acc: {acc_nows:.3f}")
    if acc_nows < 0.80:
        raise AssertionError("Warmstart=False produced unexpectedly low accuracy.")
    print("\nALL FULL INTEGRATION TESTS PASSED ✅")


# Run the integration suite only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    run_full_test_suite()


[1] Train acc: 0.950, Test acc: 0.903
[1b] b in [0,1] on split nodes: OK
[2] Test acc after harsh affine feature transform: 0.910
[3] Fixed-solution feasibility: OK
[3b] Leaf statistics consistency (N/M/L): OK
[4] min_samples_split enforced: OK
[5] splits alpha=0.0: 7, splits alpha=1.0: 0
[5b] alpha regularization effect: OK
[6] Warmstart OFF test acc: 0.933

ALL FULL INTEGRATION TESTS PASSED ✅
