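A self‑contained activity that focuses on depth‑controlling hyperparameters in DecisionTreeClassifier, and then uses a grid search to pick the best combination by 5‑fold cross‑validated accuracy on the training split, reporting the selected tree's accuracy on the held‑out test split afterwards.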

In [2]:
# Part 0 – Setup and data split
# Use any classification dataset (Iris is convenient):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Fetch the Iris dataset and pull out the feature matrix and the class labels
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 30% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [3]:
# Part 1 – Baseline tree
# Create a baseline tree with defaults (except random_state=42).

# Measure:
# - Tree depth.
# - Train accuracy.
# - Test accuracy.

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit an unconstrained tree; only the seed is pinned so the run is repeatable
base_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict once per split, then score those predictions
base_train_pred = base_clf.predict(X_train)
base_test_pred = base_clf.predict(X_test)

base_depth = base_clf.get_depth()  # depth the fitted tree actually reached
base_train_acc = accuracy_score(y_train, base_train_pred)
base_test_acc = accuracy_score(y_test, base_test_pred)

# Report the three reference numbers, one per line
for label, value in (
    ("Baseline depth:", base_depth),
    ("Baseline train accuracy:", base_train_acc),
    ("Baseline test accuracy:", base_test_acc),
):
    print(label, value)

# Goal: this gives a reference to see how restricting depth‑related hyperparameters changes performance.

Baseline depth: 5
Baseline train accuracy: 1.0
Baseline test accuracy: 0.9333333333333333


In [4]:
# Part 2 - Control growth with each hyperparameter
# You now run three experiments, each sweeping one depth‑related hyperparameter while the others stay at their defaults:

# A. Vary min_samples_split
# This controls the minimum number of samples required in a node to allow a split.
# Larger values → fewer splits → shallower, simpler tree.

values_min_samples_split = [2, 5, 10, 20]

# Fit one tree per candidate value; every other setting stays at its default
# apart from the fixed seed.
fitted_trees = [
    DecisionTreeClassifier(min_samples_split=n_min, random_state=42).fit(X_train, y_train)
    for n_min in values_min_samples_split
]

# One (value, depth, train accuracy, test accuracy) tuple per fitted tree
results_min_samples_split = [
    (
        n_min,
        tree.get_depth(),
        accuracy_score(y_train, tree.predict(X_train)),
        accuracy_score(y_test, tree.predict(X_test)),
    )
    for n_min, tree in zip(values_min_samples_split, fitted_trees)
]

print("\n=== min_samples_split results ===")
for row in results_min_samples_split:
    n_min, tree_depth, acc_train, acc_test = row
    print(f"min_samples_split={n_min:2d} | depth={tree_depth:2d} | "
          f"train_acc={acc_train:.3f} | test_acc={acc_test:.3f}")


=== min_samples_split results ===
min_samples_split= 2 | depth= 5 | train_acc=1.000 | test_acc=0.933
min_samples_split= 5 | depth= 4 | train_acc=0.981 | test_acc=0.933
min_samples_split=10 | depth= 4 | train_acc=0.981 | test_acc=0.933
min_samples_split=20 | depth= 4 | train_acc=0.981 | test_acc=0.933


In [5]:
# B. Vary max_depth
# - This directly limits the maximum levels the tree can grow.
# - Low values → high bias (underfit risk).
# - Very high or None → low bias, high variance (overfit risk).

values_max_depth = [None, 2, 3, 4, 5]

results_max_depth = []

for depth_cap in values_max_depth:
    # None leaves the depth uncapped; an integer limits the number of levels
    tree = DecisionTreeClassifier(max_depth=depth_cap, random_state=42)
    tree.fit(X_train, y_train)

    pred_train = tree.predict(X_train)
    pred_test = tree.predict(X_test)

    # Record the requested cap next to the depth the tree actually reached
    results_max_depth.append(
        (
            depth_cap,
            tree.get_depth(),
            accuracy_score(y_train, pred_train),
            accuracy_score(y_test, pred_test),
        )
    )

print("\n=== max_depth results ===")
for depth_cap, grown_depth, acc_train, acc_test in results_max_depth:
    # The cap is printed without a width spec because it may be None
    print(f"max_depth={depth_cap} | depth={grown_depth:2d} | "
          f"train_acc={acc_train:.3f} | test_acc={acc_test:.3f}")


=== max_depth results ===
max_depth=None | depth= 5 | train_acc=1.000 | test_acc=0.933
max_depth=2 | depth= 2 | train_acc=0.971 | test_acc=0.889
max_depth=3 | depth= 3 | train_acc=0.981 | test_acc=0.978
max_depth=4 | depth= 4 | train_acc=0.990 | test_acc=0.889
max_depth=5 | depth= 5 | train_acc=1.000 | test_acc=0.933


In [6]:
# C. Vary min_impurity_decrease
# - This controls the minimum reduction in impurity (Gini/entropy) required for a split to be made.
# - Larger values → only “strong” splits allowed → simpler tree.

values_min_impurity_decrease = [0.0, 1e-4, 1e-3, 1e-2]


def _evaluate_min_impurity_decrease(threshold):
    """Fit a tree with the given min_impurity_decrease and summarise it.

    Returns a tuple (threshold, fitted depth, train accuracy, test accuracy),
    using the X_train/X_test/y_train/y_test split defined earlier in the notebook.
    """
    tree = DecisionTreeClassifier(min_impurity_decrease=threshold, random_state=42)
    tree.fit(X_train, y_train)
    return (
        threshold,
        tree.get_depth(),
        accuracy_score(y_train, tree.predict(X_train)),
        accuracy_score(y_test, tree.predict(X_test)),
    )


# One summary tuple per candidate threshold, in the order listed above
results_min_imp = [
    _evaluate_min_impurity_decrease(threshold)
    for threshold in values_min_impurity_decrease
]

print("\n=== min_impurity_decrease results ===")
for threshold, tree_depth, acc_train, acc_test in results_min_imp:
    print(f"min_impurity_decrease={threshold:.5f} | depth={tree_depth:2d} | "
          f"train_acc={acc_train:.3f} | test_acc={acc_test:.3f}")


=== min_impurity_decrease results ===
min_impurity_decrease=0.00000 | depth= 5 | train_acc=1.000 | test_acc=0.933
min_impurity_decrease=0.00010 | depth= 5 | train_acc=1.000 | test_acc=0.933
min_impurity_decrease=0.00100 | depth= 5 | train_acc=1.000 | test_acc=0.933
min_impurity_decrease=0.01000 | depth= 3 | train_acc=0.981 | test_acc=0.933


Activity questions to answer:

- For each hyperparameter, how does increasing it affect tree depth?
- Where do you see signs of overfitting (very high train accuracy, noticeably lower test accuracy)?
- Which settings seem to give a good balance between depth and test accuracy?

In [None]:
# Part 3 - Combine them in a grid search
# Now you define a small parameter grid over these three hyperparameters and use GridSearchCV to pick the best combination.

from sklearn.model_selection import GridSearchCV

# Candidate values for the three growth-limiting hyperparameters
# (4 x 3 x 3 = 36 combinations).
param_grid = {
    "max_depth": [None, 3, 4, 5],
    "min_samples_split": [2, 5, 10],
    "min_impurity_decrease": [0.0, 1e-3, 1e-2],
}

# Each combination is scored by 5-fold cross-validation on the training split
# only; the test split plays no part in choosing the winner.
#
# n_jobs is deliberately left at its default instead of -1. With 36 candidates
# x 5 folds on a training split this small, starting a process pool costs more
# than the fits themselves, and the worker processes are the likely source of
# the "Exception ignored in: ResourceTracker.__del__" messages seen at shutdown
# on Python 3.13. The selected parameters and scores do not depend on n_jobs.
grid_clf = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)

grid_clf.fit(X_train, y_train)

print("\n=== Grid search results ===")
print("Best params:", grid_clf.best_params_)
print("Best CV accuracy:", grid_clf.best_score_)

# best_estimator_ is the winning configuration refit on the full training split
best_tree = grid_clf.best_estimator_
best_depth = best_tree.get_depth()
best_train_acc = accuracy_score(y_train, best_tree.predict(X_train))
# First and only use of the test split for the selected model
best_test_acc = accuracy_score(y_test, best_tree.predict(X_test))

print(f"Best tree depth: {best_depth}")
print(f"Train accuracy (best tree): {best_train_acc:.3f}")
print(f"Test accuracy (best tree): {best_test_acc:.3f}")


=== Grid search results ===
Best params: {'max_depth': None, 'min_impurity_decrease': 0.0, 'min_samples_split': 5}
Best CV accuracy: 0.9523809523809523
Best tree depth: 4
Train accuracy (best tree): 0.981
Test accuracy (best tree): 0.933


Exception ignored in: <function ResourceTracker.__del__ at 0x77d80918e020>
Traceback (most recent call last):
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7699b3386020>
Traceback (most recent call last):
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/vinny/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ 