In [1]:
# Block 1: Import libraries and generate dataset
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Synthetic binary-classification problem: 1000 samples with 20 features,
# of which 5 are informative and 10 are linear combinations of those; the
# remaining 5 carry only noise. flip_y=0.15 assigns the class at random for
# roughly 15% of the samples, so some label noise is built in on purpose.
X, y = make_classification(
    n_samples=1000, n_features=20,
    n_informative=5, n_redundant=10,
    n_classes=2, flip_y=0.15,
    random_state=42,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [2]:
# Block 2: Function to evaluate tree (accuracy, precision, recall)
def evaluate_tree(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and score it on both splits.

    The estimator is fitted in place, so ``clf`` is trained once this
    function returns.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Features and labels used for fitting and for the train scores.
    X_test, y_test
        Held-out features and labels used for the test scores.

    Returns
    -------
    tuple
        ``(train_acc, test_acc, train_prec, test_prec, train_rec, test_rec)``.
        Precision and recall are macro-averaged and computed with
        ``zero_division=0``.
    """
    clf.fit(X_train, y_train)

    # (true labels, predicted labels) for the train split, then the test split.
    splits = (
        (y_train, clf.predict(X_train)),
        (y_test, clf.predict(X_test)),
    )

    macro = {"average": "macro", "zero_division": 0}
    metrics = (
        (accuracy_score, {}),
        (precision_score, macro),
        (recall_score, macro),
    )

    # Metric-major / split-minor iteration reproduces the return order
    # documented above: each metric on train first, then on test.
    return tuple(
        metric(y_true, y_pred, **options)
        for metric, options in metrics
        for y_true, y_pred in splits
    )

In [6]:
# Experiment: tree limited to depth 1, i.e. a single split (decision stump).
clf = DecisionTreeClassifier(max_depth=1, random_state=42, splitter='best')
scores = evaluate_tree(clf, X_train, y_train, X_test, y_test)
# Unpack every score so the names stay available to later cells.
train_acc, test_acc, train_prec, test_prec, train_rec, test_rec = scores
# Train accuracy on the first line, test accuracy on the second.
print(train_acc, test_acc, sep="\n")

0.7575
0.77


In [7]:
# Experiment: depth is unrestricted, but a node is only split further when
# it holds at least 24 training samples.
clf = DecisionTreeClassifier(min_samples_split=24, max_depth=None, random_state=42, splitter='best')
scores = evaluate_tree(clf, X_train, y_train, X_test, y_test)
# Unpack every score so the names stay available to later cells.
train_acc, test_acc, train_prec, test_prec, train_rec, test_rec = scores
# Train accuracy on the first line, test accuracy on the second.
print(train_acc, test_acc, sep="\n")

0.9125
0.83


In [8]:
# Experiment: depth is unrestricted, but every leaf must retain at least
# 11 training samples.
clf = DecisionTreeClassifier(min_samples_leaf=11, max_depth=None, random_state=42, splitter='best')
scores = evaluate_tree(clf, X_train, y_train, X_test, y_test)
# Unpack every score so the names stay available to later cells.
train_acc, test_acc, train_prec, test_prec, train_rec, test_rec = scores
# Train accuracy on the first line, test accuracy on the second.
print(train_acc, test_acc, sep="\n")

0.8725
0.8
