# Exercise 2: Decision Trees

This notebook covers the implementation and analysis of Decision Tree algorithms for machine learning tasks.

## 1. Import Required Libraries

Import the necessary libraries for Decision Tree implementation and analysis.

In [8]:
# Import necessary libraries
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler
from sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor,
    export_graphviz,
    export_text,
    plot_tree,
)

# Reproducibility: seed NumPy's global random state once, up front
np.random.seed(42)

# Plot appearance: the matplotlib style is applied first, then the seaborn
# palette and the default figure size are set afterwards
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

## 2. Load, Split and Preprocess Data

First, load the dataset and split it into training, validation, and test sets (60/20/20).

In [9]:
# Load the breast-cancer dataset and wrap it in labelled pandas containers:
# one column per feature name, and a target series named "target"
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name="target")

# Stage 1 of the 60/20/20 split: keep 60% for training and hold out 40%,
# stratifying on the target
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)

# Stage 2: halve the 40% hold-out into validation and test (20% each of the
# full dataset), again stratified on the target
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Quick sanity check of the resulting partition sizes
print(X_train.shape, X_val.shape, X_test.shape)

(341, 30) (114, 30) (114, 30)


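Then, preprocess the data by scaling the features using `StandardScaler`. The scaler is fitted on the training set only and then applied unchanged to the validation and test sets.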

In [10]:
# Fit the scaler on the training split only
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to each split, in train / val / test order
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

## 3. Basic Decision Tree Implementation

Build a baseline Decision Tree model with default parameters and estimate its performance with 5-fold stratified cross-validation on the training set.

In [11]:
# Baseline model: a decision tree with default hyper-parameters; only the
# random_state is fixed so repeated runs give the same tree.
# (DecisionTreeClassifier, cross_validate and StratifiedKFold are imported in
# the notebook's top import cell rather than here.)
dt = DecisionTreeClassifier(random_state=42)

# 5-fold stratified splitter with seeded shuffling, so the folds are the same
# on every run; it is reused by later cells via the name `cv`
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate on the scaled training split only. Train-fold scores are
# also stored in `scores` (return_train_score=True) but are not printed below.
scores = cross_validate(
    dt, X_train_scaled, y_train,
    cv=cv,
    scoring=['accuracy', 'precision', 'recall', 'roc_auc'],
    return_train_score=True
)

# Report mean ± standard deviation across the folds for each held-out metric
for metric in ['test_accuracy', 'test_precision', 'test_recall', 'test_roc_auc']:
    print(f"{metric}: {scores[metric].mean():.3f} ± {scores[metric].std():.3f}")

test_accuracy: 0.909 ± 0.017
test_precision: 0.934 ± 0.010
test_recall: 0.921 ± 0.024
test_roc_auc: 0.905 ± 0.016


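### Tune Hyperparameters and Evaluate the Model

Tune `max_depth` and `min_samples_split` with a cross-validated grid search on the training set and check the selected model on the validation set. Then refit it on the combined training and validation data and report the final metrics on the held-out test set.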



In [12]:
# Candidate values for the two hyper-parameters being tuned
param_grid = dict(
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
)

# Grid search over the baseline tree `dt`, scored by accuracy on the folds
# produced by the existing `cv` splitter
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Winning combination and its cross-validated accuracy
print("Best params:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

# Score the fitted search object on the validation split, which took no part
# in the search itself
val_acc = grid_search.score(X_val_scaled, y_val)
print("Validation accuracy:", val_acc)

Best params: {'max_depth': 3, 'min_samples_split': 2}
Best CV accuracy: 0.9442881500426259
Validation accuracy: 0.956140350877193


In [13]:
# Merge the scaled training and validation splits (features row-wise, labels
# end-to-end) to form the final fitting set
X_combined = np.concatenate([X_train_scaled, X_val_scaled], axis=0)
y_combined = np.concatenate([y_train, y_val])

# Fresh tree using the hyper-parameters selected by the grid search, refit on
# the merged data
best_dt = DecisionTreeClassifier(random_state=42, **grid_search.best_params_)
best_dt.fit(X_combined, y_combined)

# Hard class predictions on the held-out test split
y_pred = best_dt.predict(X_test_scaled)

# NOTE(review): precision/recall use scikit-learn's default positive label (1);
# presumably that is the "benign" class in this dataset — confirm it is the
# class these metrics are meant to describe.
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test Precision:", precision_score(y_test, y_pred))
print("Test Recall:", recall_score(y_test, y_pred))
# AUC is computed from the predicted probability of class 1 (second column)
print(
    "Test AUC:",
    roc_auc_score(y_test, best_dt.predict_proba(X_test_scaled)[:, 1]),
)

Test Accuracy: 0.9385964912280702
Test Precision: 0.9571428571428572
Test Recall: 0.9436619718309859
Test AUC: 0.9277759580740256
