In [5]:
%pip install ucimlrepo

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from ucimlrepo import fetch_ucirepo
from scipy.stats import entropy

#-----------------------------------------------------------------
# 1. DECISION TREE IMPLEMENTATION
#-----------------------------------------------------------------

class DecisionTreeNode:
    """
    Decision Tree Node class
    """
    def __init__(self, is_leaf=False, feature_idx=None, threshold=None, value=None,
                 left=None, right=None, gain=None):
        self.is_leaf = is_leaf
        self.feature_idx = feature_idx  # Feature index to split on
        self.threshold = threshold      # Threshold value for split
        self.value = value              # Value if leaf node (class label or regression value)
        self.left = left                # Left child (samples where feature < threshold)
        self.right = right              # Right child (samples where feature >= threshold)
        self.gain = gain                # Information gain from this split

class DecisionTreeClassifier:
    def __init__(self, max_depth=None, min_samples_split=2):
        """
        Decision Tree Classifier implementation from scratch using entropy criterion

        Parameters:
        -----------
        max_depth : int or None
            Maximum depth of the tree
        min_samples_split : int
            Minimum number of samples required to split a node
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        """
        Build the decision tree from data

        Parameters:
        -----------
        X : numpy array Training features
        y : numpy array Target values
        """
        self.n_features = X.shape[1]
        self.n_classes = len(np.unique(y))
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        """
        function to build the tree
        """
        n_samples, n_features = X.shape

        # Count of each class in current node
        class_counts = np.bincount(y.astype(int), minlength=self.n_classes)

        # Determine the class with highest count (mode)
        predicted_class = np.argmax(class_counts)

        # Init a leaf node
        leaf_node = DecisionTreeNode(is_leaf=True, value=predicted_class)

        # Create a leaf node if meets stopp criteria
        if depth >= self.max_depth if self.max_depth else False:
            return leaf_node

        if n_samples < self.min_samples_split:
            return leaf_node

        if len(np.unique(y)) == 1:
            return leaf_node

        # Find the best split
        best_feature, best_threshold, best_gain = self._best_split(X, y)

        # If no good split is found, create a leaf node
        if best_gain <= 0:
            return leaf_node

        # Split the data based on the best feature and threshold
        left_indices = X[:, best_feature] < best_threshold
        right_indices = ~left_indices

        # Recursively grow the left and right subtrees
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        # Return a decision node
        return DecisionTreeNode(
            feature_idx=best_feature,
            threshold=best_threshold,
            left=left_subtree,
            right=right_subtree,
            gain=best_gain
        )

    def _best_split(self, X, y):
        """
        Find the best feature and threshold for splitting the data using entropy

        Returns:
        --------
        best_feature : int
            Index of the best feature to split on
        best_threshold : float
            Threshold value for the best split
        best_gain : float
            Information gain from the best split
        """
        best_gain = -1
        best_feature = None
        best_threshold = None

        # For each feature
        for feature_idx in range(self.n_features):
            # Get unique values in the feature
            thresholds = np.unique(X[:, feature_idx])

            # Skip if only one unique value
            if len(thresholds) <= 1:
                continue

            # For potential thresholds, choose midpoints between consecutive values
            thresholds = (thresholds[:-1] + thresholds[1:]) / 2

            # For each threshold
            for threshold in thresholds:
                # Split the data
                left_indices = X[:, feature_idx] < threshold
                right_indices = ~left_indices

                # Skip if split results in empty node
                if np.sum(left_indices) == 0 or np.sum(right_indices) == 0:
                    continue

                # Calculate information gain
                n_samples = len(y)
                n_left = np.sum(left_indices)
                n_right = np.sum(right_indices)

                left_weight = n_left / n_samples
                right_weight = n_right / n_samples

                left_entropy = self._entropy(y[left_indices])
                right_entropy = self._entropy(y[right_indices])

                # Calculate information gain
                gain = self._entropy(y) - (left_weight * left_entropy + right_weight * right_entropy)

                # Update best gain, feature, and threshold if better than current best
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold

        return best_feature, best_threshold, best_gain

    def _entropy(self, y):
        """
        Calculate entropy of a set of labels
        """
        m = len(y)
        if m == 0:
            return 0

        counts = np.bincount(y.astype(int), minlength=self.n_classes)
        return entropy(counts, base=2)

    def predict(self, X):
        """
        Predict class labels for samples in X
        """
        return np.array([self._predict_sample(x, self.root) for x in X])

    def _predict_sample(self, x, node):
        """
        Predict the class label for a single sample
        """
        if node.is_leaf:
            return node.value

        if x[node.feature_idx] < node.threshold:
            return self._predict_sample(x, node.left)
        return self._predict_sample(x, node.right)


#-----------------------------------------------------------------
# 2. FETCH SPAMBASE DATASET DATA
#-----------------------------------------------------------------
def fetch_spambase_data():
    """
    Fetch Spambase dataset from UCI ML Repository

    Returns:
    --------
    X : numpy array
        Features
    y : numpy array
        Target values
    """

    # Fetch dataset using ucimlrepo
    spambase = fetch_ucirepo(id=94)

    # Extract features and targets
    X = spambase.data.features.values
    y = spambase.data.targets.values.ravel()
    return X, y

#-----------------------------------------------------------------
# 3. MAIN FUNCTION: SPAMBASE DATASET ANALYSIS
#-----------------------------------------------------------------

def main():
    # Load the Spambase dataset using ucimlrepo
    print("\nFetching Spambase dataset from UCI ML Repository...")

    X, y = fetch_spambase_data()

    print(f"Dataset fetched successfully: {X.shape[0]} samples, {X.shape[1]} features")

    # Check class distribution
    unique_classes, counts = np.unique(y, return_counts=True)
    print("\nClass distribution:")
    for cls, count in zip(unique_classes, counts):
        print(f"Class {int(cls)}: {count} samples ({count / len(y) * 100:.2f}%)")

    # Set the k-fold cross-validation
    k = 5
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Define hyperparameters
    max_depths = [6, 8, 10]
    min_samples_splits = [2, 20]

    # Dictionary to store results
    results = {}

    print("\n" + "=" * 70)
    print("HYPERPARAMETER TUNING WITH K-FOLD CROSS-VALIDATION (ENTROPY CRITERION)")
    print("=" * 70)

    # Run k-fold cross-validation for each combination of hyperparameters
    for max_depth in max_depths:
        for min_samples_split in min_samples_splits:
            key = f"maxdepth_{max_depth}_minsplit_{min_samples_split}"
            results[key] = {
                'train_acc': [],
                'val_acc': []
            }

            print(f"\nEvaluating with max_depth={max_depth}, min_samples_split={min_samples_split}")

            fold = 1
            for train_idx, val_idx in kf.split(X):
                X_train, X_val = X[train_idx], X[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                # Normalize the data
                scaler = MinMaxScaler()
                X_train_norm, X_val_norm = scaler.fit_transform(X_train), scaler.transform(X_val)

                # Train decision tree
                clf = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split)
                # clf = sklearn.tree.DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, min_samples_split=min_samples_split)
                clf.fit(X_train_norm, y_train)

                # Evaluate on training and validation data
                y_train_pred = clf.predict(X_train_norm)
                y_val_pred = clf.predict(X_val_norm)

                train_acc = accuracy_score(y_train, y_train_pred)
                val_acc = accuracy_score(y_val, y_val_pred)

                results[key]['train_acc'].append(train_acc)
                results[key]['val_acc'].append(val_acc)

                print(f"  Fold {fold}: Train Acc = {train_acc:.4f}, Val Acc = {val_acc:.4f}")
                fold += 1

            # Calculate average accuracies
            avg_train_acc = np.mean(results[key]['train_acc'])
            avg_val_acc = np.mean(results[key]['val_acc'])
            std_train_acc = np.std(results[key]['train_acc'])
            std_val_acc = np.std(results[key]['val_acc'])
            avg_train_error = 1 - avg_train_acc
            avg_val_error = 1 - avg_val_acc

            print(f"  Average: Train Acc = {avg_train_acc:.4f} ± {std_train_acc:.4f}, Val Acc = {avg_val_acc:.4f} ± {std_val_acc:.4f}")
            print(f"  Train Error = {avg_train_error:.4f}, Val Error = {avg_val_error:.4f}")


    # Create summary dataframe
    summary = []
    for max_depth in max_depths:
        for min_samples_split in min_samples_splits:
            key = f"maxdepth_{max_depth}_minsplit_{min_samples_split}"
            summary.append({
                'Max Depth': 'No limit' if max_depth is None else max_depth,
                'Min Samples Split': min_samples_split,
                'Avg Train Acc': np.mean(results[key]['train_acc']),
                'Avg Val Acc': np.mean(results[key]['val_acc']),
                'Avg Train Error': 1 - np.mean(results[key]['train_acc']),
                'Avg Val Error': 1 - np.mean(results[key]['val_acc'])
            })

    summary_df = pd.DataFrame(summary)
    summary_df = summary_df.sort_values('Avg Val Acc', ascending=False)

    print("\n" + "=" * 70)
    print("RESULTS SUMMARY (TOP 5 SORTED BY VALIDATION ACCURACY)")
    print("=" * 70)
    print(summary_df.head(5).to_string(index=False, float_format=lambda x: f"{x:.4f}"))

if __name__ == "__main__":
    main()


Fetching Spambase dataset from UCI ML Repository...
Dataset fetched successfully: 4601 samples, 57 features

Class distribution:
Class 0: 2788 samples (60.60%)
Class 1: 1813 samples (39.40%)

HYPERPARAMETER TUNING WITH K-FOLD CROSS-VALIDATION (ENTROPY CRITERION)

Evaluating with max_depth=6, min_samples_split=2
  Fold 1: Train Acc = 0.9315, Val Acc = 0.9034
  Fold 2: Train Acc = 0.9267, Val Acc = 0.9196
  Fold 3: Train Acc = 0.9261, Val Acc = 0.9109
  Fold 4: Train Acc = 0.9310, Val Acc = 0.9130
  Fold 5: Train Acc = 0.9291, Val Acc = 0.9217
  Average: Train Acc = 0.9289 ± 0.0022, Val Acc = 0.9137 ± 0.0065
  Train Error = 0.0711, Val Error = 0.0863

Evaluating with max_depth=6, min_samples_split=20
  Fold 1: Train Acc = 0.9293, Val Acc = 0.9066
  Fold 2: Train Acc = 0.9253, Val Acc = 0.9217
  Fold 3: Train Acc = 0.9237, Val Acc = 0.9087
  Fold 4: Train Acc = 0.9277, Val Acc = 0.9087
  Fold 5: Train Acc = 0.9280, Val Acc = 0.9207
  Average: Train Acc = 0.9268 ± 0.0020, Val Acc = 0.9133

# Spambase Dataset Decision Tree Report

| Trial | Criterion | max_depth | min_samples_split | Training Accuracy | Testing Accuracy | Training Error | Testing Error |
|-------|-----------|-----------|-------------------|-------------------|------------------|----------------|---------------|
| 1 | Entropy | 6 | 2 | 92.89% | 91.37% | 7.11% | 8.63% |
| 2 | Entropy | 6 | 20 | 92.68% | 91.33% | 7.32% | 8.67% |
| 3 | Entropy | 8 | 2 | 95.09% | 92.09% | 4.91% | 7.91% |
| 4 | Entropy | 8 | 20 | 94.53% | 92.07% | 5.47% | 7.93% |
| 5 | Entropy | 10 | 2 | 96.60% | 92.48% | 3.40% | 7.52% |
| 6 | Entropy | 10 | 20 | 95.46% | 92.46% | 4.54% | 7.54% |

One optimal decision tree's hyperparameters could be max tree depth = 10 and max leaves = 20

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import requests
from io import StringIO


#-----------------------------------------------------------------
# 1. DECISION TREE IMPLEMENTATION
#-----------------------------------------------------------------

class DecisionTreeNode:
    def __init__(self, is_leaf=False, feature_idx=None, threshold=None, value=None,
                 left=None, right=None, mse_reduction=None):
        # Node properties
        self.is_leaf = is_leaf
        self.feature_idx = feature_idx  # Feature index to split on
        self.threshold = threshold      # Threshold value for split
        self.value = value              # Value if leaf node (mean value for regression)
        self.left = left                # Left child (samples where feature < threshold)
        self.right = right              # Right child (samples where feature >= threshold)
        self.mse_reduction = mse_reduction  # MSE reduction from this split

class DecisionTreeRegressor:
    def __init__(self, max_depth=None, min_samples_split=2, min_mse_reduction=0.0):
        """
        Decision Tree Regressor implementation

        Parameters:
        -----------
        max_depth : int or None
            Maximum depth of the tree
        min_samples_split : int
            Minimum number of samples required to split a node
        min_mse_reduction : float
            Minimum reduction in MSE required for a split to be considered
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_mse_reduction = min_mse_reduction
        self.root = None

    def fit(self, X, y):
        """
        Build the regression tree from training data

        Parameters:
        -----------
        X : numpy array
            Training features
        y : numpy array
            Target values
        """
        self.n_features = X.shape[1]
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        """
        Recursive function to build the tree
        """
        n_samples, n_features = X.shape

        # Mean of the target values in the current node
        node_value = np.mean(y)

        leaf_node = DecisionTreeNode(is_leaf=True, value=node_value)

        # Create a leaf node if meets stopping criteria
        if depth >= self.max_depth if self.max_depth else False:
            return leaf_node
        if n_samples < self.min_samples_split:
            return leaf_node

        # Find the best split
        best_feature, best_threshold, best_mse_reduction = self._best_split(X, y)

        # If no good split is found, create a leaf node
        if best_mse_reduction <= self.min_mse_reduction:
            return leaf_node

        # Split the data
        left_indices = X[:, best_feature] < best_threshold
        right_indices = ~left_indices

        # Skip if any partition is empty
        if np.sum(left_indices) == 0 or np.sum(right_indices) == 0:
            return leaf_node

        # build left and right
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return DecisionTreeNode(
            feature_idx=best_feature,
            threshold=best_threshold,
            left=left_subtree,
            right=right_subtree,
            mse_reduction=best_mse_reduction
        )

    def _best_split(self, X, y):
        """
        Find the best feature and threshold for splitting the data

        Returns:
        --------
        best_feature : int
            Index of the best feature to split on
        best_threshold : float
            Threshold value for the best split
        best_mse_reduction : float
            MSE reduction from the best split
        """
        best_mse_reduction = -1
        best_feature = None
        best_threshold = None

        # Calculate MSE before split
        node_mse = np.mean((y - np.mean(y)) ** 2)

        for feature_idx in range(self.n_features):
            # Get unique values in the feature
            thresholds = np.unique(X[:, feature_idx])

            # Skip if only one unique value
            if len(thresholds) <= 1:
                continue

            thresholds = (thresholds[:-1] + thresholds[1:]) / 2

            for threshold in thresholds:
                # Split the data
                left_indices = X[:, feature_idx] < threshold
                right_indices = ~left_indices

                # Skip if split results in empty node
                if np.sum(left_indices) == 0 or np.sum(right_indices) == 0:
                    continue

                # Calculate MSE reduction
                y_left = y[left_indices]
                y_right = y[right_indices]

                left_mse = np.mean((y_left - np.mean(y_left)) ** 2)
                right_mse = np.mean((y_right - np.mean(y_right)) ** 2)

                # Weighted MSE of children
                n_left = len(y_left)
                n_right = len(y_right)
                n_total = n_left + n_right

                # MSE reduction
                mse_reduction = node_mse - ((n_left / n_total) * left_mse + (n_right / n_total) * right_mse)

                # Update best gain, feature, and threshold if better than current best
                if mse_reduction > best_mse_reduction:
                    best_mse_reduction = mse_reduction
                    best_feature = feature_idx
                    best_threshold = threshold

        return best_feature, best_threshold, best_mse_reduction

    def predict(self, X):
        """
        Predict target values for samples in X
        """
        return np.array([self._predict_sample(x, self.root) for x in X])

    def _predict_sample(self, x, node):
        """
        Predict the target value for a single sample
        """
        if node.is_leaf:
            return node.value

        if x[node.feature_idx] < node.threshold:
            return self._predict_sample(x, node.left)
        else:
            return self._predict_sample(x, node.right)

#-----------------------------------------------------------------
# 2. LOAD HOUSING DATASET FROM PREDEFINED TRAIN/TEST FILES
#-----------------------------------------------------------------

def fetch_housing_data():
    """
    Fetch Housing dataset from predefined training and testing files

    Returns:
    --------
    X_train : numpy array
        Training features
    y_train : numpy array
        Training targets
    X_test : numpy array
        Testing features
    y_test : numpy array
        Testing targets
    feature_names : list
        List of feature names
    """
    # URLs for training and testing data
    train_url = "https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/housing_train.txt"
    test_url = "https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/housing_test.txt"

    # Feature names
    feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                    'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

    # Load training data
    train_response = requests.get(train_url)
    train_response.raise_for_status()  # Raise an exception for HTTP errors
    train_data = pd.read_csv(StringIO(train_response.text), sep="s\+", header=None)
    train_data.columns = feature_names

    # Load testing data
    test_response = requests.get(test_url)
    test_response.raise_for_status()
    test_data = pd.read_csv(StringIO(test_response.text), sep="s\+", header=None)
    test_data.columns = feature_names

    # Extract features and targets
    X_train = train_data.iloc[:, :-1].values
    y_train = train_data.iloc[:, -1].values

    X_test = test_data.iloc[:, :-1].values
    y_test = test_data.iloc[:, -1].values

    return X_train, y_train, X_test, y_test, feature_names

def mean_squared_error(y_true, y_pred):
    """
    Calculate the mean squared error between true and predicted values
    """
    return np.mean((y_true - y_pred) ** 2)

#-----------------------------------------------------------------
# 3. MAIN FUNCTION: HOUSING DATASET ANALYSIS
#-----------------------------------------------------------------

def main():
    print("DECISION TREE REGRESSION ON HOUSING DATASET")
    print("=" * 70)
    print("\nLoading Housing dataset from predefined train/test files...")

    # Load the Housing dataset
    X_train, y_train, X_test, y_test, feature_names = fetch_housing_data()

    print(f"Dataset loaded successfully:")
    print(f"  Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"  Testing set: {X_test.shape[0]} samples, {X_test.shape[1]} features")

    # Normalize the data
    scaler = MinMaxScaler()
    X_train_norm = scaler.fit_transform(X_train)
    X_test_norm = scaler.transform(X_test)

    # Define hyperparameters to tune
    max_depths = [3, 5, 8]  # None means unlimited depth
    min_samples_splits = [2, 5, 10, 20]

    # Dictionary to store results
    results = {}

    print("\n" + "=" * 70)
    print("HYPERPARAMETER TUNING FOR REGRESSION TREE")
    print("=" * 70)

    # Evaluate each hyperparameter combination
    for max_depth in max_depths:
        for min_samples_split in min_samples_splits:
            key = f"maxdepth_{max_depth}_minsplit_{min_samples_split}"

            print(f"\nEvaluating with max_depth={max_depth}, min_samples_split={min_samples_split}")

            # Train regression tree
            regressor = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split)
            regressor.fit(X_train_norm, y_train)

            # Predict
            y_train_pred = regressor.predict(X_train_norm)
            y_test_pred = regressor.predict(X_test_norm)

            # Calculate metrics
            train_mse = mean_squared_error(y_train, y_train_pred)
            test_mse = mean_squared_error(y_test, y_test_pred)

            # Set results
            results[key] = {
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'train_mse': train_mse,
                'test_mse': test_mse,
            }

            print(f"  Train MSE: {train_mse:.4f}")
            print(f"  Test MSE: {test_mse:.4f}")

    # Find the best hyperparameters based on test MSE
    best_key = min(results.keys(), key=lambda k: results[k]['test_mse'])
    best_max_depth = results[best_key]['max_depth']
    best_min_samples_split = results[best_key]['min_samples_split']

    print("\n" + "=" * 70)
    print("BEST HYPERPARAMETERS FOUND")
    print("=" * 70)
    print(f"Best Max Depth: {best_max_depth}")
    print(f"Best Min Samples Split: {best_min_samples_split}")
    print(f"Training MSE: {results[best_key]['train_mse']:.4f}")
    print(f"Testing MSE: {results[best_key]['test_mse']:.4f}")

    # Train the final model with the best hyperparameters
    print("\n" + "=" * 70)
    print("FINAL MODEL EVALUATION")
    print("=" * 70)

    # Train the best model
    best_model = DecisionTreeRegressor(max_depth=best_max_depth, min_samples_split=best_min_samples_split)
    best_model.fit(X_train_norm, y_train)

    # Make predictions
    y_train_pred = best_model.predict(X_train_norm)
    y_test_pred = best_model.predict(X_test_norm)

    # Calculate metrics
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    print(f"\nBest Model Performance:")
    print(f"Max Depth: {best_max_depth}")
    print(f"Min Samples Split: {best_min_samples_split}")
    print(f"Training MSE: {train_mse:.4f}")
    print(f"Testing MSE: {test_mse:.4f}")

    print("\nThis concludes the implementation and analysis of regression trees for the Housing dataset.")

if __name__ == "__main__":
    main()

DECISION TREE REGRESSION ON HOUSING DATASET

Loading Housing dataset from predefined train/test files...


  train_data = pd.read_csv(StringIO(train_response.text), sep="s\+", header=None)


ValueError: Length mismatch: Expected axis has 1 elements, new values have 14 elements

# Hosing Dataset Regression Tree Report

| Trial | max_depth | min_samples_split | Train MSE | Test MSE |
|-------|-----------|-------------------|-----------|----------|
| 1 | 3 | 2 | 15.7321 | 52.2825 |
| 2 | 3 | 5 | 15.7321 | 52.2825 |
| 3 | 3 | 10 | 15.7321 | 52.2825 |
| 4 | 3 | 20 | 15.7321 | 52.2825 |
| 5 | 5 | 2 | 6.6557 | 32.3248 |
| 6 | 5 | 5 | 6.9846 | 34.2883 |
| 7 | 5 | 10 | 7.8870 | 44.6765 |
| 8 | 5 | 20 | 7.8870 | 44.6765 |
| 9 | 8 | 2 | 2.2389 | 34.3243 |
| 10 | 8 | 5 | 2.8374 | 36.3687 |
| 11 | 8 | 10 | 4.5629 | 45.4727 |
| 12 | 8 | 20 | 6.5134 | 44.6668 |

One optimal regression tree is max tree depth is 5, max leaves node is 5.