Step 1: Import the necessary libraries

In [3]:
from itertools import product

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

step 2: Define a class to represent a node in the decision tree

    Constructor for a tree node.



In [4]:
class Node:
    """
    One node of the regression tree.

    A decision node stores the split (feature index and threshold), its two
    children and the variance reduction of the split. A leaf node stores only
    the predicted value; all split-related attributes stay None.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        '''
        Create either a decision node or a leaf node.

        Parameters:
        feature_index (int): Index of the feature used for splitting.
        threshold (float): Threshold value for splitting.
        left (Node): Left child node.
        right (Node): Right child node.
        var_red (float): Variance reduction achieved by the split.
        value (float): Value of the leaf node (for predictions).
        '''
        # Leaf payload: the prediction returned when traversal ends here.
        self.value = value

        # Decision payload: which test to apply and where each outcome leads.
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right
        self.var_red = var_red





step 3: Define a class for a custom decision tree regressor


In [20]:
class MyDecisionTreeRegressor():
    """
    Regression tree grown greedily by maximising variance reduction.

    Every node tries each unique value of each feature as a threshold
    (`feature <= threshold` goes left) and keeps the split with the largest
    variance reduction. Growth stops when a node has fewer than
    `min_samples_split` samples or no split reduces the variance.
    """

    def __init__(self, min_samples_split=2):
        """
        Constructor for a custom decision tree regressor.

        Parameters:
        min_samples_split (int): Minimum number of samples required to split.
        """
        self.root = None  # Root node of the tree (set by fit)
        self.min_samples_split = min_samples_split

    def build_tree(self, dataset, curr_depth=0):
        """
        Recursive function to build the decision tree.

        Parameters:
        dataset (ndarray): Training data; the last column holds the target values.
        curr_depth (int): Current depth of the tree (passed down the recursion; not used as a stopping rule).

        Returns:
        Node: A node representing the decision tree structure.
        """
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        # Only try to split when the node is large enough.
        if num_samples >= self.min_samples_split:
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # An empty dict means no valid split exists; a non-positive
            # reduction means splitting would not make the children purer.
            if best_split and best_split.get("var_red", 0) > 0:
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                return Node(best_split["feature_index"], best_split["threshold"], left_subtree, right_subtree, best_split["var_red"])

        # Stopping condition met: emit a leaf holding the mean target.
        leaf_value = self.calculate_leaf_value(Y)
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        """
        Find the best split for the dataset.

        Parameters:
        dataset (ndarray): Training data including features and target values.
        num_samples (int): Number of samples in the dataset.
        num_features (int): Number of features in the dataset.

        Returns:
        dict: Information about the best split (keys "feature_index",
        "threshold", "dataset_left", "dataset_right", "var_red"), or an empty
        dict when no threshold leaves samples on both sides.
        """
        best_split = {}
        max_var_red = -float("inf")
        y = dataset[:, -1]  # parent targets are the same for every candidate

        # Iterate over all features
        for feature_index in range(num_features):
            possible_thresholds = np.unique(dataset[:, feature_index])

            # Try each possible threshold
            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # A split that leaves one side empty is not a split at all.
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    curr_var_red = self.variance_reduction(y, dataset_left[:, -1], dataset_right[:, -1])
                    # Strict '>' keeps the first of several equally good splits.
                    if curr_var_red > max_var_red:
                        best_split = {
                            "feature_index": feature_index,
                            "threshold": threshold,
                            "dataset_left": dataset_left,
                            "dataset_right": dataset_right,
                            "var_red": curr_var_red
                        }
                        max_var_red = curr_var_red

        return best_split

    def split(self, dataset, feature_index, threshold):
        """
        Split the dataset based on a feature and a threshold.

        Parameters:
        dataset (ndarray): Training data (2-D, one row per sample).
        feature_index (int): Index of the feature to split on.
        threshold (float): Threshold value for the split.

        Returns:
        tuple: Left (`feature <= threshold`) and right (`feature > threshold`)
        splits of the dataset.
        """
        dataset = np.asarray(dataset)
        column = dataset[:, feature_index]
        # Boolean masks replace the per-row Python loops. Both comparisons are
        # spelled out (instead of negating one mask) so rows whose feature is
        # NaN still land in neither side.
        return dataset[column <= threshold], dataset[column > threshold]

    def variance_reduction(self, parent, l_child, r_child):
        """
        Calculate variance reduction achieved by a split.

        Parameters:
        parent (ndarray): Target values of the parent node.
        l_child (ndarray): Target values of the left child.
        r_child (ndarray): Target values of the right child.

        Returns:
        float: Parent variance minus the size-weighted variance of the children.
        """
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        return reduction

    def calculate_leaf_value(self, Y):
        """
        Calculate the value of a leaf node.

        Parameters:
        Y (ndarray): Target values of the samples in the leaf.

        Returns:
        float: Mean of the target values.
        """
        return np.mean(Y)

    def print_tree(self, tree=None, indent=" "):
        """
        Print the decision tree.

        Parameters:
        tree (Node): The root node of the tree (default is None, which means the root of the current tree).
        indent (str): Indentation unit; a node at depth d is prefixed with d copies of it.
        """
        if tree is None:
            tree = self.root
        self._print_subtree(tree, indent, 1)

    def _print_subtree(self, node, indent, depth):
        """Print `node` and its descendants; children are prefixed with `indent * depth`."""
        if node.value is not None:
            print(node.value)
            return

        print(f"X_{node.feature_index} <= {node.threshold}? {node.var_red}")
        # The prefix grows by ONE indent unit per level. Doubling it per level
        # makes line length exponential in depth and floods the output.
        prefix = indent * depth
        print(f"{prefix}left:", end="")
        self._print_subtree(node.left, indent, depth + 1)
        print(f"{prefix}right:", end="")
        self._print_subtree(node.right, indent, depth + 1)

    def fit(self, X, Y):
        """
        Train the decision tree.

        Parameters:
        X (ndarray): Feature matrix of shape (n_samples, n_features).
        Y (ndarray): Target values, either 1-D (n_samples,) or a column (n_samples, 1).
        """
        Y = np.asarray(Y)
        if Y.ndim == 1:
            # concatenate(axis=1) needs a 2-D target column.
            Y = Y.reshape(-1, 1)
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)

    def make_prediction(self, x, tree):
        """
        Make a prediction for a single sample.

        Parameters:
        x (ndarray): Feature vector of the sample.
        tree (Node): The node to start the traversal from.

        Returns:
        float: Predicted value.
        """
        if tree.value is not None:
            return tree.value
        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

    def predict(self, X):
        """
        Predict target values for multiple samples.

        Parameters:
        X (ndarray): Feature matrix.

        Returns:
        list: Predicted values, one per row of X.
        """
        return [self.make_prediction(x, self.root) for x in X]


step 4: Define a class for Ensemble Regression Tree


In [26]:
class EnsembleRegressionTree:
    """
    Additive ensemble of regression trees fitted on successive residuals.

    The model starts from the mean of the targets; each tree is then trained
    on what the current ensemble still gets wrong, and its output is added to
    the running prediction.
    """

    def __init__(self, n_trees=5, min_samples_split=10):
        """
        Constructor for Ensemble Regression Tree.

        Parameters:
        n_trees (int): Number of trees in the ensemble.
        min_samples_split (int): Minimum number of samples required to split a node.
        """
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.trees = []  # List to store individual trees
        self.initial_prediction = None  # Initial prediction (mean of the target values)

    def fit(self, X, y):
        """
        Train the ensemble regression tree.

        Calling fit again discards the previously trained trees.

        Parameters:
        X (ndarray): Feature matrix.
        y (ndarray): Target values, either 1-D (n_samples,) or a column (n_samples, 1).
        """
        # Work on a flat target so 1-D and column-vector inputs behave alike.
        targets = np.asarray(y).ravel()

        # Start from scratch; otherwise a second fit would keep stacking new
        # trees on top of the old ones and predict() would sum all of them.
        self.trees = []

        # Initialize the prediction as the mean of the target values
        self.initial_prediction = np.mean(targets)
        predictions = np.full(targets.shape, self.initial_prediction)

        for _ in range(self.n_trees):
            # Residuals: what the ensemble built so far still gets wrong.
            residuals = targets - predictions

            # Train a new tree to predict the residuals (as a column vector).
            tree = MyDecisionTreeRegressor(min_samples_split=self.min_samples_split)
            tree.fit(X, residuals.reshape(-1, 1))
            self.trees.append(tree)

            # Update predictions with the current tree's output
            predictions += np.asarray(tree.predict(X)).ravel()

    def predict(self, X):
        """
        Predict target values for new data using the trained ensemble.

        Parameters:
        X (ndarray): Feature matrix.

        Returns:
        ndarray: Predicted values as a 1-D array with one entry per row of X.
        """
        X = np.asarray(X)
        predictions = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            predictions += np.asarray(tree.predict(X)).ravel()
        return predictions

    def print_tree(self):
        """
        Print the structure of all trees in the ensemble.
        """
        print(f"Initial prediction: {self.initial_prediction}")
        for i, tree in enumerate(self.trees):
            print(f"\nTree {i + 1}:")
            tree.print_tree()


Step 5: Find the best `min_samples_split` (msp) value via cross-validation

In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

def auto_tune_min_samples_split(X, y, n_trees, start_msp=18, step=2, max_msp=40, epsilon=1e-4, patience=3, mse_threshold=0.6):
    """
    Automatically tune the best `min_samples_split` for an ensemble regression tree.

    Candidates start_msp, start_msp + step, ... (up to and including max_msp)
    are each scored by the mean validation MSE of a shuffled 5-fold split
    (random_state=42, same folds for every candidate). The search stops early
    after `patience` steps that each either fail to beat the best MSE so far
    or plateau (MSE below `mse_threshold` and changed by less than `epsilon`
    since the previous candidate). An improving step resets the counter.

    Parameters:
    X (ndarray): Feature matrix.
    y (ndarray): Target values.
    n_trees (int): Number of trees in each ensemble.
    start_msp (int): First `min_samples_split` value to try.
    step (int): Increment between successive candidates.
    max_msp (int): Largest `min_samples_split` value to try.
    epsilon (float): Change in average MSE below which a step counts as a plateau.
    patience (int): Number of counted steps after which the search stops.
    mse_threshold (float): Plateau detection only applies below this average MSE.

    Returns:
    tuple: (best_msp, best_mse); (None, inf) when no candidate was evaluated.
    """
    best_msp = None
    best_mse = float('inf')
    previous_mse = float('inf')
    no_improvement_steps = 0
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    tried_msp = []  # Track all tried msp values for logging

    msp = start_msp

    while msp <= max_msp:
        mse_list = []

        # Perform K-Fold cross-validation
        for train_index, val_index in kf.split(X):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            # Train the ensemble with current min_samples_split
            ensemble = EnsembleRegressionTree(n_trees=n_trees, min_samples_split=msp)
            ensemble.fit(X_train, y_train)

            # Validate the model
            y_pred = ensemble.predict(X_val)
            mse_list.append(mean_squared_error(y_val, y_pred))

        # Calculate average MSE for current `min_samples_split`
        avg_mse = np.mean(mse_list)
        tried_msp.append((msp, avg_mse))
        print(f"min_samples_split={msp}, Avg MSE={avg_mse}, Fold MSEs={mse_list}")

        improved = avg_mse < best_mse
        plateaued = avg_mse < mse_threshold and abs(previous_mse - avg_mse) < epsilon

        # Update the best MSE and parameters if improved
        if improved:
            best_mse = avg_mse
            best_msp = msp
            no_improvement_steps = 0  # Reset patience counter

        # Count this candidate at most once toward patience, so `patience`
        # really means a number of steps (a step that both failed to improve
        # and plateaued used to be counted twice).
        if plateaued or not improved:
            no_improvement_steps += 1

        if no_improvement_steps >= patience:
            print("No significant improvement for", patience, "steps. Final stopping.")
            break

        previous_mse = avg_mse
        msp += step  # Increment `min_samples_split`

    # Log all tried values
    print("\nTried min_samples_split values and corresponding MSEs:")
    for msp_val, mse_val in tried_msp:
        print(f"min_samples_split={msp_val}, Avg MSE={mse_val}")

    return best_msp, best_mse


step 6: Main program for running the custom and ensemble regression tree models

In [15]:
    from sklearn.preprocessing import MinMaxScaler

    # Read the feature and ground-truth CSV files for the training and test splits.
    train, trainGT = pd.read_csv('train_x.csv'), pd.read_csv('train_y.csv')
    test, testGT = pd.read_csv('test_x.csv'), pd.read_csv('test_y.csv')

    # Features are every column except the last one of the *_x files.
    # NOTE(review): this drops the last column of train_x/test_x; confirm it is not a feature.
    X_train, X_test = (frame.iloc[:, :-1].values for frame in (train, test))

    # Targets are the last column of the *_y files, shaped as (n_samples, 1) column vectors.
    y_train, y_test = (frame.iloc[:, -1].values.reshape(-1, 1) for frame in (trainGT, testGT))

    # Min-max scale the features; the scaler is fitted on the training split only
    # and then applied unchanged to the test split.
    scaler = MinMaxScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    # Cross-validated search for the ensemble's min_samples_split, starting at 2.
    print("\n===== Auto-Tuning for Ensemble Regression Tree =====")
    best_msp_ensemble, best_ensemble_mse = auto_tune_min_samples_split(
        X_train, y_train, n_trees=5, start_msp=2
    )
    print(f"Best min_samples_split for ensemble regression tree: {best_msp_ensemble}, Best MSE: {best_ensemble_mse}")

    




===== Auto-Tuning for Ensemble Regression Tree =====
min_samples_split=2, Avg MSE=0.5712439576797841, Fold MSEs=[0.5373522927234916, 0.6340690167165753, 0.5039318480466096, 0.6076035389908367, 0.5732630919214068]
min_samples_split=4, Avg MSE=0.5598420212053671, Fold MSEs=[0.5148538542719803, 0.6240564528256615, 0.4974698741085892, 0.5905757374499255, 0.572254187370679]
min_samples_split=6, Avg MSE=0.5556263242652387, Fold MSEs=[0.5184067168330802, 0.6126470473009001, 0.4886126253529694, 0.5882366059623276, 0.5702286258769163]
min_samples_split=8, Avg MSE=0.5546729362825766, Fold MSEs=[0.5239644551410465, 0.6147590236149829, 0.48718644047600723, 0.5747438257773182, 0.572710936403528]
min_samples_split=10, Avg MSE=0.5519193441553465, Fold MSEs=[0.5265777540915385, 0.6164172105632053, 0.48758164881840754, 0.561349111842056, 0.5676709954615256]
min_samples_split=12, Avg MSE=0.5523972298701171, Fold MSEs=[0.5328375375891711, 0.6176423314937809, 0.48555133748025125, 0.559050054635783, 0.566

In [29]:
    # Load and preprocess data
    train = pd.read_csv('train_x.csv')
    trainGT = pd.read_csv('train_y.csv')
    test = pd.read_csv('test_x.csv')
    testGT = pd.read_csv('test_y.csv')

    # Extract features and targets from the datasets.
    # NOTE(review): `iloc[:, :-1]` drops the last column of the *_x files; confirm it is not a feature.
    X_train = train.iloc[:, :-1].values
    X_test = test.iloc[:, :-1].values
    # Targets become (n_samples, 1) column vectors; the custom tree concatenates
    # them to the feature matrix along axis 1.
    y_train = trainGT.iloc[:, -1].values.reshape(-1, 1)
    y_test = testGT.iloc[:, -1].values.reshape(-1, 1)

    print(f"y_train reshaped: {y_train.shape}")

    # Baselines: sklearn's tree and the custom tree, both with default settings
    # (neither uses the tuned min_samples_split).
    print("\n===== Training Models with Tuned Parameters =====")
    # Fixed seed so the sklearn baseline is reproducible across runs.
    sklearn_tree = DecisionTreeRegressor(random_state=42)
    sklearn_tree.fit(X_train, y_train)

    my_tree = MyDecisionTreeRegressor()
    my_tree.fit(X_train, y_train)

    # Predictions for sklearn and custom decision trees
    y_pred_sklearn = sklearn_tree.predict(X_test)
    y_pred_custom = my_tree.predict(X_test)

    # Ensemble: 5 trees with the min_samples_split found by the auto-tuning cell.
    # `best_msp_ensemble` is defined there, so that cell must be run first.
    print("\n===== Training Ensemble Regression Tree with Fixed Parameters =====")
    ensemble = EnsembleRegressionTree(n_trees=5, min_samples_split=best_msp_ensemble)
    ensemble.fit(X_train, y_train)
    y_pred_ensemble = ensemble.predict(X_test)

    # Print predictions and performance comparison
    print("\n===== Model Performance =====")
    print("MSE (sklearn):", mean_squared_error(y_test, y_pred_sklearn))
    print("MSE (custom):", mean_squared_error(y_test, y_pred_custom))

    # Report the actual tuned value instead of the literal text "best msp".
    print(f"MSE (ensemble with parameters {{'n_trees': 5, 'msp': {best_msp_ensemble}}}):",
          mean_squared_error(y_test, y_pred_ensemble))

y_train reshaped: (5196, 1)

===== Training Models with Tuned Parameters =====

===== Training Ensemble Regression Tree with Fixed Parameters =====

===== Model Performance =====
MSE (sklearn): 0.6674364896073903
MSE (custom): 0.6674364896073903
MSE (ensemble with parameters {'n_trees': 5, 'msp': 10}): 0.7103643014814119


Step 7: Print the tree structure

In [25]:
    # Dump the structure of the single custom tree (`my_tree`) fitted in the training cell.
    # NOTE: print_tree() writes output for every node of the tree; the recorded run of
    # this cell was cut off by Jupyter's IOPub data rate limit (see the output below).
    print("-------my_tree structure-------")
    my_tree.print_tree()

    # Optional: also dump every tree of the ensemble (left disabled).
    # print("-------ensemble_tree structure-------")
    # ensemble.print_tree()



IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

