In [2]:
import numpy as np
import pandas as pd

In [3]:
def MSE(y: pd.Series) -> float:
    """Return the mean squared deviation of ``y`` from its own mean.

    The sum of squared deviations is divided by ``y.size`` (population
    variance, not the sample variance).

    Args:
        y: Numeric target values.

    Returns:
        The mean squared error around the mean as a Python ``float``.
        An empty series yields ``0.0`` so that an empty partition adds
        nothing to a weighted sum of partition errors.
    """
    values = np.asarray(y, dtype=float)
    if values.size == 0:
        return 0.0
    # Vectorised on purpose: the split search evaluates this function for
    # every candidate cut, so a Python-level loop over the Series is costly.
    deviations = values - values.mean()
    return float(np.mean(np.square(deviations)))

In [4]:
def MSER_real(x: pd.Series, y: pd.Series, index: int) -> float:
    """Return the MSE reduction obtained by cutting the x-sorted samples.

    The samples are ordered by ascending ``x``; the first ``index + 1``
    targets form the lower partition and the remaining targets form the
    upper partition.

    Args:
        x: Feature values.
        y: Target values paired with ``x``.
        index: 0-based position, in x-sorted order, of the last sample that
            belongs to the lower partition.

    Returns:
        ``MSE(y)`` minus the size-weighted sum of the two partitions' MSEs.
    """
    total = y.size
    targets_by_x = pd.DataFrame({'x': x, 'y': y}).sort_values(by='x', ascending=True)['y']

    cut = index + 1
    lower = targets_by_x.iloc[:cut]
    upper = targets_by_x.iloc[cut:]

    # Each partition's error is weighted by its share of the samples.
    weighted_error = (lower.size / total) * MSE(lower) + (upper.size / total) * MSE(upper)
    return MSE(y) - weighted_error

In [5]:
def best_split_real_basedon_MSER(x: pd.Series, y: pd.Series)-> tuple[int, float] :
    """Find the cut position on one feature that maximises MSE reduction.

    Positions refer to the samples ordered by ascending ``x`` (the same
    ordering ``MSER_real`` uses). Only cuts that a threshold on ``x`` can
    actually realise are scored, i.e. positions where the next sorted value
    is strictly larger. The last position (everything in the lower
    partition, reduction 0) is always included as the "no split" option.

    Args:
        x: Feature values.
        y: Target values paired with ``x``.

    Returns:
        ``(position, reduction)`` for the first position with the largest
        reduction.

    Raises:
        ValueError: If ``y`` is empty.
    """
    n_samples = y.size
    if n_samples == 0:
        raise ValueError("cannot search for a split on an empty target series")

    # Build the ordering the same way MSER_real does so positions line up.
    sorted_x = (
        pd.DataFrame({'x': x, 'y': y})
        .sort_values(by='x', ascending=True)['x']
        .to_numpy()
    )

    # A cut between two equal feature values cannot be expressed as
    # "x <= threshold", so scoring it would report a fictitious reduction.
    candidates = [i for i in range(n_samples - 1) if sorted_x[i] < sorted_x[i + 1]]
    candidates.append(n_samples - 1)

    reductions = [MSER_real(x, y, i) for i in candidates]
    best_reduction = max(reductions)
    return candidates[reductions.index(best_reduction)], best_reduction

In [6]:
def best_attr_real_basedon_MSER(X: pd.DataFrame, y:pd.Series)-> tuple[object, int]:
    """Pick the column of ``X`` whose best split reduces the MSE the most.

    Args:
        X: Feature table; every column is searched with
            ``best_split_real_basedon_MSER``.
        y: Target values paired with the rows of ``X``.

    Returns:
        ``(column_label, split_position)`` of the first column achieving the
        largest reduction, where ``split_position`` is the position returned
        by ``best_split_real_basedon_MSER`` for that column.

    Raises:
        ValueError: If ``X`` has no columns.
    """
    best_attr = None
    best_position = None
    best_reduction = None

    for attr in X.columns:
        # One search per column: the split search is the expensive part, so
        # unpack both results from a single call instead of calling twice.
        position, reduction = best_split_real_basedon_MSER(X[attr], y)
        # Strict '>' keeps the first column on ties.
        if best_reduction is None or reduction > best_reduction:
            best_attr, best_position, best_reduction = attr, position, reduction

    if best_reduction is None:
        raise ValueError("X has no columns to split on")

    return best_attr, best_position

In [7]:
def build_tree_real(X, y, max_depth, current_depth=0):
    """Recursively build a regression tree as nested dictionaries.

    Args:
        X: Feature DataFrame (numeric columns, since thresholds are midpoints).
        y: Target Series paired with the rows of ``X``.
        max_depth: Depth at which growth stops and a leaf is emitted.
        current_depth: Depth of the node being built (0 for the root).

    Returns:
        A leaf ``{'Class': prediction, 'entropy': mse}`` or an internal node
        with keys ``'attribute'``, ``'split_value'``, ``'entropy'``,
        ``'nodes'`` (children under ``'<='`` and ``'>'``), ``'output'`` and
        ``'samples'``. The ``'entropy'`` entry holds the node's MSE.
    """
    node_mse = MSE(y)

    # Pure node: every target is the same value.
    if len(y.unique()) == 1:
        return {'Class': y.iloc[0], 'entropy': node_mse}

    # Regression leaves predict the mean, matching 'output' on internal
    # nodes; the mode is a classification statistic and fails on empty y.
    if X.empty or current_depth == max_depth:
        return {'Class': y.mean(), 'entropy': node_mse}

    # Best attribute and the cut position within its sorted values.
    best_attr, split_index = best_attr_real_basedon_MSER(X, y)

    # split_index is a row position in x-sorted order, so it must index the
    # sorted values including duplicates, not the sorted unique values.
    sorted_vals = np.sort(X[best_attr].to_numpy())
    if split_index + 1 >= sorted_vals.size:
        # The cut sits after the last sample: nothing to separate.
        return {'Class': y.mean(), 'entropy': node_mse}

    # Threshold is the midpoint between the two samples around the cut.
    split_val = (sorted_vals[split_index] + sorted_vals[split_index + 1]) / 2

    left_mask = X[best_attr] <= split_val
    right_mask = X[best_attr] > split_val

    X_left, y_left = X[left_mask], y[left_mask]
    X_right, y_right = X[right_mask], y[right_mask]

    # A threshold that leaves one side empty would only recurse on the same
    # samples until max_depth is hit, so stop here with a leaf instead.
    if y_left.empty or y_right.empty:
        return {'Class': y.mean(), 'entropy': node_mse}

    tree = {
        'attribute': best_attr,
        'split_value': split_val,
        'entropy': node_mse,
        'nodes': {},
        'output': y.mean(),
        'samples': y.size
    }

    tree['nodes']['<='] = build_tree_real(X_left, y_left, max_depth, current_depth + 1)
    tree['nodes']['>'] = build_tree_real(X_right, y_right, max_depth, current_depth + 1)

    return tree


In [8]:
from graphviz import Digraph

def visualize_decision_tree(tree):
    """Convert a nested tree dictionary into a graphviz ``Digraph``.

    Nodes are numbered in pre-order starting at "0". A dictionary holding a
    'Class' key is drawn as a leaf; any other dictionary is drawn as an
    internal node showing its attribute, output, sample count and MSE.
    Edges to children carry the branch condition, combined with the node's
    split value when one is present.

    Args:
        tree: Root node dictionary.

    Returns:
        The populated ``Digraph``.
    """
    graph = Digraph()
    graph.attr('node', shape='box', style='filled', fillcolor='lightyellow', fontname='helvetica')
    next_id = 0

    def node_text(node):
        # Leaves show only their prediction.
        if 'Class' in node:
            return f"y = {node['Class']}"
        return (
            f"X[{node['attribute']}]\n"
            f"y = {node['output']}\n"
            f"Samples: {node['samples']}\n"
            f"MSE = {node['entropy']:.4f}"
        )

    def edge_text(condition, threshold):
        # Without a threshold, or for an unknown condition, show the key as is.
        if threshold is None:
            return str(condition)
        if condition == '<=':
            return f"<= {threshold:.4f}"
        if condition == '>':
            return f"> {threshold:.4f}"
        return str(condition)

    def walk(node, parent_id=None, incoming=None):
        nonlocal next_id
        this_id = str(next_id)
        text = node_text(node)
        graph.node(this_id, text)

        # Every node except the root is linked to its parent.
        if parent_id is not None:
            graph.edge(parent_id, this_id, label=str(incoming))

        next_id += 1

        if 'nodes' in node:
            for condition, child in node['nodes'].items():
                walk(child, this_id, edge_text(condition, node.get('split_value', None)))

    walk(tree)
    return graph


In [10]:
# Toy data set: one feature column 'x1' and a six-value target.
x1 = pd.Series([1,2,3,4,5,6])
y = pd.Series([0,0,1,1,2,2])
X = pd.DataFrame({'x1':x1})

# Grow the regression tree (depth capped at 10) and turn it into a graph.
tree = build_tree_real(X, y, max_depth=10)
dot = visualize_decision_tree(tree)
# Render the graph under the name 'my_decision_tree' with format 'png'.
dot.render('my_decision_tree', format='png', cleanup=True)
# NOTE(review): view() is a separate call from the PNG render above; the
# recorded cell output shows it returning 'my_decision_tree.pdf'.
dot.view()

'my_decision_tree.pdf'