In [1]:
import pandas as pd
import numpy as np

In [2]:
def MSE(y: pd.Series) -> float:
    """Return the mean squared deviation of ``y`` from its own mean.

    The sum of squared deviations is divided by ``y.size`` (not
    ``y.size - 1``), i.e. this is the population variance of the values.

    Parameters
    ----------
    y : pd.Series
        Numeric target values.

    Returns
    -------
    float
        ``sum((y_i - y.mean()) ** 2) / y.size``, or ``0.0`` when ``y`` has
        no elements.
    """
    if y.size == 0:
        # Nothing to measure; return a float so the annotation holds.
        return 0.0

    # Vectorised equivalent of accumulating the squared deviations in a loop.
    deviations = y.to_numpy() - y.mean()
    return float(np.square(deviations).sum() / y.size)

In [3]:
# Small hand-written sample of target values used to try out MSE below.
ar = [20,24,40,50,60,10,4,10,60,40,45,40,35,20]
y = pd.Series(ar)

In [4]:
# Evaluate MSE on the sample series; the value is shown as the cell output.
MSE(y)

311.34693877551024

In [5]:
def MSER(x: pd.Series, y: pd.Series) -> float:
    """Return the MSE reduction obtained by splitting ``y`` on ``x``.

    For every category of ``x`` the MSE of the matching ``y`` values is
    weighted by the fraction of rows in that category; the weighted sum is
    subtracted from the MSE of the whole of ``y``.

    Parameters
    ----------
    x : pd.Series
        Feature column; must have a categorical dtype because its
        ``.cat.categories`` are iterated.
    y : pd.Series
        Target values, paired with ``x`` through a shared DataFrame.

    Returns
    -------
    float
        ``MSE(y) - sum_c P(x == c) * MSE(y[x == c])``.
    """
    paired = pd.DataFrame({'x': x, 'y': y})
    parent_mse = MSE(y)
    total_rows = x.size

    weighted_child_mse = 0
    for category in x.cat.categories:
        targets_in_category = paired.loc[paired['x'] == category, 'y']
        weight = targets_in_category.size / total_rows
        weighted_child_mse = weighted_child_mse + weight * MSE(targets_in_category)

    return parent_mse - weighted_child_mse

In [6]:
def best_node_basedOn_high_MSER(X: pd.DataFrame, y: pd.Series) -> int:
    """Return the position of the column in ``X`` with the largest MSER.

    Parameters
    ----------
    X : pd.DataFrame
        Feature columns; each one is passed to ``MSER`` together with ``y``.
    y : pd.Series
        Target values.

    Returns
    -------
    int
        Positional index into ``X.columns`` of the first column whose MSER
        equals the maximum.
    """
    reductions = [MSER(X[column], y) for column in X.columns]
    best_reduction = max(reductions)
    return reductions.index(best_reduction)

In [7]:
# Build the regression tree recursively

def build_tree(X1, y1, max_depth, current_depth=0):
    """Recursively build a regression tree over categorical features.

    At each node the column with the highest MSE reduction (as chosen by
    ``best_node_basedOn_high_MSER``) is used to split, one branch per
    category of that column, and the column is dropped for the subtrees.

    Parameters
    ----------
    X1 : pd.DataFrame
        Feature columns; the chosen split column must be categorical because
        its ``.cat.categories`` are iterated.
    y1 : pd.Series
        Target values for the rows of ``X1``.
    max_depth : int or None
        Depth at which nodes become leaves. ``None`` disables the depth
        limit; recursion then stops when the features run out.
    current_depth : int, default 0
        Depth of the node being built; incremented on each recursive call.

    Returns
    -------
    dict
        A leaf ``{'Class': prediction, 'mse': node_mse}`` or an internal node
        ``{'attribute', 'mse', 'nodes', 'output', 'samples'}`` where
        ``'nodes'`` maps each category value to a subtree and ``'output'`` is
        the mean target of the node.
    """
    X, y = X1, y1
    node_mse = MSE(y)

    # Pure node: every target value is identical, so predict that value.
    if len(y.unique()) == 1:
        return {'Class': y.iloc[0], 'mse': node_mse}

    # Nothing left to split on, or depth budget used up: predict the mean.
    # `>=` (rather than `==`) also stops when started below the limit.
    depth_exhausted = max_depth is not None and current_depth >= max_depth
    if X.empty or depth_exhausted:
        return {'Class': y.mean(), 'mse': node_mse}

    # 1. Pick the attribute with the highest MSE reduction.
    best_attr_idx = best_node_basedOn_high_MSER(X, y)
    best_attr = X.columns[best_attr_idx]

    # 2. Internal node. 'output' is the mean target, matching what leaves
    #    store under 'Class'; the mode is not meaningful for continuous y.
    tree = {
        'attribute': best_attr,
        'mse': node_mse,
        'nodes': {},
        'output': y.mean(),
        'samples': y.size,
    }

    # 3. One branch per category of the chosen attribute.
    for attr_val in X[best_attr].cat.categories:
        in_branch = X[best_attr] == attr_val
        X_subset = X[in_branch].drop(columns=[best_attr])
        y_subset = y[in_branch]

        if y_subset.empty:
            # 4. Category has no rows here: fall back to this node's mean
            #    and record an MSE of 0 for the empty branch.
            tree['nodes'][attr_val] = {'Class': y.mean(), 'mse': 0}
        else:
            # 5. Otherwise recurse on the rows that fall in this branch.
            tree['nodes'][attr_val] = build_tree(
                X_subset, y_subset, max_depth, current_depth + 1
            )

    return tree


In [8]:
from graphviz import Digraph

def visualize_decision_tree(tree):
    """Render a tree produced by ``build_tree`` as a graphviz ``Digraph``.

    Nodes are numbered in pre-order starting from ``"0"``. Leaves (dicts
    containing ``'Class'``) show the prediction and MSE; internal nodes show
    the split attribute, node output, sample count and MSE. Each edge is
    labelled with the category value that leads to the child.

    Parameters
    ----------
    tree : dict
        Nested dictionary with either a ``'Class'`` key (leaf) or the keys
        ``'attribute'``, ``'output'``, ``'samples'``, ``'mse'`` and
        optionally ``'nodes'``.

    Returns
    -------
    Digraph
        The populated graph; the caller decides whether to render or view it.
    """
    dot = Digraph()
    dot.attr('node', shape='box', style='filled', fillcolor='lightyellow', fontname='helvetica')

    next_id = 0  # running counter that hands out node identifiers

    def draw(tree, parent=None, edge_label=None):
        nonlocal next_id
        current_id = str(next_id)
        next_id += 1

        # Leaf and internal nodes carry different keys, hence two labels.
        if 'Class' in tree:
            label = f"y = {tree['Class']}\n MSE = {tree['mse']:.4f}"
        else:
            label = f"X[{tree['attribute']}]\ny = {tree['output']}\nSamples: {tree['samples']}\nMSE = {tree['mse']:.4f}"
        dot.node(current_id, label)

        # Every node except the root is linked to its parent.
        if parent is not None:
            dot.edge(parent, current_id, label=str(edge_label))

        # Descend into the children, if this node has any.
        for attr_val, subtree in tree.get('nodes', {}).items():
            draw(subtree, current_id, attr_val)

    draw(tree)
    return dot

In [9]:
# Fix the global NumPy seed so the synthetic data below is reproducible.
np.random.seed(42)
N = 10  # number of rows
P = 3   # exclusive upper bound for randint, i.e. category values 0..P-1
# Three categorical feature columns (keys 0, 1, 2), each with N random integers.
X = pd.DataFrame({i: pd.Series(np.random.randint(P, size=N), dtype="category") for i in range(3)})
# Random float targets, one per row.
y = pd.Series(np.random.rand(N))


# Build the regression tree on the synthetic data.
tree = build_tree(X, y, max_depth=10)

# Convert the nested-dict tree into a graphviz graph.
dot = visualize_decision_tree(tree)

# Save to a PNG file (other formats such as pdf or svg can be requested instead).
dot.render('my_decision_tree_DIRO', format='png', cleanup=True)

# Or open it in a viewer immediately (requires Graphviz installed).
# NOTE(review): the recorded cell output is a '.pdf' path, so this call appears
# to render a separate PDF rather than reuse the PNG above — confirm.
dot.view()


'my_decision_tree_DIRO.pdf'

In [10]:
# Show features and target side by side. Copy first: `df = X` would only
# alias the feature matrix, so adding the target column would also change X.
df = X.copy()
df['3'] = y
df

Unnamed: 0,0,1,2,3
0,2,2,0,0.139494
1,0,2,0,0.292145
2,2,0,1,0.366362
3,2,2,1,0.45607
4,0,1,0,0.785176
5,0,0,0,0.199674
6,2,1,0,0.514234
7,1,1,2,0.592415
8,2,1,2,0.04645
9,2,1,2,0.607545
