In [1]:
import numpy as np
import pandas as pd

In [4]:
def entropy(y: pd.Series) -> float:
    """Return the Shannon entropy (in bits) of the labels in ``y``.

    Parameters
    ----------
    y : pd.Series
        Class labels. A categorical Series is summed over its declared
        categories (unobserved categories contribute nothing); any other
        dtype is summed over the values that actually occur.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the classes with non-zero probability,
        where ``p`` is the class count divided by ``y.size``.
        ``0.0`` for an empty Series.
    """
    total = y.size
    if total == 0:
        # No samples means no uncertainty; always hand back a float.
        return 0.0

    counts = y.value_counts()
    if isinstance(y.dtype, pd.CategoricalDtype):
        # Walk the categories in their declared order so the floating-point
        # summation order does not depend on how value_counts sorts.
        class_counts = [counts.get(cls, 0) for cls in y.cat.categories]
    else:
        # Plain (non-categorical) labels have no `.cat` accessor; the
        # observed counts are all that is needed.
        class_counts = counts.tolist()

    ent = 0.0
    for count_for_cls in class_counts:
        prob = count_for_cls / total
        if prob > 0:  # skip empty classes: log2(0) is undefined
            ent += -prob * np.log2(prob)

    return float(ent)

In [5]:
def inf_gain_real(x: pd.Series, y: pd.Series, index: int) ->float: #After Index
    """Information gain of cutting the samples right after sorted position ``index``.

    ``x`` and ``y`` are combined into one frame and ordered by ascending
    ``x``. The labels at sorted positions ``0..index`` form the first group
    and the remaining labels form the second. The result is the entropy of
    ``y`` minus the size-weighted entropy of the two groups.
    """
    labels_by_x = pd.DataFrame({'x': x, 'y': y}).sort_values(by=['x'], ascending=True)['y']

    cut = index + 1
    head = labels_by_x.iloc[0:cut]
    tail = labels_by_x.iloc[cut:]

    total = y.size
    weighted = (head.size / total) * entropy(head) + (tail.size / total) * entropy(tail)
    return entropy(y) - weighted



In [6]:
def best_split_real_basedon_maxInfoGain(x: pd.Series, y: pd.Series)-> tuple[int, float] :
    """Find the sorted position whose cut gives the largest information gain.

    Every position ``0 .. y.size - 1`` is scored with ``inf_gain_real``.
    Returns ``(position, gain)``; on ties the earliest position wins.
    """
    gains = []
    for position in range(y.size):
        gains.append(inf_gain_real(x, y, position))

    top_gain = max(gains)
    return gains.index(top_gain), top_gain

In [7]:
def best_attr_real_basedon_maxInfoGain(X: pd.DataFrame, y: pd.Series) -> tuple[object, int]:
    """Pick the column of ``X`` whose best split has the highest information gain.

    Parameters
    ----------
    X : pd.DataFrame
        Feature columns to compare.
    y : pd.Series
        Labels passed through to ``best_split_real_basedon_maxInfoGain``.

    Returns
    -------
    tuple
        ``(column_label, split_index)`` for the winning column, where
        ``split_index`` is the position returned by the split search.
        When several columns tie, the first one in ``X.columns`` wins.

    Raises
    ------
    ValueError
        If ``X`` has no columns.
    """
    attr_array = []
    split_array = []
    gain_array = []
    for attr in X.columns:
        # Run the split search once per column and keep both results,
        # instead of repeating the whole search for each value.
        split_index, gain = best_split_real_basedon_maxInfoGain(X[attr], y)
        attr_array.append(attr)
        split_array.append(split_index)
        gain_array.append(gain)

    if not gain_array:
        raise ValueError("X has no columns to choose a split attribute from")

    winner = gain_array.index(max(gain_array))
    return attr_array[winner], split_array[winner]
    

In [8]:
def build_tree_real(X, y, max_depth, current_depth=0):
    """Recursively grow a binary decision tree on real-valued features.

    Parameters
    ----------
    X : pd.DataFrame
        Feature columns; rows correspond to the entries of ``y``.
    y : pd.Series
        Class labels.
    max_depth : int
        Recursion stops once ``current_depth`` equals this value.
    current_depth : int, default 0
        Depth of the node being built.

    Returns
    -------
    dict
        A leaf ``{'Class', 'entropy'}`` or an internal node with the keys
        ``'attribute'``, ``'split_value'``, ``'entropy'``, ``'nodes'``
        (children under ``'<='`` and ``'>'``), ``'output'`` (majority
        label) and ``'samples'``.
    """
    node_entropy = entropy(y)

    # Pure node: every sample carries the same label.
    if len(y.unique()) == 1:
        return {'Class': y.iloc[0], 'entropy': node_entropy}

    # Nothing left to split on, or the depth budget is used up.
    if X.empty or current_depth == max_depth:
        return {'Class': y.mode()[0], 'entropy': node_entropy}

    # Find best attribute and split index (first attribute wins on ties).
    best_attr, split_index = None, None
    max_info_gain = -float('inf')

    for attr in X.columns:
        j, info_gain = best_split_real_basedon_maxInfoGain(X[attr], y)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_attr = attr
            split_index = j

    majority = y.mode()[0]
    leaf = {'Class': majority, 'entropy': node_entropy}

    if best_attr is None:
        # No attribute produced a comparable gain.
        return leaf

    # split_index is a position among the rows sorted by the attribute
    # (duplicates included), so the threshold must be read from the full
    # sorted column. Indexing the sorted *unique* values instead picks the
    # wrong neighbours, or runs off the end, whenever values repeat.
    sorted_vals = sorted(X[best_attr].to_numpy())
    if split_index + 1 >= len(sorted_vals):
        # The best "split" keeps every row on one side: nothing to separate.
        return leaf
    split_val = (sorted_vals[split_index] + sorted_vals[split_index + 1]) / 2

    left_mask = X[best_attr] <= split_val
    right_mask = X[best_attr] > split_val

    X_left, y_left = X[left_mask], y[left_mask]
    X_right, y_right = X[right_mask], y[right_mask]

    if y_left.empty or y_right.empty:
        # The threshold does not separate the samples (e.g. it landed on
        # repeated values). Recursing would only rebuild this same node
        # until max_depth, so stop with the majority label.
        return leaf

    return {
        'attribute': best_attr,
        'split_value': split_val,
        'entropy': node_entropy,
        'nodes': {
            '<=': build_tree_real(X_left, y_left, max_depth, current_depth + 1),
            '>': build_tree_real(X_right, y_right, max_depth, current_depth + 1),
        },
        'output': majority,
        'samples': y.size
    }


In [9]:
from graphviz import Digraph

def visualize_decision_tree(tree):
    """Turn a nested decision-tree dict into a Graphviz ``Digraph``.

    Nodes are numbered in pre-order starting at ``"0"``. A dict containing
    ``'Class'`` is drawn as a leaf showing the class. Any other dict is drawn
    with its attribute, majority output, sample count and entropy. Edges to
    the children in ``'nodes'`` are labelled with the branch condition and,
    when the node has a ``'split_value'``, the threshold.
    """
    graph = Digraph()
    graph.attr('node', shape='box', style='filled', fillcolor='lightyellow', fontname='helvetica')
    next_id = 0

    def describe(node):
        # Text shown inside the box for a leaf or an internal node.
        if 'Class' in node:
            return f"y = {node['Class']}"
        return (
            f"X[{node['attribute']}]\n"
            f"y = {node['output']}\n"
            f"Samples: {node['samples']}\n"
            f"Entropy: {node['entropy']:.4f}"
        )

    def branch_text(condition, threshold):
        # Edge label: the condition, followed by the threshold when known.
        if threshold is None:
            return str(condition)
        if condition == '<=':
            return f"<= {threshold:.4f}"
        if condition == '>':
            return f"> {threshold:.4f}"
        return str(condition)

    def emit(node, parent_id=None, edge_text=None):
        nonlocal next_id
        own_id = str(next_id)
        next_id += 1

        graph.node(own_id, describe(node))

        # Every node except the root hangs off its parent.
        if parent_id is not None:
            graph.edge(parent_id, own_id, label=str(edge_text))

        if 'nodes' not in node:
            return

        threshold = node.get('split_value', None)
        for condition, child in node['nodes'].items():
            emit(child, own_id, branch_text(condition, threshold))

    emit(tree)
    return graph


In [10]:
# Toy example: one numeric feature and a binary label.
# The labels use the "category" dtype because entropy() reads y.cat.categories.
x1 = pd.Series([40,48,60,72,80,90])
y = pd.Series([0,0,1,1,1,0], dtype="category")
X = pd.DataFrame({'x1':x1})

# Grow the tree (depth capped at 10) and convert it to a Graphviz graph.
tree = build_tree_real(X, y, max_depth=10)
dot = visualize_decision_tree(tree)
# Render to PNG under the name 'my_decision_tree'.
dot.render('my_decision_tree', format='png', cleanup=True)
# NOTE(review): view() is called without a format; the recorded cell output is
# 'my_decision_tree.pdf', so it apparently renders a PDF as well and opens it.
dot.view()

'my_decision_tree.pdf'

In [11]:
# Random example, seeded so the generated data is reproducible.
np.random.seed(42)
N = 10  # number of samples (rows)
P = 3   # labels are drawn from the integers 0 .. P-1
# Three feature columns keyed 0, 1, 2; the column count is the literal 3 here, not P.
X = pd.DataFrame({i: pd.Series(np.random.rand(N)) for i in range(3)})
# The labels use the "category" dtype because entropy() reads y.cat.categories.
y = pd.Series(np.random.randint(P, size=N), dtype="category")

# Grow the tree (depth capped at 10) and convert it to a Graphviz graph.
tree = build_tree_real(X, y, max_depth=10)
dot = visualize_decision_tree(tree)
# Same output name as the single-feature example, so that run's files are
# presumably overwritten.
dot.render('my_decision_tree', format='png', cleanup=True)
# NOTE(review): view() is called without a format; the recorded cell output is
# 'my_decision_tree.pdf', so it apparently renders a PDF as well and opens it.
dot.view()

'my_decision_tree.pdf'

In [12]:
# Show the features next to their labels.
# Work on a copy: `df = X` would only alias X, so adding the label column would
# also add it to the feature matrix and leak the target into any later
# build_tree_real(X, y, ...) call.
df = X.copy()
df['3'] = y  # label column; note the key is the string '3', not the int 3
df

Unnamed: 0,0,1,2,3
0,0.37454,0.020584,0.611853,2
1,0.950714,0.96991,0.139494,2
2,0.731994,0.832443,0.292145,0
3,0.598658,0.212339,0.366362,0
4,0.156019,0.181825,0.45607,2
5,0.155995,0.183405,0.785176,1
6,0.058084,0.304242,0.199674,0
7,0.866176,0.524756,0.514234,1
8,0.601115,0.431945,0.592415,1
9,0.708073,0.291229,0.04645,1
