In [4]:
import pandas as pd
import numpy as np

$\textbf{Check If Real}$

In [5]:
def check_ifreal(y: pd.Series) -> bool:
    """Tell whether a series stores real-valued (floating point) data.

    Args:
        y: Series whose ``dtype`` is inspected; its values are not read.

    Returns:
        True when ``y.dtype`` compares equal to one of the dtype names
        'float64', 'float32', 'float16' or 'float'; False otherwise.
    """
    float_dtype_names = ('float64', 'float32', 'float16', 'float')
    # Compare with == (not a set lookup) so the dtype's own string
    # comparison rules decide the match.
    return any(y.dtype == name for name in float_dtype_names)
        

$\textbf{Entropy}$

In [6]:
def entropy(y: pd.Series) -> float:
    """Compute the base-2 Shannon entropy of the labels in ``y``.

    Accepts categorical and non-categorical series alike: class
    frequencies come from ``value_counts`` rather than from ``y.cat``.
    Categories that are declared but never observed contribute nothing.

    Args:
        y: Series of class labels.

    Returns:
        ``-sum(p * log2(p))`` over the observed classes, where ``p`` is the
        class count divided by ``y.size``. Always a Python float; ``0.0``
        for an empty series.
    """
    total = y.size
    if total == 0:
        # No samples means no uncertainty; also avoids dividing by zero.
        return 0.0

    ent = 0.0
    # sort=False skips the frequency sort, which is irrelevant to the sum.
    for count in y.value_counts(sort=False):
        # Categorical dtypes report unused categories with a zero count;
        # skip them because log2(0) is undefined.
        if count > 0:
            prob = count / total
            ent -= prob * np.log2(prob)

    return float(ent)

In [7]:
def information_gain_entropy(x: pd.Series, y: pd.Series) -> float:
    """Information gain obtained by splitting ``y`` on the attribute ``x``.

    Args:
        x: Categorical attribute column (its ``.cat.categories`` are the
            candidate branches).
        y: Labels; passed to ``entropy`` as a whole and per branch.

    Returns:
        ``entropy(y)`` minus the size-weighted sum of the entropies of the
        label subsets selected by each category of ``x``.
    """
    parent_entropy = entropy(y)
    categories = x.cat.categories
    # Put both series in one frame so a single row filter selects the
    # matching labels for a category.
    paired = pd.DataFrame({'x': x, 'y': y})

    def weighted_branch_entropy(category):
        branch = paired[paired['x'] == category]
        return (branch['x'].size / x.size) * entropy(branch['y'])

    return parent_entropy - sum(weighted_branch_entropy(c) for c in categories)

    

In [8]:
def best_node_basedOn_HighInformationGain(X: pd.DataFrame, y: pd.Series) -> int:
    """Pick the attribute whose split yields the highest information gain.

    Args:
        X: Feature frame; every column is scored with
            ``information_gain_entropy`` against ``y``.
        y: Labels.

    Returns:
        Position (within ``X.columns``) of the best-scoring column. When
        several columns tie, the first one wins.
    """
    gains = [information_gain_entropy(X[column], y) for column in X.columns]
    return gains.index(max(gains))

In [9]:
#Making A Tree

def build_tree(X1, y1, max_depth, current_depth=0):
    """Recursively grow a decision tree as nested dictionaries.

    Args:
        X1: Feature frame; the chosen split column is read through
            ``.cat.categories``.
        y1: Labels for the rows of ``X1``.
        max_depth: Depth at which a leaf is forced.
        current_depth: Depth of the node being built (0 at the root).

    Returns:
        A leaf ``{'Class', 'entropy'}`` or an internal node
        ``{'attribute', 'entropy', 'nodes', 'output', 'samples'}`` where
        ``'nodes'`` maps each category of the split attribute to a subtree.
    """
    here_entropy = entropy(y1)

    # Pure node: only one label present.
    if len(y1.unique()) == 1:
        return {'Class': y1.iloc[0], 'entropy': here_entropy}

    # Nothing left to split on, or depth budget spent: majority-vote leaf.
    if X1.empty or current_depth == max_depth:
        return {'Class': y1.mode()[0], 'entropy': here_entropy}

    split_col = X1.columns[best_node_basedOn_HighInformationGain(X1, y1)]
    majority = y1.mode()[0]
    children = {}
    node = {
        'attribute': split_col,
        'entropy': here_entropy,
        'nodes': children,
        'output': majority,
        'samples': y1.size,
    }

    # One branch per declared category, including categories that have no
    # rows at this node.
    for value in X1[split_col].cat.categories:
        in_branch = X1[split_col] == value
        branch_X = X1[in_branch].drop(columns=[split_col])
        branch_y = y1[in_branch]

        if branch_y.empty:
            # No rows reached this branch: fall back to the parent's majority.
            children[value] = {'Class': majority, 'entropy': 0}
        else:
            children[value] = build_tree(branch_X, branch_y, max_depth, current_depth + 1)

    return node



        


In [14]:
from graphviz import Digraph

def visualize_decision_tree(tree):
    """Turn a nested-dict decision tree into a graphviz ``Digraph``.

    Args:
        tree: A leaf dict (has a 'Class' key) or an internal dict with
            'attribute', 'output', 'samples', 'entropy' and 'nodes'.

    Returns:
        The populated ``Digraph``. Nodes are numbered "0", "1", ... in
        pre-order; each edge is labelled with the attribute value that
        leads to the child.
    """
    graph = Digraph()
    graph.attr('node', shape='box', style='filled', fillcolor='lightyellow', fontname='helvetica')

    next_id = 0
    # Explicit stack of (subtree, parent id, edge label); the root has no parent.
    pending = [(tree, None, None)]
    while pending:
        node, parent_id, branch_value = pending.pop()
        this_id = str(next_id)
        next_id += 1

        if 'Class' in node:
            text = f"y = {node['Class']}"
        else:
            text = f"X[{node['attribute']}]\ny = {node['output']}\nSamples: {node['samples']}\nEntropy: {node['entropy']:.4f}"
        graph.node(this_id, text)

        if parent_id is not None:
            graph.edge(parent_id, this_id, label=str(branch_value))

        if 'nodes' in node:
            # Push in reverse so children pop in their original order and
            # each subtree is finished before its next sibling starts.
            for value, child in reversed(list(node['nodes'].items())):
                pending.append((child, this_id, value))

    return graph


In [15]:
# Demo: build a tree on a small random categorical dataset and draw it.
# N is the number of rows; P is the exclusive upper bound passed to randint,
# so every feature value and label is an integer in [0, P).
# NOTE(review): no random seed is set in the visible cells, so the data (and
# therefore the tree) differs on every run.
N = 10
P = 3
# Three categorical feature columns named 0, 1, 2.
X = pd.DataFrame({i: pd.Series(np.random.randint(P, size=N), dtype="category") for i in range(3)})
# Categorical labels drawn the same way as the features.
y = pd.Series(np.random.randint(P, size=N), dtype="category")


# Build the tree; max_depth=10 exceeds the 3 available features, so depth is
# not the limiting factor here.
tree = build_tree(X, y, max_depth=10)

# Convert the nested-dict tree into a graphviz Digraph.
dot = visualize_decision_tree(tree)

# Render to 'my_decision_tree.png' (other formats such as pdf or svg work too).
# Per the graphviz docs, cleanup=True removes the intermediate source file.
dot.render('my_decision_tree', format='png', cleanup=True)

# Render again and open the result in the system viewer (needs the Graphviz
# binaries). The recorded output of this cell is 'my_decision_tree.pdf'.
dot.view()


'my_decision_tree.pdf'

In [None]:
# Test case 3
# Discrete Input and Discrete Output
# NOTE: this cell rebinds N, P, X and y from the earlier demo cell.

# 30 rows; feature values and labels are integers in [0, 5).
N = 30
P = 5
# Five categorical feature columns named 0..4.
X = pd.DataFrame({i: pd.Series(np.random.randint(P, size=N), dtype="category") for i in range(5)})
y = pd.Series(np.random.randint(P, size=N), dtype="category")

# Displays the position of the column with the highest information gain.
best_node_basedOn_HighInformationGain(X,y)

'my_decision_tree.pdf'