In [6]:
import math
from collections import Counter
import graphviz

# Raw training set: one tab-separated record per line, column names first.
table = """
Cuaca	Temperatur	Kelembaban	Main
Cerah	Panas	Tinggi	Tidak
Cerah	Panas	Normal	Tidak
Mendung	Panas	Tinggi	Ya
Hujan	Sejuk	Tinggi	Ya
Hujan	Dingin	Normal	Ya
Hujan	Dingin	Normal	Tidak
Mendung	Dingin	Normal	Ya
Cerah	Sejuk	Tinggi	Tidak
Cerah	Dingin	Normal	Ya
Hujan	Sejuk	Normal	Ya
Cerah	Sejuk	Normal	Ya
Mendung	Sejuk	Tinggi	Ya
Mendung	Panas	Normal	Ya
Hujan	Sejuk	Tinggi	Tidak
"""

# Drop the surrounding blank lines, then cut the text into lines and each
# line into its tab-separated fields. The first line holds the column names
# (`header`); every following line is one training record (`data`).
table = table.strip().split('\n')
header, *data = (line.split('\t') for line in table)

# Helper functions to calculate entropy and gain
def entropy(values):
    """Return the Shannon entropy (base 2) of the labels in ``values``.

    Each distinct label contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency. An empty sequence yields ``0``.
    """
    size = len(values)
    result = 0
    for frequency in Counter(values).values():
        probability = frequency / size
        # Subtract term by term (rather than negating a sum) so that a pure
        # set yields 0.0 and not -0.0, which would show up in formatted output.
        result = result - probability * math.log2(probability)
    return result

def gain(data, attribute_index, target_index):
    """Compute and print the information gain of splitting on one column.

    Args:
        data: List of rows; each row is an indexable record.
        attribute_index: Column whose values define the partitions.
        target_index: Column holding the class label.

    Returns:
        Entropy of the whole set minus the size-weighted entropy of the
        partitions.

    The intermediate figures are printed; the column title is looked up in
    the module-level ``header`` list.
    """
    all_labels = [record[target_index] for record in data]
    total_entropy = entropy(all_labels)

    # Bucket the class labels by attribute value, in first-seen order.
    labels_by_value = {}
    for record in data:
        labels_by_value.setdefault(record[attribute_index], []).append(record[target_index])

    # Size-weighted average of the partition entropies.
    partition_entropy = {}
    weighted_entropy = 0
    for value, labels in labels_by_value.items():
        partition_entropy[value] = entropy(labels)
        weighted_entropy += (len(labels) / len(data)) * partition_entropy[value]

    gain_value = total_entropy - weighted_entropy

    # Report the detailed calculation.
    print(f"Values({header[attribute_index]}) = {', '.join(labels_by_value)}")
    print(f"S = {Counter(all_labels)} |S| = {len(data)}")
    for value, labels in labels_by_value.items():
        print(f"S{value} = {Counter(labels)} |S{value}| = {len(labels)}")
        print(f"Entropy(S{value}) = {partition_entropy[value]:.10f}")
    print(f"Entropy(S) = {total_entropy:.10f}")
    print(f"Gain(S) = {gain_value:.10f}\n")

    return gain_value

# Select the rows that carry a given value in a given column
def split_data(data, attribute_index, value):
    """Return the rows of ``data`` whose column ``attribute_index`` equals ``value``.

    Row order is preserved and the row objects themselves are not copied.
    """
    matching = []
    for record in data:
        if record[attribute_index] == value:
            matching.append(record)
    return matching

# Recursive function to build the decision tree
def build_tree(data, attributes, target_index, graph, parent_name=None, edge_label=""):
    """Build an ID3 decision tree for ``data`` and draw it on ``graph``.

    Args:
        data: List of rows. Rows keep all of their columns at every level of
            the recursion.
        attributes: Names of the candidate attributes; ``attributes[i]`` is
            taken to describe column ``i`` of each row.
        target_index: Column holding the class label.
        graph: Object exposing ``node(name, label=...)`` and
            ``edge(tail, head, label=...)`` (e.g. ``graphviz.Digraph``).
        parent_name: Identifier of an existing node to attach the root of
            this (sub)tree to, or ``None`` for no incoming edge.
        edge_label: Label for the edge coming from ``parent_name``.

    Returns:
        The class label itself when the result is a single leaf, otherwise a
        nested dict ``{attribute: {value: subtree_or_label}}``.

    Node identifiers are generated as ``n0``, ``n1``, ... per call, so two
    calls drawing onto the same ``graph`` will reuse identifiers.
    """
    # Pair every attribute name with its column index once, up front. The
    # pairs travel through the recursion so that removing an attribute never
    # shifts the column index of the remaining ones.
    candidates = list(enumerate(attributes))
    return _grow_tree(data, candidates, target_index, graph, parent_name, edge_label, [0])


def _add_node(graph, label, parent_name, edge_label, counter):
    """Add a labelled node under a fresh identifier and link it to its parent."""
    # Identifiers must be unique (otherwise every "Leaf: Ya" collapses into
    # one node) and must not contain ':' (edge endpoints treat it as the
    # node:port separator). The human-readable text goes into the label.
    node_id = f"n{counter[0]}"
    counter[0] += 1
    graph.node(node_id, label=label)
    if parent_name:
        graph.edge(parent_name, node_id, label=edge_label)
    return node_id


def _grow_tree(data, candidates, target_index, graph, parent_name, edge_label, counter):
    """Recursive worker for build_tree; ``candidates`` holds (column, name) pairs."""
    target_values = [row[target_index] for row in data]

    # Pure subset: a single class remains, so emit a leaf for it.
    if len(set(target_values)) == 1:
        _add_node(graph, f"Leaf: {target_values[0]}", parent_name, edge_label, counter)
        return target_values[0]

    # No attribute left to split on: fall back to the majority class.
    if not candidates:
        most_common_target = Counter(target_values).most_common(1)[0][0]
        _add_node(graph, f"Leaf: {most_common_target}", parent_name, edge_label, counter)
        return most_common_target

    # Score each remaining attribute on its real column index.
    gains = [gain(data, column, target_index) for column, _ in candidates]
    best_position = gains.index(max(gains))
    best_column, best_attribute = candidates[best_position]

    print(f"Best attribute to split: {best_attribute} (Gain: {gains[best_position]:.4f})\n")

    node_id = _add_node(graph, f"Node: {best_attribute}", parent_name, edge_label, counter)
    remaining = candidates[:best_position] + candidates[best_position + 1:]

    tree = {best_attribute: {}}
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # branch order is the same on every run (a set's order is not).
    for value in dict.fromkeys(row[best_column] for row in data):
        subset = split_data(data, best_column, value)
        tree[best_attribute][value] = _grow_tree(
            subset, remaining, target_index, graph, node_id, value, counter
        )

    return tree

# The last column is the class label; every column before it is a candidate
# attribute for splitting.
target_index = len(header) - 1
attributes = header[:target_index]

# Grow the tree and record its nodes and edges on a fresh directed graph
graph = graphviz.Digraph()
tree = build_tree(data, attributes, target_index, graph)

# Display the decision tree diagram
graph.view()


Values(Cuaca) = Cerah, Mendung, Hujan
S = Counter({'Ya': 9, 'Tidak': 5}) |S| = 14
SCerah = Counter({'Tidak': 3, 'Ya': 2}) |SCerah| = 5
Entropy(SCerah) = 0.9709505945
SMendung = Counter({'Ya': 4}) |SMendung| = 4
Entropy(SMendung) = 0.0000000000
SHujan = Counter({'Ya': 3, 'Tidak': 2}) |SHujan| = 5
Entropy(SHujan) = 0.9709505945
Entropy(S) = 0.9402859587
Gain(S) = 0.2467498198

Values(Temperatur) = Panas, Sejuk, Dingin
S = Counter({'Ya': 9, 'Tidak': 5}) |S| = 14
SPanas = Counter({'Tidak': 2, 'Ya': 2}) |SPanas| = 4
Entropy(SPanas) = 1.0000000000
SSejuk = Counter({'Ya': 4, 'Tidak': 2}) |SSejuk| = 6
Entropy(SSejuk) = 0.9182958341
SDingin = Counter({'Ya': 3, 'Tidak': 1}) |SDingin| = 4
Entropy(SDingin) = 0.8112781245
Entropy(S) = 0.9402859587
Gain(S) = 0.0292225657

Values(Kelembaban) = Tinggi, Normal
S = Counter({'Ya': 9, 'Tidak': 5}) |S| = 14
STinggi = Counter({'Tidak': 3, 'Ya': 3}) |STinggi| = 6
Entropy(STinggi) = 1.0000000000
SNormal = Counter({'Ya': 6, 'Tidak': 2}) |SNormal| = 8
Entropy(S

KeyError: 'parent'