# Problem 3: Poisonous Mushrooms?

In [1]:
import numpy as np
import pandas as pd
import math
from collections import deque

In [2]:
labels = 'class, cap-shape, cap-surface, cap-color, bruises, odor, gill-attachment, gill-spacing, gill-size, '\
'gill-color, stalk-shape, stalk-root,' \
'stalk-surface-above-ring, stalk-surface-below-ring, stalk-color-above-ring, stalk-color-below-ring, veil-type, veil-color,'\
'ring-number, ring-type, spore-print-color, population, habitat'.split(',')
labels = [label.strip() for label in labels]
label_idx = {label:idx for idx, label in enumerate(labels)}

In [3]:
df_train = pd.read_csv('mush_train.data',header=None, names=labels)
df_test = pd.read_csv('mush_test.data', header=None, names=labels)

In [4]:
y_train, X_train = df_train.iloc[:, 0], df_train.iloc[:, 1:]
y_test, X_test = df_test.iloc[:, 0], df_test.iloc[:, 1:]

In [19]:
class InternalNode:

    def __init__(self, attr, attr_vals, data, ig=None, height=0, split_attrs=None):
        self.attr = attr
        self.children = {attr_val: None for attr_val in attr_vals}
        self.children_count = 0
        self.ig = ig
        self.height = height
        self.data = data
        self.split_attrs = split_attrs

    def set_child(self, attr_val, child):
        self.children[attr_val] = child
        self.children_count += 1

    def get_children(self):
        return self.children.items()

    def get_children_count(self):
        return self.children_count

    def __repr__(self):
        return f"Attribute: {self.attr}({label_idx.get(self.attr)}) Attrs: {self.children_count} IG: {self.ig}"


class LeafNode:

    def __init__(self, attr, attr_val, prediction, height):
        self.attr = attr
        self.attr_val = attr_val
        self.prediction = prediction
        self.height = height

    def predict(self):
        return self.prediction

    def __repr__(self):
        return f"Attribute: {self.attr}({label_idx.get(self.attr)}), Prediction: {self.prediction}"

In [6]:
def find_best_split(data, attributes):
    # Finds best attribute to split on based on Conditional entropy (IG)
    m, n = data.shape
    min_entropy = 1
    best_attr = -1
    for attr in attributes:
        cond_entropy = 0
        attr_vals, counts = np.unique(data[attr], return_counts=True)
        for attr_val, attr_count in zip(attr_vals, counts):
            sub_data = data[data[attr] == attr_val]
            subset_len, _ = sub_data.shape
            subclass_counts = sub_data['class'].value_counts()
            p = subclass_counts['p'] if 'p' in subclass_counts else 0
            e = subclass_counts['e'] if 'e' in subclass_counts else 0
            plogp, eloge = 0, 0
            if p > 0:
                plogp = - (p/subset_len) * math.log2(p/subset_len)
            if e > 0:
                eloge = - (e/subset_len) * math.log2(e/subset_len)
            cond_entropy += (attr_count/m) * (plogp +eloge)
        if cond_entropy <= min_entropy:
            if cond_entropy == min_entropy and label_idx[attr] > label_idx[best_attr]:
                # In case of a tie, first occurring attribute is used
                continue
            min_entropy = cond_entropy
            best_attr = attr
    return (best_attr, entropy - min_entropy)
            

In [32]:
def fit(data, attributes, root=None):
    split_attr, ig = find_best_split(data, attributes)
    attributes.remove(split_attr)
    if not root:
        root = InternalNode(split_attr, data[split_attr].unique(), data, ig, 0, attributes[:])
    # Using queue to construct the tree
    queue = deque()
    queue.append(root)
    while queue:
        current_node = queue.popleft()
        current_node_attr = current_node.attr

        # Data filtered with current attribute value
        current_data = current_node.data
        for attr_val, child in current_node.get_children():
            new_node = None

            # Create new dataset with attribute = attribute value
            subset_data = current_data[current_data[current_node_attr] == attr_val]
            subset_len, _ = subset_data.shape
            subclass_counts = subset_data['class'].value_counts()
            p = subclass_counts['p'] if 'p' in subclass_counts else 0
            e = subclass_counts['e'] if 'e' in subclass_counts else 0
            # if subset_len == 0:
            #     # Majority vote
            #     p = global_class_counts['p']
            #     e = global_class_counts['e']
            #     new_node = LeafNode(current_node_attr, attr_val, 'p' if p > e else 'e', current_node.height + 1)
            # else:
            if p == subset_len:
                new_node = LeafNode(current_node_attr, attr_val, 'p', current_node.height + 1)
            elif e == subset_len:
                new_node = LeafNode(current_node_attr, attr_val, 'e', current_node.height + 1)
            else:
                attributes = current_node.split_attrs[:]
                split_attr, ig = find_best_split(subset_data, attributes)
                attributes.remove(split_attr)
                new_node = InternalNode(split_attr, data[split_attr].unique(), subset_data, ig, current_node.height + 1,
                                        attributes[:])
                queue.append(new_node)
            current_node.set_child(attr_val, new_node)
    return root

In [33]:
m, n = df_train.shape
global_class_counts = df_train['class'].value_counts()
p_p = global_class_counts['p']/m
p_e = global_class_counts['e']/m
entropy = - (p_p * math.log2(p_p)) - (p_e * math.log2(p_e))

In [34]:
root = fit(df_train, X_train.columns.to_list())

In [35]:
dummy_print = lambda x: x
def print_tree(root, print=print):
    q = deque()
    q.append(root)
    print(root)
    max_height = 0
    while q:
        e = q.popleft()
        for key, val in e.get_children():
            if isinstance(val, LeafNode):
                print('\t'*val.height + f'{key} -> {val.prediction}')
            elif val is not None:
                print('\t'*val.height + f'{key} -> {val}')
                q.append(val)
            if val is not None and val.height > max_height:
                max_height = val.height
    return max_height

**1. Assuming you break ties using the attribute that occurs ﬁrst (left to right) in the data, draw the resulting decision tree and report the maximum information gain for each node that you added to the tree.**

In [36]:
print("Depth = ", print_tree(root))

Attribute: odor(5) Attrs: 9 IG: 0.9078035498174333
	n -> Attribute: spore-print-color(20) Attrs: 9 IG: 0.9290297416617572
	a -> e
	p -> p
	y -> p
	l -> e
	f -> p
	c -> p
	s -> p
	m -> p
		n -> e
		k -> e
		w -> Attribute: habitat(22) Attrs: 7 IG: 0.850436790821723
		u -> p
		h -> e
		o -> e
		r -> p
		b -> e
		y -> e
			d -> Attribute: gill-size(8) Attrs: 2 IG: 0.9992092075027244
			p -> e
			g -> e
			w -> e
			l -> Attribute: cap-color(3) Attrs: 10 IG: 0.9992092075027244
			m -> p
			u -> p
				b -> e
				n -> p
				n -> e
				y -> p
				w -> p
				e -> p
				g -> p
				b -> p
				p -> p
				u -> p
				c -> e
				r -> p
Depth =  4


In [12]:
def predict(test, root):
    m, n = test.shape
    
    def tree_predictor(row):
        predicted = False
        current = root
        while not predicted:
            row_val = row[current.attr]
            next_node = current.children[row_val]
            if isinstance(next_node, LeafNode):
                return next_node.predict()
            else:
                current = next_node
        return None
    
    return test.apply(tree_predictor, axis=1)

def get_accuracy(y_pred, y_test):
    return 100 * np.mean(y_pred.ravel() == y_test.ravel())


**2. What is the accuracy of this decision tree on the test data?**

In [13]:
y_pred_test = predict(X_test, root)
print("Testing Accuracy:",  get_accuracy(y_pred_test, y_test))

Testing Accuracy: 100.0


**3. Now consider arbitrary input data. Suppose that you decide to limit yourself to decision trees of height one, i.e., only one split. Is the tree produced by the information gain heuristic optimal on the training data (that is, no other decision tree has higher accuracy)?**

In [26]:
def split_all(data, attributes):
    # Generate all decision trees with height = 1
    m, n = data.shape
    split_nodes = []
    for attr in attributes:
        cond_entropy = 0
        attr_vals, counts = np.unique(data[attr], return_counts=True)
        root = InternalNode(attr, attr_vals, data)
        for attr_val, attr_count in zip(attr_vals, counts):
            sub_data = data[data[attr] == attr_val]
            subset_len, _ = sub_data.shape
            subclass_counts = sub_data['class'].value_counts()
            p = subclass_counts['p'] if 'p' in subclass_counts else 0
            e = subclass_counts['e'] if 'e' in subclass_counts else 0
            plogp, eloge = 0, 0
            if p > 0:
                plogp = - (p/subset_len) * math.log2(p/subset_len)
            if e > 0:
                eloge = - (e/subset_len) * math.log2(e/subset_len)
            child_node = LeafNode(attr, attr_val, 0 if p < e else 1, root.
            height+1)
            root.set_child(attr_val, child_node)
            cond_entropy += (attr_count/m) * (plogp + eloge)
        root.ig = -cond_entropy
        split_nodes.append(root)
    return split_nodes


In [27]:
l1 = split_all(df_train, X_train.columns)
import operator
l1.sort(key=operator.attrgetter('ig'))
l1.reverse()
l1

[Attribute: odor(5) Attrs: 9 IG: -0.09140565768529108,
 Attribute: spore-print-color(20) Attrs: 9 IG: -0.5093150773316704,
 Attribute: gill-color(9) Attrs: 12 IG: -0.5754889926227642,
 Attribute: ring-type(19) Attrs: 5 IG: -0.6876800078596315,
 Attribute: stalk-surface-above-ring(12) Attrs: 4 IG: -0.7134085115998995,
 Attribute: stalk-surface-below-ring(13) Attrs: 4 IG: -0.7252832004148113,
 Attribute: stalk-color-above-ring(14) Attrs: 9 IG: -0.7481755300768533,
 Attribute: gill-size(8) Attrs: 2 IG: -0.7627584032539828,
 Attribute: stalk-color-below-ring(15) Attrs: 9 IG: -0.7670986423396887,
 Attribute: population(21) Attrs: 6 IG: -0.8039852277643933,
 Attribute: bruises(4) Attrs: 2 IG: -0.8085099308884869,
 Attribute: habitat(22) Attrs: 7 IG: -0.8374491946662814,
 Attribute: stalk-root(11) Attrs: 5 IG: -0.8602970496616003,
 Attribute: gill-spacing(7) Attrs: 2 IG: -0.9004355588075248,
 Attribute: cap-shape(1) Attrs: 6 IG: -0.9466999436762378,
 Attribute: ring-number(18) Attrs: 3 IG: -0