In [1]:
import pandas as pd
import numpy as np
import math
data = df = pd.read_csv('agaricus-lepiota.data',header=None)

In [2]:
data.columns =['class', 'cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor', 
               'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color', 
               'stalk_shape', 'stalk_root', 'stalk_surface_above_ring', 'stalk_surface_below_ring', 'stalk_color_above_ring', 'stalk_color_below_ring', 
               'veil_type', 'veil_color', 'ring_number', 'ring_type', 'spore_print_color', 'population', 'habitat']

In [3]:
data['stalk_root'] = data['stalk_root'].replace(["?"],np.nan)
new_data=data.copy()
new_data=new_data.dropna()

In [4]:
def train_test_set(X):
  new_X=X.sample(frac=1)
  separate=int(len(new_X.index)*0.8)
  train=new_X.iloc[:separate]
  test=new_X.iloc[separate:]
  return[train,test]

In [5]:
[train_data,test_data]=train_test_set(new_data)

In [6]:
train_y = train_data['class']
train_X = train_data.drop('class', axis = 1)

In [7]:
test_y = test_data['class']
test_X = test_data.drop('class', axis = 1)

In [8]:
data
new_data
train_data

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
4597,p,f,f,y,f,f,f,c,b,g,...,k,b,b,p,w,o,l,h,y,d
2209,e,x,y,g,t,n,f,c,b,w,...,s,p,p,p,w,o,p,n,v,d
4475,p,x,y,y,f,f,f,c,b,h,...,k,n,b,p,w,o,l,h,y,g
4359,p,f,f,y,f,f,f,c,b,p,...,k,n,p,p,w,o,l,h,y,d
4091,p,f,f,y,f,f,f,c,b,h,...,k,b,b,p,w,o,l,h,v,d
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3755,p,x,f,g,f,f,f,c,b,h,...,k,p,b,p,w,o,l,h,y,g
3296,e,f,f,g,t,n,f,c,b,w,...,s,g,p,p,w,o,p,k,y,d
5575,p,x,s,b,t,f,f,c,b,h,...,s,w,w,p,w,o,p,h,v,g
4998,p,x,y,g,f,f,f,c,b,g,...,k,p,b,p,w,o,l,h,v,d


In [9]:
class Node():
    def __init__(self, value):
      self.value = value
      self.depth = 0
      self.children = []
      self.parent='None'

    def add_child(self, item):
      self.children.append(item)
      item.depth=self.depth+1
      item.parent=self

In [10]:
def print_tree(node):
    print("     |" *node.depth+"==="+str(node.value))
    if node.children!=[]:
      for child in node.children:
          print_tree(child)

In [11]:
def get_entropy(y):
    unique, counts = np.unique(y, return_counts = True) #unique is an array of unique values in y, counts is an array contains times that those values appear
    probabilities = counts / len(y) #return an array of probabilities with each element corresponds to elements in count array
    entropy = sum(-probabilities * np.log2(probabilities))
    return entropy

In [12]:
def get_best_split(X, y):
    max_info_gain = 0.01
    prev_entropy = get_entropy(y)
    best_column = ''
    for colname in X.columns:
        column = X[colname]
        sum_entropy = 0
        for category in column.unique():
            is_in_category = X[colname] == category
            y_after_split = y[is_in_category]
            sum_entropy += get_entropy(y_after_split) * len(y_after_split) / len(X)
        info_gain = prev_entropy - sum_entropy
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_column = colname
    return best_column 

In [13]:
def split_subtree(X, y, stopping_depth, parent_node):
    best_split_attribute = get_best_split(X, y)
    if parent_node.depth >= stopping_depth:
      majority=y.mode()
      child_node=Node(">"+str(majority)[5:-13])
      parent_node.add_child(child_node)
    else:
      for category in X[best_split_attribute].unique():
        to_include = X[best_split_attribute] == category
        new_X = X[to_include].drop(best_split_attribute, axis = 1)
        new_y = y[to_include]
        child_node=Node(category)
        parent_node.add_child(child_node)
        current=get_best_split(new_X, new_y)
        if current=='':
          the_class=new_y[to_include].mode()
          current_node=Node(str(the_class)[5:-14])
          child_node.add_child(current_node)
        else:
          current_node=Node(current)
          child_node.add_child(current_node)
          split_subtree(new_X, new_y, stopping_depth, current_node)

In [14]:
def root_node(X, y, z):
  best_split_attribute = get_best_split(X, y)
  root=Node(best_split_attribute)
  split_subtree(X,y,z,root)
  return root

In [15]:
tree=root_node(train_X, train_y, 6)
print_tree(tree) #this is the decision tree

===odor
     |===f
     |     |===p
Name: class,
     |===n
     |     |===spore_print_color
     |     |     |===n
     |     |     |     |===e
Name: class,
     |     |     |===k
     |     |     |     |===e
Name: class,
     |     |     |===r
     |     |     |     |===p
Name: class,
     |     |     |===w
     |     |     |     |===cap_color
     |     |     |     |     |===n
     |     |     |     |     |     |===e
Name: class,
     |     |     |     |     |===c
     |     |     |     |     |     |===e
Name: class,
     |     |     |     |     |===w
     |     |     |     |     |     |===p
Name: class,
     |     |     |     |     |===g
     |     |     |     |     |     |===e
Name: class,
     |     |     |     |     |===y
     |     |     |     |     |     |===p
Name: class,
     |     |     |     |     |===p
     |     |     |     |     |     |===e
Name: class,
     |===a
     |     |===e
Name: class,
     |===l
     |     |===e
Name: class,
     |===c
     |     |===p
Name: cl

In [16]:
# test_data
# test_X
# test_y

In [17]:
## d: pd.Series recording the predicted classification
d=test_y.copy()
d=d.str.replace('p','u')
d=d.str.replace('e','u')


## test_X_copy: pd.dataframe of the testing data without classification column
test_X_copy=test_X.copy()



def test_procedure(data,d, tree):
  attribute=tree.value
  for category in tree.children:
    for label in category.children:
      # print(label.value)
      if label.value=='p' or label.value=='e':
        indices=data[data[tree.value] == category.value].index
        # print(indices)
        d.update(pd.Series([label.value]*len(indices), index=indices))
        data=data.drop(indices)
        return [d,data]
      else:
        [d,data]=test_procedure(data, d, label)

In [18]:
test_procedure(test_X_copy,d, tree)

TypeError: cannot unpack non-iterable NoneType object

In [None]:
d,test_X_copy=test_procedure(test_X_copy,d, tree)

In [None]:
d

3797    u
5911    u
3617    u
3704    u
2159    u
       ..
273     u
908     u
5065    u
1040    u
401     u
Name: class, Length: 1129, dtype: object

In [None]:
test_y


3797    e
5911    p
3617    e
3704    p
2159    e
       ..
273     e
908     p
5065    p
1040    e
401     e
Name: class, Length: 1129, dtype: object