# Theory

# Implementation

In [1]:
from sklearn import datasets

# Load the iris data set once; the example cell at the bottom of the notebook
# reads its `.data` (feature matrix) and `.target` (class labels) attributes.
iris_ds = datasets.load_iris()

In [2]:
def groupby(func,items):
    """Bucket ``items`` by the key that ``func`` returns for each of them.

    Returns a dict mapping every distinct key to the list of items that
    produced it.  Keys appear in first-seen order and the items inside each
    bucket keep their original relative order.
    """
    buckets = {}
    for element in items:
        # setdefault creates the bucket the first time a key shows up
        buckets.setdefault(func(element), []).append(element)
    return buckets
    
# A nifty little function, mirrors Clojure's frequencies function
def freqs(items):
    """Count occurrences: returns a dict of the form {item: number_of_times_seen}.

    ``items`` may be any iterable of hashable values; keys appear in
    first-seen order.
    """
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return counts

## Helper Functions

In [3]:
import math

def class_ratios(v):
    """Return, for each distinct class in ``v``, the fraction of ``v`` it makes up.

    Helper for the impurity functions.  The ratios are listed in the order in
    which the classes are first encountered in ``v``.
    """
    occurrence_counts = freqs(v).values()
    # len(v) is looked up per count, exactly as many times as there are classes
    return list(map(lambda count: count / len(v), occurrence_counts))

def gini(classes):
    """Gini impurity of a collection of class labels.

    Computed as one minus the sum of the squared class ratios.
    """
    squared_total = 0
    for ratio in class_ratios(classes):
        squared_total += ratio ** 2
    return 1 - squared_total

def entropy(classes):
    """Shannon entropy (base 2) of a collection of class labels.

    Ratios that are not strictly positive are skipped, so they never reach
    ``math.log``.
    """
    weighted_logs = 0
    for ratio in class_ratios(classes):
        if ratio > 0.0:
            weighted_logs += ratio * math.log(ratio, 2)
    return -1.0 * weighted_logs

def gain_ratio(parent_examples, impurity_fn, partitions):
    """Gain ratio obtained by splitting ``parent_examples`` into ``partitions``.

    gain ratio = (impurity(parent) - weighted impurity(children)) / split information

    where the split information is the base-2 entropy of the partition sizes.

    Args:
        parent_examples: sequence (vector) of class labels before the split.
        impurity_fn: callable mapping a sequence of class labels to an
            impurity score (e.g. ``gini`` or ``entropy``).
        partitions: sequence of label sequences (vector of vectors), one per
            branch of the split.

    Returns:
        ``0`` when there are no parent examples or when the split information
        is zero (a feature with a single value repeated in every element);
        otherwise the gain ratio as a float.
    """
    total = len(parent_examples)
    if total == 0:
        # nothing to split: avoid dividing by zero below
        return 0

    # Empty partitions carry zero weight, so they contribute nothing to either
    # sum; dropping them also keeps math.log from being called with 0, which
    # raises ValueError.
    weighted_parts = [(len(p) / total, p) for p in partitions if len(p) > 0]

    childrens_impurity = sum([weight * impurity_fn(p) for weight, p in weighted_parts])

    split_info = -1.0 * sum([weight * math.log(weight, 2) for weight, _ in weighted_parts])

    #put 0 in here to handle a feature that has a single value repeated as every element
    return 0 if split_info == 0 else (impurity_fn(parent_examples) - childrens_impurity) / split_info

## Decision Tree Class

In [4]:
#I created class_of and get_attr functions so that I could define an interface for the 
#rest of the code.  These two functions can deal with the complexity of the item's data layout
#while the rest of the functions just expect a contract with these functions.  Admittedly, this is hacky.

# For this assignment, I assume that an item is a vector of (unnamed) attributes and that the 
# dependent feature (the class label) is the last item in the vector
def class_of(item):
    """Return the class label of ``item``, i.e. its last element."""
    label = item[-1]
    return label

#for now, we assume that attr is an index into an item
def get_attr(attr,item):
    """Return the value of attribute ``attr`` of ``item`` by subscripting ``item`` with ``attr``."""
    value = item[attr]
    return value


#Even though I hate to use OOP (functional programming is way better), I think that it makes sense in this case

class DecisionTree:
    """Multiway decision tree over discrete attributes, grown greedily by gain ratio.

    Training items are flat sequences whose last element is the class label
    (accessed through ``class_of`` / ``get_attr``).  A fitted tree is a nested
    dict: a leaf looks like ``{"class": label}``; an internal node carries
    ``"split_attr"``, ``"gain_ratio"``, ``"majority_class"`` and ``"children"``
    (a dict mapping each observed attribute value to a subtree).
    """

    # Class-level default so classify() can tell that fit() has not run yet.
    __root = None

    def __init__(self, impurity_measure):
        """Select the impurity function.

        Args:
            impurity_measure: ``'gini'`` selects Gini impurity; any other
                value selects entropy.
        """
        self._impurity_fnc = gini if ('gini' == impurity_measure) else entropy


    def __calc_gain(self, items,partitions):
        """Gain ratio of splitting ``items`` into ``partitions`` (attribute value -> items)."""
        class_partitions = [[class_of(y) for y in p] for p in partitions.values()]

        return gain_ratio([class_of(y) for y in items], self._impurity_fnc, class_partitions)


    def __majority_class(self, items):
        """Most frequent class label among ``items``; on a tie the label seen last wins."""
        sorted_by_freq = sorted(freqs(map(class_of,items)).items(),key=(lambda x : x[1]))
        return sorted_by_freq[-1][0]


    #Enumerate all possible splits, and some meta data about those splits, for a set of binary attributes
    def __enum_splits(self, items,attrs):
        """Return one dict per attribute in ``attrs`` describing the split on that attribute."""
        splits = []

        for attr in attrs:
            partitions = groupby(lambda x : get_attr(attr,x),items) #attribute vals are keys, items are vals
            splits.append({"partitions" : partitions,
                           "split_attr" : attr,
                           "gain_ratio" : self.__calc_gain(items,partitions)})

        return splits

    #Our recursive function for splitting up items based on gain ratio
    def __make_node(self,items, attrs):
        """Build the subtree for ``items`` using only the attributes in ``attrs``."""
        if (1 == len(set(map(class_of,items)))):   #homogeneous
            return {"class" : class_of(items[0])}
        elif not attrs:                            #out of attributes, choose majority
            return {"class" : self.__majority_class(items)}

        #the best split is the split with the highest gain ratio
        best_split = max(self.__enum_splits(items,attrs),
                         key=(lambda x : x['gain_ratio']))

        # an attribute is used at most once along any root-to-leaf path
        remaining_attrs = [x for x in attrs if x != best_split['split_attr']]

        #drop the partitions key, it isn't useful once the children are built
        partitions = best_split.pop('partitions')

        # remembered so classify() has an answer for attribute values that
        # never reached this node during training
        best_split['majority_class'] = self.__majority_class(items)
        best_split['children'] = {att_val : self.__make_node(itmz,remaining_attrs)
                                  for att_val,itmz in partitions.items()}

        return best_split

    def fit(self, features, labels):
        """Grow the tree from training data.

        Args:
            features: sequence of rows, each a sequence of discrete attribute values.
            labels: one class label per row (a flat sequence or an ``(n, 1)`` column).

        Raises:
            ValueError: if ``features`` is empty or its length differs from
                the number of labels.
        """
        # Imported here so the class does not depend on a later notebook cell
        # having imported numpy first.
        import numpy as np

        # tolist() yields plain Python scalars, which are hashable and compare
        # equal to the caller's original labels.
        flat_labels = np.asarray(labels).reshape(-1).tolist()

        if len(features) == 0:
            raise ValueError("cannot fit a decision tree on an empty data set")
        if len(features) != len(flat_labels):
            raise ValueError("features and labels must have the same length")

        # Rows are assembled as tuples rather than with np.hstack: stacking
        # coerces features and labels to one common dtype, which turns integer
        # labels into floats and, with string labels, turns every attribute
        # value into a string so that classify() can no longer match them.
        items = [tuple(row) + (label,) for row, label in zip(features, flat_labels)]
        num_attributes = len(items[0]) - 1
        self.__root = self.__make_node(items,list(range(num_attributes)))

    def __classify_recur(self, node, item):
        """Walk from ``node`` down to a leaf following ``item``'s attribute values."""
        if 'class' in node:
            return node['class']

        child_key = get_attr(node['split_attr'],item)
        child = node['children'].get(child_key)

        if child is None:
            # value not seen at this node while training: fall back to the
            # majority class of the training items that reached the node
            return node['majority_class']

        return self.__classify_recur(child,item)

    def classify(self, item):
        """Return the predicted class label for ``item`` (a sequence of attribute values).

        Raises:
            AttributeError: if ``fit`` has not been called yet.  The type is
                kept as AttributeError because that is what reading the unset
                root attribute raised before.
        """
        if self.__root is None:
            raise AttributeError("DecisionTree is not fitted yet; call fit() before classify()")
        return self.__classify_recur(self.__root, item)


## Example Usage

In [5]:
import numpy as np

#convert the iris data set into binary data, 1 -> greater than attribute mean, 0 -> less than or equal to mean
# the class labels are kept separately in `deps`

#TODO convert to take a DataFrame and work on non binary data.  Also implement pruning

deps = iris_ds.target
tmp = iris_ds.data

means = np.mean(tmp,axis=0)

# Compare against the column means directly instead of dividing by them and
# thresholding at 1.0: the division only matches the rule above when every
# mean is positive (a zero mean divides by zero, a negative mean flips the
# inequality).  Cast to float so the attribute values stay 0.0 / 1.0.
indeps = (tmp > means).astype(float)

# indeps now contains binary data, let us build a tree

#Use a different impurity metric
tree = DecisionTree('entropy')

tree.fit(indeps,deps)

classified = [tree.classify(x) for x in indeps]

# [true label, predicted label] for every misclassified example
results = [[deps[i],classified[i]] for i in range(len(deps)) if deps[i] != classified[i]]

# NOTE: this is accuracy on the same data the tree was fitted on
print("Accuracy: ",(1.0 - len(results)/len(deps)))

Accuracy:  0.7533333333333333
