In [1]:
import math

In [2]:
import pandas as pd
from pandas.api.types import is_bool_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype

In [3]:
import numpy as np

In [4]:
class Node(object):
    """Generic graph node: a value plus a collection of neighbouring nodes."""

    # Private marker meaning "no neighbours argument supplied". ``None`` cannot
    # play that role because callers may pass ``neighbours=None`` explicitly
    # and expect it to be stored unchanged.
    _NO_NEIGHBOURS = object()

    def __init__(self, value, neighbours=_NO_NEIGHBOURS):
        """Store ``value`` and ``neighbours``.

        Args:
            value: Payload carried by the node.
            neighbours: Collection of adjacent nodes. When omitted, every
                node gets its own fresh empty list (a literal ``[]`` default
                would be one list shared by all instances).
        """
        self.value = value
        if neighbours is Node._NO_NEIGHBOURS:
            neighbours = []
        self.neighbours = neighbours

    def get_value(self):
        """Return the payload stored on the node."""
        return self.value

    def get_neighbours(self):
        """Return the neighbours collection given at construction time."""
        return self.neighbours

In [5]:
class TreeNode(Node):
    """Node of a rooted tree: a value, a list of children and a parent link."""

    def __init__(self, value, children=None, parent=None):
        """Create a tree node.

        Args:
            value: Payload carried by the node.
            children: List of child nodes. ``None`` (the default) gives the
                node its own new empty list, so nodes never share one
                default list object.
            parent: Parent node, or ``None`` for the root.
        """
        # Tree nodes do not use the generic neighbour list.
        Node.__init__(self, value=value, neighbours=None)

        self.children = [] if children is None else children
        self.parent = parent

    def get_children(self):
        """Return the list of child nodes."""
        return self.children

    def get_parent(self):
        """Return the parent node, or ``None`` for the root."""
        return self.parent

    def is_root(self):
        """Return ``True`` when the node has no parent."""
        return self.parent is None

    def is_leaf(self):
        """Return ``True`` when the node has an empty children collection."""
        return not self.children

    def get_depth(self):
        """Return the number of ancestors between this node and the root."""
        depth = 0
        ancestor = self.parent
        # Walk the parent chain iteratively so deep trees cannot hit the
        # interpreter's recursion limit.
        while ancestor is not None:
            depth += 1
            ancestor = ancestor.parent
        return depth

In [6]:
class Leaf(TreeNode):
    """Terminal tree node: it carries a value and starts with no children."""

    def __init__(self, value, parent):
        """Create a leaf holding ``value`` that hangs below ``parent``."""
        super().__init__(value=value, children=[], parent=parent)

In [7]:
class DecisionNode(TreeNode):
    """Internal tree node that tests one argument and has two branches.

    ``children`` always holds ``[left, right]``. ``left`` and ``right`` are
    properties that read and write those two slots, so assigning
    ``node.left = subtree`` keeps ``children`` (and therefore ``get_side``)
    consistent.
    """

    def __init__(self, argument, dtype="Binary", right=None, left=None, parent=None):
        """Create a decision node.

        Args:
            argument: Name of the argument tested at this node; stored as
                the node's value.
            dtype: Kind of test performed, stored in ``self.type``.
            right: Right subtree, or ``None`` if not attached yet.
            left: Left subtree, or ``None`` if not attached yet.
            parent: Parent node, or ``None`` for the root.
        """
        TreeNode.__init__(self, value=argument, children=[left, right], parent=parent)
        self.type = dtype

    @property
    def left(self):
        """Left subtree (slot 0 of ``children``)."""
        return self.children[0]

    @left.setter
    def left(self, node):
        self.children[0] = node

    @property
    def right(self):
        """Right subtree (slot 1 of ``children``)."""
        return self.children[1]

    @right.setter
    def right(self, node):
        self.children[1] = node

    def get_side(self):
        """Return which branch of its parent this node is.

        Returns:
            ``None`` for the root, ``'left'`` when this node is the parent's
            first child, ``'right'`` otherwise.
        """
        if self.is_root():
            return None
        # Identity, not equality: the question is whether this very object
        # occupies the parent's left slot.
        if self is self.parent.get_children()[0]:
            return 'left'
        return 'right'

In [8]:
class DecisionTree(object):
    """Binary decision tree over boolean argument columns of a DataFrame.

    Every split tests one argument column for ``True`` / ``False`` and is
    chosen by the lowest weighted Gini index with respect to the boolean
    result column. Leaves store the share of ``True`` and ``False`` results
    among the training rows that reached them.
    """

    def __init__(self, max_depth=2, result_label='label'):
        """Configure an untrained tree.

        Args:
            max_depth: Maximum number of decision levels between the root
                and a leaf; ``None`` means no limit.
            result_label: Name of the column holding the boolean result.
        """
        self.arguments = None
        self.max_depth = max_depth
        self.tree = None
        self.result_label = result_label
        self.arg_property = dict()

    @staticmethod
    def _binary_gini(positives, total):
        """Return the Gini impurity of a group with ``positives`` of ``total`` hits.

        An empty group yields ``1.0``; its weight in the split is zero, so
        the value never influences the weighted index.
        """
        if total == 0:
            return 1.0
        return 1 - (positives / total) ** 2 - ((total - positives) / total) ** 2

    def compute_gini_index(self, data, argument, result_label='label'):
        """Return the weighted Gini index of splitting ``data`` on ``argument``.

        Args:
            data: DataFrame containing ``argument`` and ``result_label``.
            argument: Name of the column to split on.
            result_label: Name of the result column.

        Returns:
            The size-weighted mean of the impurities of the ``False`` (left)
            and ``True`` (right) groups, or ``1.0`` when no row has a
            ``True``/``False`` value in ``argument``.
        """
        # ``== True`` / ``== False`` are deliberate: they are element-wise
        # comparisons on a Series, not truthiness tests.
        positive = data[result_label] == True  # noqa: E712
        goes_left = data[argument] == False  # noqa: E712
        goes_right = data[argument] == True  # noqa: E712

        left_N = int(goes_left.sum())
        right_N = int(goes_right.sum())
        total = left_N + right_N
        if total == 0:
            # Nothing to split: report the same worst-case score that is
            # used for an empty side, so this argument is never preferred.
            return 1.0

        left_gini = self._binary_gini(int((goes_left & positive).sum()), left_N)
        right_gini = self._binary_gini(int((goes_right & positive).sum()), right_N)

        return left_N / total * left_gini + right_N / total * right_gini

    def get_best_split(self, data, arguments):
        """Return ``(argument, gini)`` for the argument with the lowest index.

        Only a score strictly below ``1`` is accepted and ties keep the
        earliest argument. When no argument qualifies the result is
        ``(None, 1)``.
        """
        min_arg = None
        min_E = 1

        for arg in arguments:
            E = self.compute_gini_index(data, arg, self.result_label)
            if E < min_E:
                min_E = E
                min_arg = arg

        return min_arg, min_E

    def _filter_by_arg(self, data, argument, val):
        """Return the rows of ``data`` selected by ``val`` on ``argument``.

        Categorical arguments keep rows equal to ``val``, numerical ones keep
        rows greater than or equal to ``val``; any other recorded type
        returns ``None``.
        """
        kind = self.arg_property[argument]['type']
        if kind == 'categorical':
            return data[data[argument] == val]
        if kind == 'numerical':
            return data[data[argument] >= val]
        return None

    def train(self, data, arguments):
        """Fit the tree to ``data`` using the given argument columns.

        Args:
            data: Training DataFrame; must contain every argument column and
                the result column.
            arguments: Iterable of hashable column names usable for splits.

        Raises:
            ValueError: If ``data`` has no rows.
        """
        if data.shape[0] == 0:
            raise ValueError("cannot train a decision tree on an empty data set")

        self.arguments = arguments
        self._compute_arguments_properties(data)

        # De-duplicate while keeping the caller's order. A plain ``set``
        # would make tie-breaking between equally good arguments depend on
        # hash ordering and therefore vary between runs.
        ordered = list(dict.fromkeys(self.arguments))
        self.tree = self._build_tree(data, arguments=ordered, parent=None, level=0)

    def _build_tree(self, data, arguments, parent=None, level=0):
        """Recursively build the subtree for ``data``.

        Args:
            data: Rows that reached this position in the tree.
            arguments: Arguments still available for splitting.
            parent: Node the new subtree hangs below.
            level: Depth of the node being created (root is ``0``).

        Returns:
            A ``DecisionNode`` when a split is possible, otherwise a ``Leaf``.
        """
        candidates = list(arguments)
        depth_left = self.max_depth is None or level < self.max_depth

        best_arg = None
        if candidates and depth_left:
            best_arg, _ = self.get_best_split(data, candidates)

        if best_arg is None:
            # Out of arguments, out of depth, or no argument can split.
            return self._make_leaf(data, parent)

        node = DecisionNode(argument=best_arg, parent=parent)
        remaining = [arg for arg in candidates if arg != best_arg]

        data_false = data[data[best_arg] == False]  # noqa: E712
        data_true = data[data[best_arg] == True]  # noqa: E712

        node.left = self._build_branch(data_false, data, remaining, node, level + 1)
        node.right = self._build_branch(data_true, data, remaining, node, level + 1)
        # Keep the generic children list in step with left/right so that
        # get_children() and get_side() see the attached subtrees.
        node.children = [node.left, node.right]

        return node

    def _build_branch(self, subset, fallback, arguments, parent, level):
        """Build one branch; an empty ``subset`` becomes a leaf from ``fallback``."""
        if subset.shape[0] == 0:
            # No training row took this branch: predict with the
            # distribution observed at the parent instead of dividing by 0.
            return self._make_leaf(fallback, parent)
        return self._build_tree(subset, arguments=arguments, parent=parent, level=level)

    def _make_leaf(self, data, parent):
        """Return a ``Leaf`` holding the True/False result shares of ``data``.

        ``data`` must contain at least one row.
        """
        N = data.shape[0]
        n = data[data[self.result_label] == True].shape[0]  # noqa: E712
        return Leaf({'True': n / N, 'False': (N - n) / N}, parent=parent)

    def _compute_arguments_properties(self, data):
        """Record for each argument whether it is categorical or numerical.

        String and boolean columns are categorical and also get the array of
        their distinct values; every other column is numerical.
        """
        self.arg_property = dict()

        for argument in self.arguments:
            column = data[argument]
            if is_string_dtype(column) or is_bool_dtype(column):
                self.arg_property[argument] = {'type': 'categorical', 'classes': column.unique()}
            else:
                self.arg_property[argument] = {'type': 'numerical'}

    def predict(self, data):
        """Return the value of the leaf reached by one sample.

        Args:
            data: Mapping from argument name to a truthy or falsy value.

        Returns:
            The value stored on the reached leaf.
        """
        node = self.tree

        while not node.is_leaf():
            # Truthy values follow the right branch, falsy ones the left.
            node = node.right if data[node.get_value()] else node.left

        return node.get_value()

In [9]:
def print_tree(node):
    """Print the value of ``node`` followed by its subtrees, left one first.

    Each existing subtree of a non-leaf node is introduced by a ``left: `` or
    ``right: `` line; a ``Leaf`` prints only its own value.
    """
    print(node.value)

    if not isinstance(node, Leaf):
        # Look up each side only when its turn comes, so the left subtree
        # is fully printed before the right attribute is read.
        for heading, side in (('left: ', 'left'), ('right: ', 'right')):
            subtree = getattr(node, side)
            if subtree is not None:
                print(heading)
                print_tree(subtree)

In [10]:
# Create the classifier with its default settings
# (max_depth=2, result_label='label').
DC = DecisionTree()

In [11]:
# Toy training set: two boolean argument columns ('gender', 'rainy') and the
# boolean result column 'label', covering all four argument combinations.
data = pd.DataFrame({
    'gender': [True, True, False, False],
    'rainy': [True, False, True, False],
    'label': [False, True, True, False]
})

In [12]:
# Fit the tree on the toy data, allowing splits on both argument columns.
DC.train(data, ['rainy', 'gender'])

In [13]:
# Classify one sample given as a plain dict and print the value stored on
# the leaf it reaches.
print( DC.predict({'gender': True, 'rainy': True }) )

{'True': 0.0, 'False': 1.0}
