In [129]:
import numpy as np
import pandas as pd

class Node:
    """One node of a 'Yes'/'No' decision tree that picks splits by information gain.

    Attributes:
        X: feature table for the samples at this node; iterated column by
            column in ``_get_gain`` (a pandas DataFrame in this notebook).
        y: labels for the same samples. A label equal to ``'Yes'`` counts as
            positive (1); every other value counts as negative (0).
        level: depth of the node, as supplied by the caller.
        left, right: child nodes; initialised to None and filled in by the
            code that builds the tree.
        feat, gain, entropy: best split feature, its information gain and the
            label entropy at this node; populated by ``update_node``.
        threshold: initialised to None; no method of this class uses it.
    """

    def __init__(self, X = None, y = None, level = None):
        self.X = X
        self.y = y
        self.level = level
        self.left = None
        self.right = None
        self.feat = None
        self.threshold = None
        self.gain = None
        self.entropy = None

    def _f_y(self):
        """Count the labels at this node, printing each non-zero count.

        Returns:
            ``[positives, negatives]`` where positives is the number of
            labels equal to ``'Yes'``.
        """
        n = len(self.y)
        a = sum(1 for label in self.y if label == 'Yes')
        b = n - a
        if (b != 0):
            print(f"Count of 0 = {b}")
        if (a != 0):
            print(f"Count of 1 = {a}")
        return [a, b]

    def _f_X(self, X, y, param = None):
        """Per-category label counts for a single feature column.

        Args:
            X: one feature column (pandas Series).
            y: labels matching ``X`` position by position. ``None`` falls
                back to ``self.y``.
            param: if given, only rows where ``X == param`` are counted.

        Returns:
            A list with one ``[positives, negatives]`` pair per distinct
            value of ``X``, in ``value_counts`` order.
        """
        labels = self.y if y is None else y
        # Build the mask on X's own index so boolean indexing lines up even
        # when X does not carry a default 0..n-1 index.
        is_yes = pd.Series([label == 'Yes' for label in labels],
                           index=X.index, dtype=bool)
        if param is not None:
            keep = (X == param)
            X = X[keep]
            is_yes = is_yes[keep]

        totals = X.value_counts()
        positives = X[is_yes].value_counts()
        groups = []
        for category in totals.index:
            # A category with no 'Yes' sample is absent from `positives`; it
            # must still be reported (as 0 positives), otherwise its samples
            # vanish from the weighted average in `_expected_inf`.
            pos = positives.get(category, 0)
            groups.append([pos, totals[category] - pos])
        return groups

    def _entropy(self, l):
        """Shannon entropy (base 2) of a list of class counts.

        Zero counts contribute nothing, so an all-zero list yields 0.
        """
        total = sum(l)
        h = 0
        for count in l:
            if count != 0:
                p = count / total
                h -= p * np.log2(p)
        return h

    def _get_gain(self):
        """Find the feature with the largest information gain.

        Returns:
            ``(feature, gain, entropy)``. The first feature wins ties. When
            ``self.X`` has no columns the result is ``('', -inf, entropy)``.
        """
        initial_entropy = self._entropy(self._f_y())
        maxGain = float('-inf')
        param = ''
        for column in self.X:
            ex_Inf = self._expected_inf(self._f_X(self.X[column], self.y))
            gain = initial_entropy - ex_Inf
            if (gain > maxGain):
                maxGain = gain
                param = column
        return param, maxGain, initial_entropy

    def _expected_inf(self, l):
        """Size-weighted average entropy over a list of per-category counts."""
        total = sum(sum(group) for group in l)
        if total == 0:
            # Nothing to average over (empty list or only empty groups).
            return 0
        inf = 0
        for group in l:
            inf += (sum(group) / total) * self._entropy(group)
        return inf

    def update_node(self):
        """Compute and store ``feat``, ``gain`` and ``entropy`` for this node."""
        self.feat, self.gain, self.entropy = self._get_gain()
        return

    def print_Node(self):
        """Print the node's level, entropy and either its split or a leaf notice.

        Reads ``self.entropy``, so ``update_node`` has to run first.
        """
        print(f"\nLevel {self.level}")
        print(f"Current Entropy at is = {self.entropy}")
        if (self.entropy <= 0):
            print("Reached leaf Node")
            return
        # The message says "gain ratio", but the value stored in self.gain is
        # the plain information gain computed in _get_gain.
        print(f"Splitting on feature = {self.feat} with gain ratio {self.gain}")
        return


class Tree:
    """Builds a decision tree of ``Node`` objects by recursive splitting.

    Attributes:
        X, y: the data handed to the constructor (stored, not used to build).
        level: always 0; the ``level`` constructor argument is accepted but
            not used.
        root: the top node of the most recently built tree, or None before
            ``make_tree`` has been called.
    """

    def __init__(self, X = None, y = None, level = None):
        self.X = X
        self.y = y
        self.level = 0
        self.root = None

    def make_tree(self, X, y, level = 0):
        """Build the tree for ``(X, y)``, remember its root and return it.

        Args:
            X: feature table (pandas DataFrame).
            y: labels aligned with ``X``.
            level: depth assigned to the top node.

        Returns:
            The top ``Node`` of the tree that was built.
        """
        top = self._grow(X, y, level)
        # Assign once, after the recursion has finished, so that self.root is
        # the real root instead of whichever node happened to be built last.
        self.root = top
        return top

    def _grow(self, X, y, level):
        """Recursively build the subtree for ``(X, y)`` rooted at depth ``level``."""
        print("\n")
        new_node = Node(X, y, level)
        new_node.update_node()
        print(f"Current Entropy at is = {new_node.entropy}")
        if (new_node.entropy <= 0):
            print("Reached leaf Node")
            return new_node
        if len(X.columns) == 0:
            # Every feature has already been used on this path but the labels
            # are still mixed: there is nothing left to split on.
            print("No features left to split on")
            return new_node
        print(f"Splitting on feature = {new_node.feat} with gain ratio {new_node.gain}")
        sub_cat = X[new_node.feat].unique()
        print(f"Sub-cat {sub_cat}")
        for category in sub_cat:
            # Nodes only have a left and a right slot: 'T' goes left, 'F' goes
            # right. Any other category value gets no subtree.
            if category not in ('T', 'F'):
                continue
            tiny_X, tiny_y = self.split(X, y, new_node.feat, category = category)
            child = self._grow(tiny_X, tiny_y, new_node.level + 1)
            if category == 'T':
                new_node.left = child
            else:
                new_node.right = child
        return new_node

    def split(self, X, y, feature = None, category = None):
        """Select the rows where ``X[feature] == category``.

        Returns:
            ``(X_subset, y_subset)`` with indices reset to 0..k-1 and the
            ``feature`` column removed from ``X_subset``. Returns ``0`` when
            ``feature`` is None.
        """
        if feature is None:
            return 0
        mask = X[feature] == category
        t_y = y[mask].reset_index(drop=True)
        t_X = X[mask].reset_index(drop=True).drop(feature, axis = 1)
        return t_X, t_y

In [131]:
if __name__ == '__main__':
    # Toy data set: two 'T'/'F' features and a 'Yes'/'No' label.
    data = dict(
        x1=['T', 'F', 'T', 'F'],
        x2=['T', 'T', 'F', 'F'],
        y=['Yes', 'Yes', 'Yes', 'No'],
    )
    df = pd.DataFrame(data)

    # First two columns are the features, the third one is the label.
    X = df.iloc[:, :2]
    y = df.iloc[:, 2]

    # Build the tree; `root` is the node returned by make_tree.
    arbol = Tree(X, y)
    root = arbol.make_tree(X, y)



Count of 0 = 1
Count of 1 = 3
Current Entropy at is = 0.8112781244591328
Splitting on feature = x1 with gain ratio 0.31127812445913283
Sub-cat ['T' 'F']


Count of 1 = 2
Current Entropy at is = 0.0
Reached leaf Node


Count of 0 = 1
Count of 1 = 1
Current Entropy at is = 1.0
Splitting on feature = x2 with gain ratio 1.0
Sub-cat ['T' 'F']


Count of 1 = 1
Current Entropy at is = 0.0
Reached leaf Node


Count of 0 = 1
Current Entropy at is = 0.0
Reached leaf Node


In [132]:
def inOrderPrint(r):
    """Print a tree in order: left subtree, then the node itself, then right subtree.

    For every node this prints its level, the non-zero 'Yes'/'No' label
    counts, its entropy, and either a leaf notice or the chosen split.

    Args:
        r: a node exposing ``left``, ``right``, ``level``, ``y`` (a pandas
            Series of labels), ``entropy``, ``feat`` and ``gain``; None ends
            the recursion.
    """
    if r is None:
        return

    inOrderPrint(r.left)

    print(f"Level {r.level}")
    # Count the labels once and look both classes up in the same result.
    counts = r.y.value_counts()
    yes_count = counts.get('Yes', 0)
    no_count = counts.get('No', 0)
    if (yes_count != 0):
        print(f"Count of 1 = {yes_count}")
    if (no_count != 0):
        print(f"Count of 0 = {no_count}")
    print(f"Current Entropy at is = {r.entropy}")
    if (r.entropy <= 0):
        print("Reached leaf Node\n")
    else:
        print(f"Splitting on feature = {r.feat} with gain ratio {r.gain}\n")

    inOrderPrint(r.right)

# Walk the tree returned by make_tree in the previous cell, left subtree first.
inOrderPrint(root)

Level 1
Count of 1 = 2
Currency Entropy at is 0.0
Reached leaf Node

Level 0
Count of 1 = 3
Count of 0 = 1
Currency Entropy at is 0.8112781244591328
Splitting on feature = x1with gain ratio 0.31127812445913283

Level 2
Count of 1 = 1
Currency Entropy at is 0.0
Reached leaf Node

Level 1
Count of 1 = 1
Count of 0 = 1
Currency Entropy at is 1.0
Splitting on feature = x2with gain ratio 1.0

Level 2
Count of 0 = 1
Currency Entropy at is 0.0
Reached leaf Node

