In [42]:
import numpy as np
import pandas as pd

class Node:
    def __init__(self, X = None, y = None, level = None):
        self.X = X
        self.y = y
        self.level = level
        self.left = None
        self.right = None
        self.feat = None
        self.threshold = None
        self.gain = None
        self.entropy = None
        
    def _f_y(self):
        _y = pd.Series([1 if i == 'Yes' else 0 for i in self.y])
        n = len(_y)
        a = sum(_y)
        b = n-a
        if (b != 0):
            print(f"Count of 0 = {b}")
        if (a !=  0):
            print(F"Count of 1 = {a}")
        return ([a,b])
        
    def _f_X(self, X, y, param = None):
        _y = pd.Series([True if i == 'Yes' else False for i in self.y])
        if (param != None):
            mask = (X == param)
            X = X[mask]
            _y = _y[mask]
        l = []
        a = X.value_counts()
        b = X[_y].value_counts()
        for i in a.index:
            if i in b:
                l.append([b[i], a[i]-b[i]])
        return(l)
        
    def _entropy(self, l):
        n = len(l)
        total = sum(l)
        h = 0
        for i in l:
            if i == 0:
                h -= 0
            else:
                h -= (i/total)*np.log2(i/total)
        return (h)
    
    def _get_gain(self):
        initial_entropy = self._entropy(self._f_y())
        maxGain = float('-inf')
        param = ''
        for i in self.X:
            ex_Inf = self._expected_inf(self._f_X(self.X[i], self.y))
            gain = initial_entropy - ex_Inf
            if (gain > maxGain):
                maxGain = gain
                param = i
        return param, maxGain, initial_entropy
    
    def _expected_inf(self, l):
        total = 0
        for i in l:
            total += sum(i)
        inf = 0
        for i in l:
            h = self._entropy(i)
            inf += (sum(i)/total)*h
        return (inf)
    
    def get_threshold(self):
        thresholds = self.X[self.feat].unique()
        less_entropy = float('inf')
        best_threshold = 0 
        for i in thresholds:
            mask_minus = self.X[self.feat] <= i
            mask_plus = self.X[self.feat] > i
            entropy_minus = self._entropy(self.y[mask_minus])
            entropy_plus = self._entropy(self.y[mask_plus])
            len_minus = len(self.y[mask_minus])
            len_plus = len(self.y[mask_plus])
            weighted_entropy = ((entropy_minus * len_minus) + (entropy_plus * len_plus)) / len(self.X)
            if weighted_entropy < less_entropy:
                less_entropy = weighted_entropy
                best_threshold = i
        return best_threshold
    
    def update_node(self):
        self.feat, self.gain, self.entropy = self._get_gain()
        self.threshold = self.get_threshold()
        return
    
    def print_Node(self):
        print(f"\nLevel {self.level}")
        print(f"Current Entropy at is = {self.entropy}")
        if (self.entropy <= 0):
            print("Reached leaf Node")
            return
        else:
            print(f"Splitting on feature = {self.feat} with gain ratio {self.gain}")
            return

In [43]:
class Tree:
    
    def __init__(self, X = None, y = None, level = None):
        self.X = X
        self.y = y
        self.level = 0
        self.root=None
    
    def make_tree(self, X, y, level = 0):
        print("\n")
        new_node = Node(X, y, level)
        new_node.update_node()
        self.root = new_node
        print(f"Current Entropy at is = {new_node.entropy}")
        if (new_node.entropy <= 0):
            print("Reached leaf Node")
            return new_node
        print(f"Splitting on feature = {new_node.feat} with gain ratio {new_node.gain}")
        mask_left = X[new_node.feat] <= threshold
        mask_right = X[new_node.feat] > threshold
        new_node.left = self.make_tree(X[mask_left], y[mask_left], level = new_node.level+1)
        new_node.right = self.make_tree(X[mask_right], y[mask_right], level = new_node.level+1)
        return new_node
            
    def split(self, X, y, feature = None, category = None):
        if feature == None:
            return 0
        mask = X[feature] == category
        t_y = y[mask].reset_index(drop=True)
        t_X = X[mask].reset_index(drop=True)
        t_X = t_X.drop(feature, axis = 1)
        return t_X, t_y

In [44]:
def inOrderPrint(r):
    if r is None:
        return 
    else :
        inOrderPrint(r.left)
        print(f"Level {r.level}")
        yes_count = r.y.value_counts().get('Yes', 0)
        no_count = r.y.value_counts().get('No', 0)
        if (yes_count != 0):
            print(f"Count of 1 = {yes_count}")
        if (no_count != 0):
            print(F"Count of 0 = {no_count}")
        print(f"Currency Entropy at is {r.entropy}")
        if (r.entropy <= 0):
            print("Reached leaf Node\n")
        else:
            print(f"Splitting on feature = {r.feat}with gain ratio {r.gain}\n")
        inOrderPrint(r.right)

In [53]:
from sklearn.datasets import load_iris
import pandas as pd
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

X = df.copy()
y = df['target'] .copy()

In [55]:
tree = Tree(X, y)
root = tree.make_tree(X, y)




Count of 0 = 150
Current Entropy at is = 0.0
Reached leaf Node


In [54]:
type(y)

pandas.core.series.Series

In [58]:
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int32

In [56]:
root.y.value_counts()

target
0    50
1    50
2    50
Name: count, dtype: int64

In [57]:
inOrderPrint(root)

Level 0
Currency Entropy at is 0.0
Reached leaf Node

