# Creating decision tree from the ground up

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [38]:
class Node:
    def __init__(self, attribute, value, left=None, right=None, leaf=False):
        self.attribute = attribute
        self.value = value
        self.left = left
        self.right = right
        self.leaf = leaf
    
class DecisionTreeLearning:
    def __init__(self, data, depth):
        self.data = data
        self.depth = depth
        self.build_tree()
        
    def calculate_entropy(self, sub_data):
        room_nrs = np.unique(sub_data[:,7])
        rooms = len(room_nrs)
        entropy = 0
        for room_nr in room_nrs:
            entropy += -(np.sum(sub_data[:,7]==room_nr)/rooms) * (np.log2(np.sum(sub_data[:,7]==room_nr)/rooms))
        return entropy
        
        
    def find_split(self):
        # X, y = self.data[:,0:7], self.data[:,7]
        attribute_len = self.data[:,0:7].shape[1]
        room_nrs = np.unique(self.data[:,7])

        overall_entropy = self.calculate_entropy(self.data)
        min_entropy_tracker = []
        
        for i in range(attribute_len):
            min_entropy = math.inf
            for data_point in sorted(np.unique(self.data[:,0:7][:,i]))[0:-1]:
                left_data = self.data[self.data[:,i]<=data_point]
                right_data = self.data[self.data[:,i]>data_point]
                left_entropy = self.calculate_entropy(left_data)
                right_entropy = self.calculate_entropy(right_data)
                weighted_entropy = (left_entropy+right_entropy)/2
                if weighted_entropy < min_entropy:
                    min_entropy = weighted_entropy
                    split_point = data_point
            min_entropy_tracker.append((split_point, min_entropy, i))
            
        print(min_entropy_tracker)
        
        best_ent = math.inf
        best_split_point = float()
        attribute_col_nr = int()
        for point, ent, col in min_entropy_tracker:
            if ent < best_ent:
                best_ent = ent
                best_split_point = point
                attribute_col_nr = col
                
        print(best_split_point, best_ent, attribute_col_nr)
        # return best_split_point
                    
    def build_tree(self):
        self.find_split()

In [39]:
data = np.loadtxt("../data/wifi_db/clean_dataset.txt")
X, y = data[:,0:7], data[:,7]
print(X.shape, y.shape)

(2000, 7) (2000,)


In [40]:
cls = DecisionTreeLearning(data, 4)

[(-42.0, -2978.5553572542512, 0), (-70.0, -1740.67196145728, 1), (-44.0, -1760.1154608949773, 2), (-40.0, -2537.65641921469, 3), (-54.0, -3298.505309620094, 4), (-73.0, -2541.4825470856317, 5), (-73.0, -2350.9971639226833, 6)]
-54.0 -3298.505309620094 4


In [31]:
data[data[:,0]<-71]

array([[-72., -63., -64., -70., -81., -85., -87.,   1.],
       [-73., -62., -66., -65., -81., -87., -89.,   1.],
       [-73., -61., -65., -65., -77., -89., -90.,   1.],
       [-73., -63., -65., -70., -80., -89., -93.,   1.],
       [-74., -62., -66., -70., -76., -89., -87.,   1.],
       [-72., -62., -63., -76., -81., -84., -91.,   1.],
       [-72., -59., -65., -75., -81., -84., -91.,   1.]])

In [15]:
data[0]<-68

array([False, False, False, False,  True,  True,  True, False])

In [21]:
math.inf

inf