In [1]:
import logging
import pandas as pd
from math import sqrt as sqrt
from math import log2 as log2
import numpy as np
import copy
import datetime
import math
import re
from numpy.linalg import norm
from itertools import product, combinations
try:
    import cPickle as pickle
except ImportError:  # python 3.x
    import pickle

In [2]:
# Root logger used by every cell below; each record shows timestamp, thread id, level and message.
logger = logging.getLogger()
logging.basicConfig(format="%(asctime)s - %(thread)s - %(levelname)s - %(message)s")
logger.setLevel(logging.INFO)

## Background Needed to Better Understand the Concepts Explained Here:

#### Entropy And InformationGain (Pages:1-4):
https://www.math.unipd.it/~aiolli/corsi/0708/IR/Lez12.pdf

#### More Detail on Entropy And InformationGain:
https://www.cs.cmu.edu/~mgormley/courses/10601-s17/slides/lecture27-infotheory.pdf

#### Supervised Learning, Decision Tree Algorithm:

#### Decision Tree 1: how it works:
https://www.youtube.com/watch?v=eKD5gxPPeY0&list=PLBv09BD7ez_4temBw7vLA19p3tdQH6FYO&index=1

#### ID3 algorithm: how it works:
https://www.youtube.com/watch?v=_XhOdSLlE5c&list=PLBv09BD7ez_4temBw7vLA19p3tdQH6FYO&index=2

#### Decision Tree 3: which attribute to split on?:
https://www.youtube.com/watch?v=AmCV4g7_-QM&list=PLBv09BD7ez_4temBw7vLA19p3tdQH6FYO&index=3

#### Decision Tree 4: Information Gain:
https://www.youtube.com/watch?v=nodQ2s0CUbI&list=PLBv09BD7ez_4temBw7vLA19p3tdQH6FYO&index=4

#### Clustering Through/Via Decision Tree Construction:
###### Paper(Shorter Version):
     http://ftp.cse.buffalo.edu/users/azhang/disc/disc01/cd1/out/papers/cikm/p20.pdf
###### Paper(More Detailed Version):
     http://web.cs.ucla.edu/~wwc/course/cs245a/CLTrees.pdf

#### Paper Presentation Video (there are probably many, but I find this one cleaner than the others):
https://www.youtube.com/watch?v=9nisRs_vkGo&list=PL5dMgjRyXQMD4jqNet7A-2IDOFj4whfDM&index=2&t=780s
[At least watch until 11:30.]


In [3]:
def readPicklefile(path):
    """Load and return the object pickled at ``path``.

    Returns ``None`` (after logging an error) when the file cannot be read.
    NOTE: unpickling executes arbitrary code; only load files you trust.
    """
    try:
        with open(path, 'rb') as handle:
            return pickle.load(handle)
    except IOError:
        logger.error("Could not read the file at {}".format(path))
    return None
    
def writePickleFile(path, data):
    """Pickle ``data`` to ``path`` with the highest available protocol.

    Returns ``True`` on success, or ``False`` (after logging an error) when
    the file cannot be written.
    """
    succeeded = False
    try:
        with open(path, 'wb') as handle:
            pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        succeeded = True
    except IOError:
        logger.error("Could not write to the file at {}".format(path))
    return succeeded

In [4]:
def _relative_density(dataset):
    return float(dataset.length())/dataset.nr_virtual_points


In [5]:
class Data:
    '''Dataset backed by a NumPy structured array whose fields are all floats.

    Attributes:
        instance_values: the structured array (one record per instance).
        attr_types: list of ``(name, type)`` pairs; the first entry is the
            instance id and is not treated as an attribute.
        attr_idx / attr_names: column index per attribute name, and the
            attribute names in column order (id excluded).
        instance_view: 2-D float view of ``instance_values`` with shape
            ``(rows, columns)``; always 2-D, also for 0 or 1 rows.
        max_values / min_values: per-column limits (empty for an empty dataset).
        nr_virtual_points / nr_total_instances: initialised to the number of
            real points and twice that number respectively.
    '''
    def __init__(self, instance_values, types):
        self.instance_values = instance_values
        self.attr_types = types
        self.attr_idx = dict()
        self.attr_names = list()

        self._init_attr_names()
        self._init_max_min() # Could replace with self.calculate_limits()

        self.nr_virtual_points = len(self.instance_values)
        self.nr_total_instances = 2*self.nr_virtual_points

    def _build_view(self):
        '''Return the records as a 2-D float matrix, one row per instance.'''
        n_rows = len(self.instance_values)
        # reshape cannot infer "-1" for an empty array, so fall back to the declared column count.
        n_cols = -1 if n_rows > 0 else len(self.attr_types)
        return self.instance_values.view(dtype=float).reshape(n_rows, n_cols)

    def _init_max_min(self):
        '''Refresh ``instance_view`` and the per-column max/min values.'''
        self.instance_view = self._build_view()
        if len(self.instance_values) > 0:
            self.max_values = np.amax(self.instance_view, 0)
            self.min_values = np.amin(self.instance_view, 0)
        else:
            # No rows: there is nothing to reduce, keep empty limit arrays.
            self.max_values = np.empty(0, dtype=float)
            self.min_values = np.empty(0, dtype=float)

    def _init_attr_names(self):
        '''Register every column except the first (the id) as an attribute.'''
        for i, attr in enumerate(self.attr_types):
            if i < 1:
                continue
            attr_name, attr_type = attr
            self.attr_idx[attr_name] = i
            self.attr_names.append(attr_name)

    def __str__(self):
        s = 'Data: ' + str(len(self.instance_values)) + "\n"
        s += str(self.attr_names) + "\n"
        s += " Max :" + str(self.max_values)+ "\n"
        s += " Min :" + str(self.min_values)+ "\n"
        s += '\n--------\n'
        return s

    def calculate_limits(self):
        '''Recompute the per-column limits from the current instances.'''
        self._init_max_min()

    def sort(self, attribute):
        '''Sort the instances by ``attribute`` and refresh the float view.'''
        self.instance_values = np.sort(self.instance_values, order=attribute)
        self.instance_view = self._build_view()

    def length(self):
        '''Return the number of real instances.'''
        return len(self.instance_values)

    def getInstances(self, attribute):
        '''Return the values of ``attribute``.

        An array column for two or more rows, a one-element list for a single
        row and an empty list for an empty dataset.
        '''
        idx = self.attr_idx[attribute]
        n_rows = self.length()
        if n_rows > 1:
            return self.instance_view[:,idx]
        if n_rows == 1:
            return [self.instance_view[0,idx]]
        return []

    def getInstanceIndex(self, id):
        '''Return the row position of the instance with this id, or None if absent.'''
        if self.length() == 0:
            return None
        matches = np.flatnonzero(self.instance_view[:,0] == id)
        if matches.size == 0:
            return None
        return int(matches[0])

    def getId(self, idx):
        '''Return the id stored in row ``idx`` (-1 for an empty dataset).'''
        n_rows = self.length()
        if n_rows > 1:
            return self.instance_view[idx][0]
        if n_rows == 1:
            # A single-row dataset has only one possible id; idx is ignored.
            return self.instance_view[0][0]
        return -1

    def get_max(self, attribute):
        idx = self.attr_idx[attribute]
        return self.max_values[idx]

    def get_min(self, attribute):
        idx = self.attr_idx[attribute]
        return self.min_values[idx]

    def set_max(self, attribute, value):
        '''Override the stored maximum of ``attribute`` (no-op when there are no limits).'''
        if len(self.max_values) > 0:
            idx = self.attr_idx[attribute]
            self.max_values[idx] = value

    def set_min(self, attribute, value):
        '''Override the stored minimum of ``attribute`` (no-op when there are no limits).'''
        if len(self.min_values) > 0:
            idx = self.attr_idx[attribute]
            self.min_values[idx] = value


class DataFrameReader:
    """Turns a pandas DataFrame into a ``Data`` object.

    Every column is declared as float and a running float ``id`` column
    (0.0, 1.0, ...) is prepended to each row.
    """
    def __init__(self, df):
        self.df = df

    def read(self):
        """Build and return the ``Data`` object, logging row count and column types."""
        types = self._read_columnNames()
        rows = self._read_instances()
        data = Data(np.array(rows, dtype=types), types)

        logger.info("read " + str(len(self.df)))
        logger.info("attribute names: {}".format(data.attr_types))

        return data

    def _read_columnNames(self):
        """Return the structured dtype: ``("id", float)`` followed by one float field per column."""
        return [("id", float)] + [(col, float) for col in self.df.columns]

    def _read_instances(self):
        """Return one tuple per row: the running float id followed by the row's values."""
        return [
            (float(position),) + tuple(row.values)
            for position, (_, row) in enumerate(self.df.iterrows())
        ]

In [6]:
class DatasetSplitter:
    """Splits a sorted dataset at a row index and shares out its virtual points."""
    def __init__(self):
        pass

    def split(self, dataset, attribute, value, idx, adjust=False): # Changes Made
        """Split ``dataset`` into rows ``[0, idx)`` and ``[idx, end)``.

        The parent's virtual points are divided between the two halves in
        proportion to where ``value`` lies in the parent's range of
        ``attribute``. With ``adjust=True`` each half is then topped up so it
        has at least as many virtual points as real points, and the counts
        before and after are printed.

        Returns:
            ``(lhs_set, rhs_set)`` as ``Data`` objects.
        Raises:
            DatasetSplitter.UndefinedNumberOfPoints: when the right-hand side
                would receive a negative number of virtual points.
        """
        # Data.__init__ already computes the per-column limits of each half.
        lhs_set = Data(dataset.instance_values[0: idx], dataset.attr_types)
        rhs_set = Data(dataset.instance_values[idx:], dataset.attr_types)

        self._splitNrVirtualPoints(dataset, attribute, value, lhs_set, rhs_set)
        if adjust: # Changes Made
            self._printPointCounts("Before Adjusting The Number of Virtual Points:", lhs_set, rhs_set)
            self._updateVirtualPoints(lhs_set)
            self._updateVirtualPoints(rhs_set)
            self._printPointCounts("After Adjusting The Number of Virtual Points:", lhs_set, rhs_set)
        else:
            lhs_set.nr_total_instances = lhs_set.length() + lhs_set.nr_virtual_points
            rhs_set.nr_total_instances = rhs_set.length() + rhs_set.nr_virtual_points

        return lhs_set, rhs_set

    @staticmethod
    def _printPointCounts(heading, lhs_set, rhs_set):
        """Print the actual / virtual / total point counts of both halves under ``heading``."""
        print (heading)
        for side, subset in (("LHS", lhs_set), ("RHS", rhs_set)):
            print ("\t {} - Actual Points: {}, Virtual Points: {}, Total Number of Data Points: {}".format(
                side, subset.length(), subset.nr_virtual_points, subset.nr_total_instances))

    def _splitNrVirtualPoints(self, dataset, attribute, value, in_set, out_set):
        """Assign the parent's virtual points to ``in_set`` and ``out_set``.

        ``in_set`` receives the share ``(value-min+1)/(max-min+1)`` of the
        parent's virtual points, truncated to an int and never less than 1;
        ``out_set`` receives the remainder.
        """
        minV = dataset.get_min(attribute)
        maxV = dataset.get_max(attribute)
        share = (value-minV+1)/(maxV-minV+1)

        in_set.nr_virtual_points = max(int(abs(dataset.nr_virtual_points*share)), 1) # Changes Made
        out_set.nr_virtual_points = dataset.nr_virtual_points - in_set.nr_virtual_points
        # The lower bound of 1 above can exceed what the parent actually has.
        if out_set.nr_virtual_points < 0:
            self.raiseUndefinedNumberOfPoints()

        return

    def _updateVirtualPoints(self, data_set):
        """Raise the virtual-point count to at least the real-point count and refresh the total."""
        nr_points_in_set = data_set.length()
        data_set.nr_virtual_points = self._calcNumberOfPointsToAdd(nr_points_in_set, data_set.nr_virtual_points)
        data_set.nr_total_instances = nr_points_in_set + data_set.nr_virtual_points
        return

    def _calcNumberOfPointsToAdd(self, nr_points_in_node, nr_points_inherited):
        """Return the larger of the inherited virtual-point count and the real-point count."""
        return max(nr_points_inherited, nr_points_in_node)

    def raiseUndefinedNumberOfPoints(self):
        raise DatasetSplitter.UndefinedNumberOfPoints()
    class UndefinedNumberOfPoints(Exception):
        pass
    

In [7]:
class BuildTree(object):
    """Recursively builds the tree of ``CLNode`` objects.

    At every level the best information-gain cut of each attribute is
    computed, the cut whose lower-density side has the smallest relative
    density is kept, and the dataset is split on it. A side is only split
    further while it holds more than ``min_split`` instances.
    """
    _BANNER = "XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX"

    def __init__(self, min_split):
        self.cutCreator = InfoGainCutFactory(min_split)
        self.datasetSplitter = DatasetSplitter()
        self.min_split = min_split
        self.root = None
        self.maxNodeId = 0

    def build(self, dataset):
        """Build the tree for ``dataset`` and return its root node."""
        self._build_tree(dataset, None, 0)
        return self.root

    def _build_tree(self, dataset, parent, depth):
        """Create the node for ``dataset`` and recurse into both sides of its best cut."""
        print (self._BANNER)
        if parent is not None:
            print ("At previous level({}), the cut happened on Attribute:{}, Value: {}".
                   format(parent.depth, parent.attribute, parent.cutValue))
        else:
            print ("This is the starting point!")

        print ("Underlying Dataset: \n{}".format(dataset.__str__()))

        bestCut = self._findBestCut(dataset)
        cutValue = None if bestCut is None else bestCut.value
        attribute = "" if bestCut is None else bestCut.attribute

        if bestCut is not None:
            print ("At this level the best cut is found on Attribute: {}, at: {}".format(attribute, cutValue))
        else:
            print ("No cut is found at current level!")

        dt_node = self._register_node(dataset, parent, attribute, depth, cutValue)
        if self._isRootNode(depth): self.root = dt_node

        # Without a usable cut this node stays a leaf.
        if bestCut is None:
            return

        lhs_dataset, rhs_dataset = self._splitDatasetUsingBestCut(dataset, bestCut, adjust = True) # Changes Made

        # Left before right, so node ids follow a pre-order numbering.
        self._grow_child(lhs_dataset, dt_node, depth+1, "left")
        self._grow_child(rhs_dataset, dt_node, depth+1, "right")
        return

    def _register_node(self, dataset, parent, attribute, depth, cutValue):
        """Create a node, give it the next free id and attach it to ``parent`` (if any)."""
        node = CLNode(dataset, parent, attribute, depth, cutValue)
        node.setID(self.maxNodeId)
        self.maxNodeId+=1
        if parent is not None:
            parent.addChildNode(node)
        return node

    def _grow_child(self, subset, parent_node, depth, side):
        """Recurse into ``subset`` if it is large enough, otherwise attach it as a leaf."""
        if subset.length() > self.min_split:
            self._build_tree(subset, parent_node, depth)
            return

        print (self._BANNER)
        print ("Inadequate data points to create further split on {} child!".format(side))
        print ("Level: {}, Datapoints: \n{}".format(depth, subset.__str__()))
        self._register_node(subset, parent_node, None, depth, None)

    def _isRootNode(self, depth):
        """Return True only for the first node created at depth 0."""
        return depth == 0 and self.root is None

    def _splitDatasetUsingBestCut(self, dataset, bestCut, adjust): # Changes Made
        """Re-sort on the cut attribute and split right after the cut's marker instance."""
        dataset.sort(bestCut.attribute)
        idx = dataset.getInstanceIndex(bestCut.inst_id)

        lhs_set, rhs_set = self.datasetSplitter.split(dataset, bestCut.attribute, bestCut.value, idx+1, adjust)  # Changes Made
        lhs_set.calculate_limits()
        rhs_set.calculate_limits()

        return lhs_set, rhs_set

    def _findBestCut(self, dataset):
        """Return the best cut over all attributes of ``dataset``, or None if no attribute can be cut."""
        dataset.calculate_limits()
        bestCut = None
        print("Trying to find best cut among the following attributes: {}".format(list(dataset.attr_names)))
        for attribute in dataset.attr_names:
            dataset.sort(attribute) # Sorting the dataset based on a attribute value at a time.
            di_cut = self._calcCut(dataset, attribute) # Best cut on current attribute based on "Information Gain"
            if di_cut is None: # Ignore dimension go to the next dimension.
                continue

            bestCut = self._selectLowerDensityCut(di_cut, bestCut) # Keeping Track of best cut incrementally.

        return bestCut # Returning the best cut at this level and the local dataset, which is a subset of the entire dataset.

    def _calcCut(self, dataset, attribute):
        return self.cutCreator.cut(dataset, attribute) # Acquire the first cut on the dimension based on Information Gain

    def _selectLowerDensityCut(self, cut1, cut2):
        """Return the cut whose lower-density region is sparser; ``cut2`` wins ties."""
        if cut1 is None: return cut2
        if cut2 is None: return cut1
        rd1 = cut1.getRelativeDensityOfLowerDensityRegion() # Acquire the lower density region by cut1
        rd2 = cut2.getRelativeDensityOfLowerDensityRegion() # Acquire the lower density region by cut2
        print ("Relative Density of cut on Attribute: {}, at Value: {}, is {}".format(cut1.attribute, cut1.value, rd1))
        print ("Relative Density of cut on Attribute: {}, at Value: {}, is {}".format(cut2.attribute, cut2.value, rd2))
        if rd1 < rd2:
            print ("The lowest density cut is cut1")
            return cut1 # Return the cut which has the lower density region with the lowest relative density.
        else:
            print ("The lowest density cut is cut2")
            return cut2
        

In [8]:
class Cut:
    """A candidate split of a dataset on one attribute.

    Holds the attribute name, the cut value, the id of the instance at the
    cut position and the two resulting datasets (left- and right-hand side).
    """
    def __init__(self, attribute, value, inst_id, lhsset, rhsset):
        self.attribute = attribute
        self.value = value
        self.inst_id = inst_id
        self.lhs_set = lhsset
        self.rhs_set = rhsset

    def __str__(self):
        lines = [
            'Cut: ' + self.attribute,
            str(self.lhs_set.attr_names),
            " Max lhs:" + str(self.lhs_set.max_values),
            " Min lhs:" + str(self.lhs_set.min_values),
            " Max rhs:" + str(self.rhs_set.max_values),
            " Min rhs:" + str(self.rhs_set.min_values),
        ]
        return "\n".join(lines) + '\n--------\n'

    def getNonAdjacentRegion(self, value, attribute):
        """Return the side that is NOT adjacent to ``value`` (see getAdjacentRegion)."""
        adjacent = self.getAdjacentRegion(value, attribute)
        if adjacent is self.lhs_set:
            return self.rhs_set
        if adjacent is self.rhs_set:
            return self.lhs_set
        return None

    @staticmethod
    def _gap_to_value(region, attribute, value):
        """Distance from ``value`` to the nearer of the region's max and min on ``attribute``."""
        to_upper = abs(region.get_max(attribute) - value)
        to_lower = abs(region.get_min(attribute) - value)
        return min(to_upper, to_lower)

    def getAdjacentRegion(self, value, attribute):
        """Return the side whose boundary on ``attribute`` is closest to ``value``.

        The right-hand side wins ties. Both sides and their distances are
        printed for inspection.
        """
        rhs_distance = self._gap_to_value(self.rhs_set, attribute, value)
        lhs_distance = self._gap_to_value(self.lhs_set, attribute, value)
        print ("*****************")
        print (self.rhs_set)
        print (rhs_distance)
        print (self.lhs_set)
        print (lhs_distance)
        print (value)
        print ("*****************")
        return self.lhs_set if lhs_distance < rhs_distance else self.rhs_set

    def getRelativeDensityOfLowerDensityRegion(self):
        """Return the relative density of the sparser of the two sides."""
        return _relative_density(self.getLowerDensityRegion())

    def getLowerDensityRegion(self):
        """Return the side with the lower relative density; the left side wins ties.

        Raises:
            Cut.NoRegionsDefined: when either side is missing.
        """
        if self.lhs_set is None or self.rhs_set is None:
            self.raiseNoRegionsDefined()

        lhs_density = _relative_density(self.lhs_set)
        rhs_density = _relative_density(self.rhs_set)
        return self.rhs_set if lhs_density > rhs_density else self.lhs_set

    def raiseNoRegionsDefined(self):
        raise Cut.NoRegionsDefined()
    class NoRegionsDefined(Exception):
        pass

In [9]:
class InfoGainCutFactory:
    """Finds the best cut on a single attribute.

    Each distinct value below the attribute's maximum is tried as a cut
    point. A candidate replaces the current best only when both its
    information gain and its gain divided by the relative density of the
    sparser side are larger than the best seen so far.
    """
    def __init__(self, min_split):
        self.min_split = min_split
        self.datasetSplitter = DatasetSplitter()

    def cut(self, dataset, attribute):
        """Return the best ``Cut`` of ``dataset`` on ``attribute``, or None if there is none.

        The dataset is sorted on ``attribute`` as a side effect.
        """
        dataset.sort(attribute)
        di_cut = None
        max_info_gain = -1
        max_info_gain_weighted = float('-inf')
        instances = dataset.getInstances(attribute)
        for value in np.unique(instances):
            print ("Current Attribute: {}, Current Value: {}".format(attribute, value))
            # The data is sorted, so the last row holding `value` closes the left-hand side.
            i = np.argwhere(instances == value)[-1][0]

            if not self._hasRectangle(dataset, attribute, value):
                continue

            lhs_set, rhs_set = self.datasetSplitter.split(dataset, attribute, value, i+1)
            ig = self._info_gain(dataset, lhs_set, rhs_set)
            rdL = _relative_density(lhs_set)
            rdR = _relative_density(rhs_set)
            inverseWeight = min(rdL, rdR)
            weightedIg = ig/inverseWeight
            if ig > max_info_gain and weightedIg > max_info_gain_weighted:
                print("Max Info. Gain Modified! Att.: {}, Val: {}, new Max: {}, new Weighted Max IG: {}"\
                      .format(attribute, value, ig, weightedIg))
                max_info_gain = ig
                max_info_gain_weighted = weightedIg
                di_cut = Cut(attribute, value, dataset.getId(i), lhs_set, rhs_set)

        return di_cut

    def _hasRectangle(self, dataset, attribute, value):
        """Return False when the attribute is constant or ``value`` is its maximum, else True."""
        top = dataset.get_max(attribute)
        return not (top == dataset.get_min(attribute) or top == value)

    def _info_gain(self, dataset, lhs_set, rhs_set):
        """Return entropy(dataset) minus the size-weighted entropies of both sides.

        Returns -1 when either side has no instances (real plus virtual) at all.
        """
        print ("XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX")
        if (lhs_set.nr_total_instances < 1) or (rhs_set.nr_total_instances < 1):
            return -1

        ratio_instances_lhs = (float(lhs_set.nr_total_instances)/dataset.nr_total_instances)
        ratio_instances_rhs = (float(rhs_set.nr_total_instances)/dataset.nr_total_instances)
        print ("LHS Ratio: {}, RHS Ratio: {}".format(ratio_instances_lhs, ratio_instances_rhs))
        print("Calculating Entropy Within LHS:")
        entropyLHS = self._calc_entropy(lhs_set)
        print("Calculating Entropy Within RHS:")
        entropyRHS = self._calc_entropy(rhs_set)
        entropy2 = ratio_instances_lhs*entropyLHS + ratio_instances_rhs*entropyRHS
        print("\nCalculating Entropy of the Whole Dataset at Current Situation:")
        entropy1 = self._calc_entropy(dataset)
        print ("\nInformation-Gain if this cut is made: EntropyDataSet - (Weighted Sum of LHS and RHS Entropy) = {}".format(entropy1 - entropy2))
        print ("XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX")
        return (entropy1 - entropy2)

    @staticmethod
    def _plog2p(count, total):
        """Return ``p * log2(p)`` for ``p = count / total``, with the usual convention 0 * log2(0) = 0."""
        if count == 0:
            # log2(0) is -inf and 0 * -inf would poison the entropy with NaN.
            return 0.0
        share = float(count)/float(total)
        return share*np.log2(share)

    def _calc_entropy(self, dataset):
        """Return the two-class (actual vs. virtual points) entropy of ``dataset`` in bits."""
        # Changes Made
        nr_existing_instances = dataset.length()
        total = nr_existing_instances + dataset.nr_virtual_points
        terms = list()
        print("Calculating actual data-point's contribution in the Entropy of this subset of data!")
        print("Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ total-datapoints)")
        print ("- ({} ÷ {}) * log2({} ÷ {})".format(nr_existing_instances, total, nr_existing_instances, total))
        actualDataPointEntropyContribution = self._plog2p(nr_existing_instances, total)
        print ("Result: -1 * {}".format(actualDataPointEntropyContribution))
        terms.append(actualDataPointEntropyContribution)
        print("Calculating virtual data-point's contribution in the Entropy of this subset of data!")
        print("Equation: - (virtual-datapoints ÷ total-datapoints) * log2(virtual-datapoints ÷ total-datapoints)")
        print ("- ({} ÷ {}) * log2({} ÷ {})".format(dataset.nr_virtual_points, total, dataset.nr_virtual_points, total))
        virtualPointEntropyContribution = self._plog2p(dataset.nr_virtual_points, total)
        print ("Result: -1 * {}".format(virtualPointEntropyContribution))
        terms.append(virtualPointEntropyContribution)
        return sum(terms)*-1

In [10]:
class CLNode(object):
    """One node of the tree.

    Stores the node's dataset, its parent, the attribute and value it was cut
    on (``cutValue`` is None when no cut was made), its depth, its children
    (left child first) and cluster / pruning bookkeeping.
    """
    # Added cut to it. So that It can be reused. Cut can be represented by value and attribute.
    def __init__(self, dataset, parent, attribute, depth, cutValue):
        self.dataset = dataset
        self.parent = parent
        self.attribute = attribute
        self.cutValue = cutValue
        self.depth = depth
        self.children = list()
        self.can_prune = False
        self.includedInCluster = False
        self.clusterId = None
        self.nodeId = None
        self.touchedNodes = None
        self.modifiedLength = None

    def setPruneState(self, prune):
        self.can_prune = prune

    def isPrune(self):
        return self.can_prune

    def getRelativeDensity(self):
        """Return the node's relative density as a percentage."""
        return _relative_density(self.dataset)*100.0

    def getNrInstancesInNode(self):
        return self.dataset.length()

    def addChildNode(self, node):
        self.children.append(node)

    def getChildNodes(self):
        return self.children

    def getLeft(self):
        """Return the first child, or None for a leaf."""
        return self.children[0] if len(self.children) > 0 else None

    def getRight(self):
        """Return the second child, or None when there are fewer than two children."""
        return self.children[1] if len(self.children) > 1 else None

    def getParent(self):
        return self.parent

    def isLeaf(self):
        return len(self.children) == 0

    def setID(self, nodeId):
        """Assign the node id and refresh the dataset's per-column limits."""
        self.nodeId = nodeId
        self.dataset.calculate_limits()

    def getID(self):
        return self.nodeId

    def getTouchedNodes(self):
        return self.touchedNodes

    def addTouchedNodes(self, nodeId):
        """Append ``nodeId`` to the touched-node list, creating the list on first use."""
        if self.touchedNodes is None:
            self.touchedNodes = list()

        self.touchedNodes.append(nodeId)
        return

    def setModifiedDataLength(self, modifiedLength):
        self.modifiedLength = modifiedLength
        return

    def getModifiedDataLength(self):
        return self.modifiedLength

    def raiseAddNode(self):
        raise CLNode.AddNodeIlogical("hi")

    class AddNodeIlogical(Exception):
        pass

    def __str__(self):
        return 'Node: ' + '\n' + str(self.dataset.length()) + ' instances, '

    def str2(self):
        """Return the compact label used when drawing the tree: id, instance count and cluster id."""
        label = 'ID: {}'.format(self.nodeId)
        label += ' Inst: {} '.format(self.dataset.length())
        if self.clusterId is not None:
            # str() so that numeric cluster ids do not break the concatenation.
            label += str(self.clusterId)
        return label

In [11]:
class CLTree:
    """Entry point: owns the dataset, the built tree and the cluster bookkeeping.

    Raises:
        CLTree.UndefinedDataset: from the constructor when ``dataset`` is None.
    """
    def __init__(self, dataset, min_split=100):
        if dataset is None:
            self.raiseUndefinedDataset()
        self.dataset = dataset
        self.min_split = min_split
        self.clusters = dict()
        self.nodes = dict()
        self.clustersStatistics = None
        self.root = None

    def buildTree(self):
        """Build the tree for the dataset and index all of its nodes by node id."""
        b = BuildTree(self.min_split)
        self.root = b.build(self.dataset)
        self.nodes = self._collectAllNodes(self.root, dict())

    def getRoot(self):
        return self.root

    def getAllClusters(self):
        return self.clusters

    def getAllNodes(self):
        return self.nodes

    def getClustersStatistics(self):
        return self.clustersStatistics

    def setClustersStatistics(self, result):
        """Store ``result`` and rebuild the cluster-id -> node-ids mapping from the tree.

        Raises:
            CLTree.UndefinedTree: when the tree has not been built yet.
        """
        if self.root is None:
            self.raiseUndefinedTree()
        self.clustersStatistics = result
        self.clusters = self._collectAllClusters(self.root, dict())
        return

    def _collectAllClusters(self, node, allClusters):
        """Add the ids of clustered nodes below ``node`` to ``allClusters`` and return it.

        The walk stops at the first clustered node on each path; leaves that
        belong to no cluster are skipped.
        """
        if node.includedInCluster:
            allClusters.setdefault(node.clusterId, []).append(node.nodeId)
        else:
            # Iterating (instead of indexing children[0]/[1]) also handles unclustered leaves.
            for child in node.getChildNodes():
                self._collectAllClusters(child, allClusters)

        return allClusters

    def _collectAllNodes(self, node, allNodes):
        """Add every node below ``node`` to ``allNodes`` (keyed by node id) and return it."""
        # In-order Traversal
        children = node.getChildNodes()
        if children:
            self._collectAllNodes(children[0], allNodes)
        allNodes[node.nodeId] = node
        for child in children[1:]:
            self._collectAllNodes(child, allNodes)
        return allNodes

    def raiseUndefinedDataset(self):
        raise CLTree.UndefinedDataset()

    class UndefinedDataset(Exception):
        pass

    def raiseUndefinedTree(self):
        raise CLTree.UndefinedTree()

    class UndefinedTree(Exception):
        pass

In [12]:
# Load the semicolon-separated demo data and discard the leftover 'Unnamed: 0' column.
df = pd.read_csv('./ToDemotestData.csv', delimiter=';').drop(columns='Unnamed: 0')
print("Columns: {}, Shape: {}".format(list(df.columns), df.shape))

Columns: ['People', 'avgSpend'], Shape: (100, 2)


In [13]:
# Wrap the loaded DataFrame in the reader that converts it to the tree's Data format.
r = DataFrameReader(df)

In [14]:
# read() prepends a running float 'id' column and declares every column as float.
data = r.read()

2020-01-30 15:19:51,394 - 4731979200 - INFO - read 100
2020-01-30 15:19:51,395 - 4731979200 - INFO - attribute names: [('id', <class 'float'>), ('People', <class 'float'>), ('avgSpend', <class 'float'>)]


In [15]:
# A node is only split further while it holds more than min_split instances;
# here that threshold is 1% of the dataset, rounded up (np.ceil yields a float).
min_split = np.ceil(data.length()* 0.01)
cltree = CLTree(data, min_split)

In [16]:
# Build the tree; every candidate cut and entropy term is printed, so the output is long.
cltree.buildTree()

XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX
This is the starting point!
Underlying Dataset: 
Data: 100
['People', 'avgSpend']
 Max :[99. 10. 91.]
 Min :[ 0.  1. 21.]

--------

Trying to find best cut among the following attributes: ['People', 'avgSpend']
Current Attribute: People, Current Value: 1.0
XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX
LHS Ratio: 0.07, RHS Ratio: 0.93
Calculating Entropy Within LHS:
Calculating actual data-point's contribution in the Entropy of this subset of data!
Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ total-datapoints)
- (4 ÷ 14) * log2(4 ÷ 14)
Result: -1 * -0.5163871205878868
Calculating virtual data-point's contribution in the Entropy of this subset of data!
Equation: - (virtual-datapoints ÷ total-datapoints) * log2(virtual-datapoints ÷ total-datapoints)
- (10 ÷ 14) * log2(10 ÷ 14)
Result: -1 * -0.3467334479787441
Calculating 

Result: -1 * -0.5
Calculating virtual data-point's contribution in the Entropy of this subset of data!
Equation: - (virtual-datapoints ÷ total-datapoints) * log2(virtual-datapoints ÷ total-datapoints)
- (95 ÷ 190) * log2(95 ÷ 190)
Result: -1 * -0.5

Information-Gain if this cut is made: EntropyDataSet - (Weighted Sum of LHS and RHS Entropy) = 0.008275468952039677
XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX
Current Attribute: avgSpend, Current Value: 58.0
XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX
LHS Ratio: 0.7473684210526316, RHS Ratio: 0.25263157894736843
Calculating Entropy Within LHS:
Calculating actual data-point's contribution in the Entropy of this subset of data!
Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ total-datapoints)
- (76 ÷ 142) * log2(76 ÷ 142)
Result: -1 * -0.4826640145115728
Calculating virtual data-point's contribution in the Entropy of th

Result: -1 * -0.5057215970441366
Calculating virtual data-point's contribution in the Entropy of this subset of data!
Equation: - (virtual-datapoints ÷ total-datapoints) * log2(virtual-datapoints ÷ total-datapoints)
- (17 ÷ 23) * log2(17 ÷ 23)
Result: -1 * -0.3223341283353674

Information-Gain if this cut is made: EntropyDataSet - (Weighted Sum of LHS and RHS Entropy) = 0.012498666768408051
XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX
Max Info. Gain Modified! Att.: People, Val: 4.0, new Max: 0.012498666768408051, new Weighted Max IG: 0.062493333842040255
Current Attribute: People, Current Value: 5.0
XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX
LHS Ratio: 0.6521739130434783, RHS Ratio: 0.34782608695652173
Calculating Entropy Within LHS:
Calculating actual data-point's contribution in the Entropy of this subset of data!
Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ 

XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX----------XXXXXXXXXX
LHS Ratio: 0.7758620689655172, RHS Ratio: 0.22413793103448276
Calculating Entropy Within LHS:
Calculating actual data-point's contribution in the Entropy of this subset of data!
Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ total-datapoints)
- (23 ÷ 45) * log2(23 ÷ 45)
Result: -1 * -0.49490436058380494
Calculating virtual data-point's contribution in the Entropy of this subset of data!
Equation: - (virtual-datapoints ÷ total-datapoints) * log2(virtual-datapoints ÷ total-datapoints)
- (22 ÷ 45) * log2(22 ÷ 45)
Result: -1 * -0.5047393890940511
Calculating Entropy Within RHS:
Calculating actual data-point's contribution in the Entropy of this subset of data!
Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ total-datapoints)
- (6 ÷ 13) * log2(6 ÷ 13)
Result: -1 * -0.5148356388092012
Calculating virtual data-point's contribution in the Entro

Calculating Entropy Within LHS:
Calculating actual data-point's contribution in the Entropy of this subset of data!
Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ total-datapoints)
- (4 ÷ 6) * log2(4 ÷ 6)
Result: -1 * -0.38997500048077083
Calculating virtual data-point's contribution in the Entropy of this subset of data!
Equation: - (virtual-datapoints ÷ total-datapoints) * log2(virtual-datapoints ÷ total-datapoints)
- (2 ÷ 6) * log2(2 ÷ 6)
Result: -1 * -0.5283208335737187
Calculating Entropy Within RHS:
Calculating actual data-point's contribution in the Entropy of this subset of data!
Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ total-datapoints)
- (4 ÷ 10) * log2(4 ÷ 10)
Result: -1 * -0.5287712379549449
Calculating virtual data-point's contribution in the Entropy of this subset of data!
Equation: - (virtual-datapoints ÷ total-datapoints) * log2(virtual-datapoints ÷ total-datapoints)
- (6 ÷ 10) * log2(6 ÷ 10)
Result: -1

Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ total-datapoints)
- (1 ÷ 5) * log2(1 ÷ 5)
Result: -1 * -0.46438561897747244
Calculating virtual data-point's contribution in the Entropy of this subset of data!
Equation: - (virtual-datapoints ÷ total-datapoints) * log2(virtual-datapoints ÷ total-datapoints)
- (4 ÷ 5) * log2(4 ÷ 5)
Result: -1 * -0.2575424759098898

Calculating Entropy of the Whole Dataset at Current Situation:
Calculating actual data-point's contribution in the Entropy of this subset of data!
Equation: - (actual-datapoints ÷ total-datapoints) * log2(actual-datapoints ÷ total-datapoints)
- (4 ÷ 15) * log2(4 ÷ 15)
Result: -1 * -0.5085041588289383
Calculating virtual data-point's contribution in the Entropy of this subset of data!
Equation: - (virtual-datapoints ÷ total-datapoints) * log2(virtual-datapoints ÷ total-datapoints)
- (11 ÷ 15) * log2(11 ÷ 15)
Result: -1 * -0.328136583112229

Information-Gain if this cut is made: EntropyDataSet - (Wei

In [17]:
from collections import deque

from graphviz import Digraph, nohtml

class Queue:
    """Minimal first-in, first-out queue.

    Items are stored in ``self.items`` (a ``collections.deque``): new items
    enter on the left end and the oldest item leaves from the right end.
    A deque is used instead of a plain list because ``list.insert(0, ...)``
    has to shift every stored element on each enqueue, whereas
    ``deque.appendleft`` and ``deque.pop`` are both constant time.
    """

    def __init__(self):
        self.items = deque()

    def isEmpty(self):
        """Return True when the queue holds no items, False otherwise."""
        return not self.items

    def enqueue(self, item):
        """Add ``item`` to the back of the queue."""
        self.items.appendleft(item)

    def dequeue(self):
        """Remove and return the oldest item.

        Raises
        ------
        IndexError
            If the queue is empty.
        """
        return self.items.pop()

    def size(self):
        """Return the number of items currently queued."""
        return len(self.items)


def TreeVisualization(root, filename):
    """Draw the binary tree under ``root`` with Graphviz and open the result.

    The tree is walked breadth-first. Every visited node becomes a Graphviz
    record named ``node<k>`` (``k`` counts nodes in visiting order, the root
    being ``node0``) with three fields: ``f0`` (left port), ``f1`` (the text
    returned by the node's ``str2()``) and ``f2`` (right port). A parent's
    ``f0`` port is linked to its left child's ``f1`` and its ``f2`` port to
    its right child's ``f1``.

    Parameters
    ----------
    root : tree node
        Object providing ``isLeaf()``, ``getLeft()``, ``getRight()`` and
        ``str2()``.
    filename : str
        Passed to ``Digraph`` as the file name used for the graph source.

    Returns
    -------
    None
        Called for its side effect only: ``Digraph.view()`` is invoked once
        the whole tree has been added to the graph.
    """
    graph = Digraph('g', filename=filename, node_attr={'shape': 'record', 'height': '.1'})
    pending = Queue()
    next_id = 0

    def _add_node(tree_node):
        # Create the record for tree_node, queue it for expansion and return
        # the Graphviz name it was given.
        nonlocal next_id
        label = '<f0> |<f1> {}|<f2>'.format(tree_node.str2())
        name = 'node' + str(next_id)
        next_id += 1
        pending.enqueue((name, tree_node))
        graph.node(name, nohtml(label))
        return name

    _add_node(root)
    while not pending.isEmpty():
        parent_name, parent = pending.dequeue()
        if parent.isLeaf():
            continue

        # Left child first so that node numbering follows left-to-right
        # breadth-first order. A missing left child is skipped instead of
        # crashing on ``None.isLeaf()``.
        left = parent.getLeft()
        if left is not None:
            left_name = _add_node(left)
            graph.edge('{}:f0'.format(parent_name), '{}:f1'.format(left_name))

        right = parent.getRight()
        if right:
            right_name = _add_node(right)
            graph.edge('{}:f2'.format(parent_name), '{}:f1'.format(right_name))

    graph.view()

In [18]:
# Draw the CLTree starting from its root; './CLTreeApprox.gv' is the Graphviz
# file name and the rendered result is opened by the g.view() call inside.
TreeVisualization(cltree.root, './CLTreeApprox.gv')

In [20]:
# Largest element of cltree.nodes (the largest key if it is a dict).
# NOTE(review): presumably this is the highest node id in the tree - confirm.
max(cltree.nodes)

166

In [39]:
# Scratch data: 16 random integers drawn from [0, 100), arranged as a 4x4 matrix.
# NOTE(review): no seed is set in this cell, so the values presumably change on
# every run unless the generator was seeded earlier - confirm.
ar = np.reshape([np.random.randint(100) for i in range(16)], (4, -1))

In [42]:
# Show the matrix generated above.
ar

array([[80, 75, 49, 44],
       [71, 13, 42, 45],
       [80, 70,  7, 78],
       [51, 65, 25, 58]])

In [48]:
# Row index of the first row whose first column equals 80.
# np.argwhere returns one row per match; [0, 0] picks the first match and
# raises IndexError when no row matches.
np.argwhere(ar[:, 0] == 80)[0, 0]

0