## 11. CART 

### We used the AllElectronic dataset to test our implementation (https://sites.google.com/site/labitis462/lab-tutorials)

In [13]:
import numpy as np
import pandas as pd
from itertools import chain, combinations

# the dataset has 4 feature columns followed by one binary target column
dataset = pd.read_csv('AllElectronic.csv')

temp = np.array(dataset)

x_train = []
y_train = []
x_test  = []
y_test  = []

# number of feature columns (every column except the trailing target);
# taken from the array shape so an empty file does not raise IndexError
col = temp.shape[1] - 1

# fraction of rows, counted from the top of the file, used for training
split = 0.9

# splitting the dataset: rows whose position is below len(temp)*split are
# training rows, the rest are test rows.  The comparison must be strict:
# with '<=' a whole-number boundary (e.g. 10 rows * 0.9 = 9.0) pushed one
# extra row into the training set and could leave the test set empty.
boundary = len(temp) * split
for position, row in enumerate(temp):
    features = list(row[:col])
    label = row[-1]
    if position < boundary:
        x_train.append(features)
        y_train.append(label)
    else:
        x_test.append(features)
        y_test.append(label)

feature_names = dataset.columns

# hashing the index of feature column numbers (column name -> position)
feature_index = {name: position for position, name in enumerate(feature_names)}

print("Features and target: ",feature_names)
print("Number of training samples: ", len(x_train))
print("Number of testing samples: ", len(x_test))

# max height of the tree (the root is created with height 1)
max_height = 6

# node to store the data of each node
class Node:
    """One node of the binary decision tree.

    Attributes:
        gini_val: gini value stored for this node (defaults to 1).
        feature_name: name of the feature this node splits on, or "Leaf"
            when the node does not split.
        left, right: child nodes, or None when there are none.
        height: depth of the node as assigned by whoever creates it.
        subset: feature values routed to the left branch; None or empty
            when the node does not split.
        class_names: per-class result/probability records for the samples
            at this node, or None when not yet computed.
        index: numeric label of the node, used only for display.
    """

    def __init__(self, gini_val = 1, feature_name="Leaf", left = None, right = None, height = 0, subset = None, class_names=None, index=0):
        self.feature_name = feature_name
        self.left = left
        self.right = right
        self.height = height
        self.gini_val = gini_val
        self.subset = subset
        self.class_names = class_names
        self.index = index

    def __str__(self):
        """Describe the node: split details when it has a subset, leaf summary otherwise."""
        # subset defaults to None, so it must be checked before len();
        # calling len(None) made str() of a fresh node raise TypeError
        if self.subset is not None and len(self.subset)>0:
            return "Node-> "+str(self.index)+", Split Feature: "+ str(self.feature_name)+", subset for left branch: "+str(self.subset)+", gini: "+str(self.gini_val)+", height: "+str(self.height)
        return "Leaf Node-> gini: "+str(self.gini_val)+" height: "+str(self.height)
            
# creating the root node: it starts at height 1 (create_tree only splits
# nodes whose height is below max_height) and carries index 1
root = Node(height=1,index=1)

# utility function listing every non-empty proper subset of a collection
def findsubsets(input_set):
    """Return all subsets of sizes 1 .. len(input_set)-1 as a list of tuples.

    Subsets are grouped by size (smallest first); the empty subset and the
    full set are not included, so a collection with fewer than two elements
    yields an empty list.
    """
    elements = list(input_set)
    found = []
    for size in range(1, len(input_set)):
        found.extend(combinations(elements, size))
    return found

# gini of a target list
def gini(target_list):
    """Return 1 minus the sum of squared class proportions in target_list.

    target_list must be a list (its count() method is used).  An empty list
    has no classes to subtract, so the result is 1.
    """
    total = len(target_list)
    impurity = 1

    for label in set(target_list):
        proportion = target_list.count(label)/total
        impurity -= proportion**2

    return impurity

# gini of a split of a feature
# (will return the minimum gini and corresponding subset from all subsets)
def gini_sub(column, target):
    """Find the two-way split of one feature column with the lowest weighted gini.

    Every non-empty proper subset of the column's distinct values is tried:
    rows whose value is in the subset form one group, the remaining rows the
    other, and each group's gini is weighted by its share of the rows.

    Returns:
        (best_gini, best_subset).  The starting value (1, []) is returned
        when no candidate subset scores below 1, e.g. when the column holds
        a single distinct value and there are no subsets to try.
    """
    best_gini = 1
    best_subset = []
    total = len(target)

    for candidate in findsubsets(set(column)):
        inside = []
        outside = []
        for position, label in enumerate(target):
            group = inside if column[position] in candidate else outside
            group.append(label)

        weighted_inside = len(inside)*gini(inside)/total
        weighted_outside = len(outside)*gini(outside)/total
        weighted = weighted_inside+weighted_outside

        # strict comparison: on ties the earlier candidate is kept
        if weighted < best_gini:
            best_gini = weighted
            best_subset = candidate

    return best_gini,best_subset

# a function to decide the optimal split in the given data
def decide_split(x,y):
    """Pick the feature and value subset whose split has the lowest gini.

    The gini of y itself is the baseline; a feature is selected only when
    its best split scores strictly below the best value seen so far.

    Returns:
        (gini, name, subset): the winning split's gini, feature name and
        left-branch subset, or (gini(y), "Leaf", []) when no split beats
        the baseline.
    """
    best = (gini(y), "Leaf", [])

    for position in range(len(x[0])):
        values = [row[position] for row in x]
        candidate_gini, candidate_subset = gini_sub(values, y)

        if candidate_gini < best[0]:
            best = (candidate_gini, feature_names[position], candidate_subset)

    return best

# function to train, create and populate the tree
def create_tree(node, x, y):
    """Recursively populate the tree below ``node`` from rows ``x`` and labels ``y``.

    Stores the class distribution, gini, split feature and left-branch subset
    on ``node``.  When the node is below ``max_height`` and decide_split()
    returns a non-empty subset, two children are created, the rows are
    partitioned between them (printed as they are routed) and the function
    recurses into each child.

    Args:
        node: Node to fill in; its height and index must already be set.
        x: list of feature rows reaching this node.
        y: list of class labels parallel to x (must be a list: count() is used).
    """
    # for testing purpose: class distribution of the samples at this node.
    # Computed before the depth check so that nodes stopped by max_height
    # also carry probabilities; previously they kept class_names=None and
    # predict() returned None when it ended on such a node.
    node.class_names = []
    for name in set(y):
        node.class_names.append({
            'result': name,
            'probability': y.count(name)/len(y),
        })

    if node.height >= max_height:
        # depth limit reached: record the node as a leaf and stop
        node.gini_val = gini(y)
        node.feature_name = "Leaf"
        node.subset = []
        return

    # decide the split on the data passed to the current node
    g, feature_name, subset = decide_split(x,y)

    node.gini_val = g
    node.feature_name = feature_name
    node.subset = subset

    # an empty subset means no split was chosen, so the node stays a leaf
    if len(node.subset) == 0:
        return

    sep_string = ("   ")*node.height
    ind = feature_index[feature_name]

    # children are numbered from the parent's index (2i and 2i+1) so every
    # node gets a distinct label; numbering from the height gave all nodes
    # on the same level the same pair of indices
    left = Node(height=node.height+1, index = 2*node.index)
    right = Node(height=node.height+1, index = 2*node.index+1)

    left_x = []
    left_y = []
    right_x = []
    right_y = []

    # splitting the dataset based on the subset chosen above
    for row, label in zip(x, y):
        if row[ind] in node.subset:
            left_x.append(row)
            left_y.append(label)
        else:
            right_x.append(row)
            right_y.append(label)

    node.left = left
    node.right = right

    # printing the node branches
    print("\n")
    print(sep_string,"Left branch of",node,"\n")
    for row, label in zip(left_x, left_y):
        print(sep_string,row,label)
    create_tree(node.left,left_x,left_y)

    print("\n")
    print(sep_string,"Right branch of",node,"\n")
    for row, label in zip(right_x, right_y):
        print(sep_string,row,label)
    create_tree(node.right,right_x,right_y)
    
# starting from the whole training set at the root node; create_tree fills
# the tree in place and prints the rows routed to each branch as it recurses
create_tree(root,x_train,y_train)    

# prediction function which shows the path on tree and probability of getting the right answer
def predict(root,x,y):
    """Walk the tree from ``root`` for feature row ``x`` and return the final node's class_names.

    The input row, its label ``y`` (used only for display) and each
    left/right step taken are printed along the way.
    """
    print("\ninput: ",x,y)
    print("path: root->",end='')

    current = root

    # descend until a node marked "Leaf" is reached
    while current.feature_name != "Leaf":
        column = feature_index[current.feature_name]

        # values in the node's subset go left, everything else goes right
        if x[column] in current.subset:
            step, current = "left->", current.left
        else:
            step, current = "right->", current.right
        print(step,end='')

    return current.class_names

print("\n\nprediction on test dataset")
# predict every held-out row rather than hard-coding rows 0 and 1, which
# raised IndexError with fewer than two test rows and skipped any others
for sample, label in zip(x_test, y_test):
    print("probability:",predict(root,sample,label))

Features and target:  Index(['age', 'income', 'student', 'credit_rating', 'buys_computer'], dtype='object')
Number of training samples:  26
Number of testing samples:  2


    Left branch of Node-> 1, Split Feature: student, subset for left branch: ('yes',), gini: 0.336996336996337, height: 1 

    ['senior', 'low', 'yes', 'fair'] yes
    ['senior', 'low', 'yes', 'excellent'] no
    ['middle', 'low', 'yes', 'excellent'] yes
    ['Youth', 'low', 'yes', 'fair'] yes
    ['senior', 'medium', 'yes', 'fair'] yes
    ['Youth', 'medium', 'yes', 'excellent'] yes
    ['middle', 'high', 'yes', 'fair'] yes
    ['middle', 'high', 'yes', 'fair'] yes
    ['middle', 'low', 'yes', 'excellent'] yes
    ['senior', 'low', 'yes', 'fair'] yes
    ['senior', 'low', 'yes', 'excellent'] no
    ['Youth', 'low', 'yes', 'fair'] yes
    ['Youth', 'medium', 'yes', 'excellent'] yes
    ['senior', 'medium', 'yes', 'fair'] yes


       Left branch of Node-> 2, Split Feature: age, subset for left branch: ('senior',), g