# Decision Trees from Scratch

In [15]:
import numpy as np

In [16]:
# Toy dataset: every row holds three numeric feature values followed by a
# text class label. Because numbers and text are mixed, NumPy stores the
# whole table under a single string dtype; the numeric columns are parsed
# back into floats when the dataset is encoded.
data = np.array([
    (12.0, 1.5, 1, 'Wine'),
    (5.0, 2.0, 0, 'Beer'),
    (40.0, 0.0, 1, 'Whiskey'),
    (13.5, 1.2, 1, 'Wine'),
    (4.5, 1.8, 0, 'Beer'),
    (38.0, 0.1, 1, 'Whiskey'),
    (11.5, 1.7, 1, 'Wine'),
    (5.5, 2.3, 0, 'Beer'),
])



In [17]:
# Inspect the array: the mixed numeric/text rows end up as strings under a
# single string dtype (see the dtype in the output).
data

array([['12.0', '1.5', '1', 'Wine'],
       ['5.0', '2.0', '0', 'Beer'],
       ['40.0', '0.0', '1', 'Whiskey'],
       ['13.5', '1.2', '1', 'Wine'],
       ['4.5', '1.8', '0', 'Beer'],
       ['38.0', '0.1', '1', 'Whiskey'],
       ['11.5', '1.7', '1', 'Wine'],
       ['5.5', '2.3', '0', 'Beer']], dtype='<U32')

# Encode the Dataset

In [18]:
# Every column except the last is a numeric feature stored as text; parse
# them into a float32 feature matrix.
X = data[:, :-1].astype(np.float32)

# Class name -> integer code, plus the inverse mapping derived from it so the
# two dictionaries cannot drift apart.
label_encoding = {'Beer': 0, 'Wine': 1, 'Whiskey': 2}
label_decoding = {code: name for name, code in label_encoding.items()}

# Build a fresh integer array rather than assigning codes into `data[:, -1]`:
# that slice is a view, so writing through it would overwrite the label
# column of `data` and make this cell fail with a KeyError when re-run.
y = np.array([label_encoding[label] for label in data[:, -1]], dtype=np.int32)



# Gini Impurity


In [19]:
def giniImpurity(labels):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of
    entries belonging to class ``k``.

    Parameters
    ----------
    labels : array-like
        Class labels; anything ``np.unique`` can count.

    Returns
    -------
    float
        0 for a pure (single-class) or empty collection, growing as the
        classes become more evenly mixed.
    """
    n = np.size(labels)
    if n == 0:
        # An empty partition contains no class mix at all, so treat it as
        # pure instead of reporting the maximum impurity of 1.
        return 0.0

    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / n

    return 1.0 - np.sum(probabilities ** 2)





# Best Split Finder

In [20]:
def bestSplitFinder(X,y):
    """Find the feature/threshold split with the lowest weighted Gini impurity.

    For every column of ``X`` the candidate thresholds are the midpoints
    between consecutive distinct values (or the single value itself when the
    column is constant). A candidate sends rows with ``value > threshold`` to
    the "yes" side and all other rows to the "no" side, and is scored by the
    size-weighted average of ``giniImpurity`` over the two sides.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one row per sample and one column per feature.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(feature_index, threshold, weighted_impurity)`` of the best split.
        Ties go to the lowest threshold within a feature and to the lowest
        feature index across features.

    Raises
    ------
    ValueError
        If ``X`` has no rows or no columns, or if ``y`` does not have one
        label per row of ``X``.
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]
    if n_samples == 0 or n_features == 0:
        raise ValueError("bestSplitFinder needs at least one sample and one feature")
    if len(y) != n_samples:
        raise ValueError("y must contain exactly one label per row of X")

    features_min_impurities = []
    features_corresponding_thresholds = []

    for col in range(n_features):
        vals = X[:, col]
        unique_vals = np.unique(vals)

        if np.size(unique_vals) > 1:
            # Midpoints between neighbouring distinct values.
            thresholds = (unique_vals[:-1] + unique_vals[1:]) / 2
        else:
            # Constant column: keep its only value as a (non-separating)
            # candidate so that every feature still reports an impurity.
            thresholds = unique_vals

        best_threshold = None
        best_impurity = None
        for threshold in thresholds:
            goes_yes = vals > threshold
            labels_yes = y[goes_yes]
            labels_no = y[~goes_yes]

            yes_len = np.size(labels_yes)
            no_len = np.size(labels_no)

            avg_impurity = (
                yes_len * giniImpurity(labels_yes) + no_len * giniImpurity(labels_no)
            ) / (yes_len + no_len)

            # Strict '<' keeps the first (lowest) threshold on ties.
            if best_impurity is None or avg_impurity < best_impurity:
                best_impurity = avg_impurity
                best_threshold = threshold

        features_min_impurities.append(best_impurity)
        features_corresponding_thresholds.append(best_threshold)

    res_feature = np.argmin(features_min_impurities)
    res_threshold = features_corresponding_thresholds[res_feature]

    return res_feature, res_threshold, features_min_impurities[res_feature]

In [21]:
# Search the full dataset for the best first split, then show the chosen
# feature index, its threshold and the weighted Gini impurity, one per line.
res_feature, res_threshold, best_gini = bestSplitFinder(X, y)
print(res_feature, res_threshold, best_gini, sep='\n')

0
8.5
0.3


# Decision Tree Node

In [22]:
class Node:
    """One node of the decision tree.

    A node either describes a split (``feature_index``, ``threshold`` and the
    two child nodes ``left`` / ``right``) or carries a prediction in
    ``value``. Every attribute defaults to ``None``.
    """

    def __init__(
        self,
        feature_index=None,
        threshold=None,
        left=None,
        right=None,
        value=None,
    ):
        # Split description.
        self.feature_index, self.threshold = feature_index, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Prediction carried by the node, if any.
        self.value = value

    def is_leaf(self):
        """Return the node's stored ``value`` (``None`` when none was given).

        Note that this is the value itself, not a boolean: a node whose
        value is ``0`` yields a falsy result, so do not use the return value
        as a truth test.
        """
        return self.value
    







# Building the Decision Tree

In [23]:

def build(features,labels,cur_depth,max_depth=100):
    """Recursively grow a decision tree and return its root ``Node``.

    Parameters
    ----------
    features : numpy.ndarray
        2-D feature matrix, one row per sample.
    labels : array-like of non-negative ints
        Class code of every row of ``features``.
    cur_depth : int
        Depth of the node being built (0 for the root).
    max_depth : int, optional
        Depth at which growth stops and a leaf is created.

    Returns
    -------
    Node
        A leaf holding the majority class in ``value``, or an internal node
        whose ``left`` child receives the rows with
        ``feature > threshold`` and whose ``right`` child receives the rest.

    Raises
    ------
    ValueError
        If ``labels`` is empty.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        raise ValueError("cannot build a tree node from an empty label array")

    # Stop on a pure node or once the depth budget is used up. '>=' also
    # terminates when the caller starts below max_depth by mistake.
    if len(np.unique(labels)) == 1 or cur_depth >= max_depth:
        return Node(value=np.bincount(labels).argmax())

    best_feature, best_threshold, _ = bestSplitFinder(features, labels)

    goes_left = features[:, best_feature] > best_threshold
    if goes_left.all() or not goes_left.any():
        # No threshold separates the samples (for example identical rows
        # carrying different labels). Recursing would hand an empty subset to
        # the split search, so fall back to a majority-class leaf.
        return Node(value=np.bincount(labels).argmax())

    left_tree = build(features[goes_left], labels[goes_left], cur_depth + 1, max_depth)
    right_tree = build(features[~goes_left], labels[~goes_left], cur_depth + 1, max_depth)

    return Node(best_feature, best_threshold, left_tree, right_tree)


    


    
    

In [24]:
# Depth limit handed to build(); the root is built at depth 0.
max_depth = 100
tree_root = build(X, y, cur_depth=0, max_depth=max_depth)

# Testing 

In [25]:
test_data = np.array([
    [6.0, 2.1, 0],   # Expected: Beer
    [39.0, 0.05, 1], # Expected: Whiskey
    [13.0, 1.3, 1]   # Expected: Wine
])


predictions = []

for test_x in test_data:
    # Start at the root and descend until a node without two children
    # (a leaf) is reached, printing the split taken at every level.
    node = tree_root
    depth = 0
    while node.left is not None and node.right is not None:
        print(f'depth {depth}  Feature {node.feature_index}   Threshold {node.threshold}')
        depth += 1
        # Same convention as tree construction: values above the threshold
        # follow the left branch, everything else the right branch.
        if test_x[node.feature_index] > node.threshold:
            node = node.left
        else:
            node = node.right

    # Read the class code straight from the leaf's `value` attribute and
    # translate it back to the class name.
    print(node.value)
    predictions.append(label_decoding[node.value])

depth 0  Feature 0   Threshold 8.5
0
depth 0  Feature 0   Threshold 8.5
depth 1  Feature 0   Threshold 25.75
2
depth 0  Feature 0   Threshold 8.5
depth 1  Feature 0   Threshold 25.75
1


In [26]:
# Show the decoded class name predicted for each test row, in input order.
print(predictions)

['Beer', 'Whiskey', 'Wine']
