In [141]:
import pandas as pd
import numpy as np
import math

# read dog dataset, No is encoded as 0, Yes as 1, 
# bite refers to class 1, bark to class 0

data = pd.read_csv("dogs.csv") 
print(data)

   Heavy  Smelly  Big  Growling  Action
0      0       0    0         0       0
1      0       0    1         0       0
2      1       1    0         1       0
3      1       0    0         1       1
4      0       1    1         0       1
5      0       0    1         1       1
6      0       0    0         1       1
7      1       1    0         0       1


You can access individual columns by name to get a pandas series or convert it to a NumPy array:

In [2]:
print(data["Heavy"])
print(data["Heavy"].to_numpy())

0    0
1    0
2    1
3    1
4    0
5    0
6    0
7    1
Name: Heavy, dtype: int64
[0 0 1 1 0 0 0 1]


### What is the entropy of the target value 'Action' in the whole dataset? 
Define a function that calculates the entropy (for a list of 0/1 values):

In [103]:
def entropy(values):
    base = len(values)
    if base <= 1:
            return 0 
    
    count0=sum([1 for i in values if i == 0])
    count1=base-count0
    p0=count0 / base
    p1=count1 / base
    
    result = - (p0*math.log2(p0) if p0 > 0 else 0) - (p1*math.log2(p1) if p1 > 0 else 0)
    return result
    

In [105]:
def entropy(values):
        # could be empty  while trying for a split
        if values.size == 0:
            return 0
        
        ratio_ones = sum(values) / len(values)
        ratio_zeros = 1 - ratio_ones

        pd = [ratio_ones, ratio_zeros]
        if ratio_ones == 0 or ratio_zeros == 0:
            return 0
        return - sum( [ p_i * np.log2(p_i) for p_i in pd] )

Apply it to the "Action" column

In [106]:
print(entropy(data["Action"].to_numpy())) #Entropy of the target variable --> inital entropy of the dataset before any splits are made

0.954434002924965


### Which attribute would the ID3 algorithm choose to use for the root of the tree? What is its information gain? 

Iterate through all categories, make the potential split, compare the resulting entropies and take the greedy choice. You might need:

```
# iterating the columns 
for col in data.columns: 
    print(col)
```

and to filter entries of a dataframe that satisfy a certain condition, you might want to use:
```
data[data["Heavy"] == 0] # only those entries that are not heavy
```

In [107]:

for col in data.columns:
    print(f'{col}:{entropy((data[col] == 0).to_numpy())-entropy(data["Action"].to_numpy())}')
    print(f'{col}:{entropy((data[col] == 1).to_numpy())-entropy(data["Action"].to_numpy())}')
#Results indicate Growling as the best Attribute

Heavy:0.0
Heavy:0.0
Smelly:0.0
Smelly:0.0
Big:0.0
Big:0.0
Growling:0.04556599707503495
Growling:0.04556599707503495
Action:0.0
Action:0.0


###  Draw the full decision tree that would be learned for this data using ID3 without pruning.

Recursively apply the splitting procedure until all nodes are leaf nodes (= pure in the target). You may use the class `Node` as a start for your implementation and attributes as needed.

In [142]:
class Node:
    
    def entropy(self, values):
        # could be empty  while trying for a split
        if values.size == 0:
            return 0
        
        ratio_ones = sum(values) / len(values)
        ratio_zeros = 1 - ratio_ones

        pd = [ratio_ones, ratio_zeros]
        if ratio_ones == 0 or ratio_zeros == 0:
            return 0
        return - sum( [ p_i * np.log2(p_i) for p_i in pd] )
    
    def __init__(self, data, ancestor_features):
        # pure if there are all ones or all zeros 
        self.pure = sum(data["Action"]) == len(data) or  sum(data["Action"]) == 0
        self.split_feature = None
        self.data = data
        self.base_entropy = self.entropy(data["Action"].to_numpy())
        self.children = None
        # since we're using categorical (binary), each features should at most be used once in any branch
        self.ancestor_features = ancestor_features 
        # You can use these ancestor_features to get a nice identation:
        # print(" "*len(self.ancestor_features) + f"Initializing with data {data}")
        
    def split(self):
        # implement the split here
        features = [col for col in self.data.columns if col not in self.ancestor_features and col != "Action"]
        max_info_gain = -1
        best_feature = None
        
        for feature in features:
            for value in self.data[feature].unique():
                subset = self.data[self.data[feature] == value]
                entropy = self.entropy(subset["Action"].to_numpy())
                info_gain = self.base_entropy - entropy
                
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    best_feature = feature
                    best_threshold = value
        self.split_threshold = best_threshold
        if best_feature is not None:
            self.split_feature = best_feature
            self.children = []
            
            for value in self.data[best_feature].unique():
                subset = self.data[self.data[best_feature] == value]
                child = Node(subset, self.ancestor_features + [best_feature])
                self.children.append(child)
    def train(self):        
        if not self.pure:
            self.split()
            for child in self.children:
                # assume that we can get nodes pure by looking at our features
                # having exactly the same features but two different classes would not work here
                child.train()
                # now child is either pure or split into nodes that are eventually pure
# start training with a root node consisting of all data
    def predict(self, new_data):
        if self.pure:
            return self.data["Action"].iloc[0]
        else:
            # Andernfalls entscheide anhand des geteilten Merkmals und Schwellenwerts
            feature_value = new_data[self.split_feature]
            if feature_value <= self.split_threshold:
                # Wenn der Wert kleiner oder gleich dem Schwellenwert ist, gehe zum linken Kindknoten
                return self.children[0].predict(new_data)
            else:
                # Andernfalls gehe zum rechten Kindknoten
                return self.children[1].predict(new_data)
root = Node(data, [])
root.train()


### Test on three new dogs
Here's the test set, evaluate the predicted "Action" classes using your implementation and compare them to the true "Action" classes.

In [144]:
data = pd.read_csv("dogs_test.csv").to_numpy()

data = [{"Heavy":1,"Smelly":0,"Big":1,"Growling":1}, {"Heavy": 0, "Smelly": 0, "Big": 0, "Growling": 1},{"Heavy": 1, "Smelly": 1, "Big": 0, "Growling": 0}]
for data_point in data:
    print("Prediction:"+ str(root.predict(data_point)))


Prediction:0
Prediction:0
Prediction:1


### Repeat with the Gini coefficient
Just alter your previous implementations 