#Zadanie 4 (7 pkt)
Celem zadania jest zaimplementowanie algorytmu drzewa decyzyjnego ID3 dla zadania klasyfikacji. Trening i test należy przeprowadzić dla zbioru Iris. Proszę przeprowadzić eksperymenty najpierw dla DOKŁADNIE takiego podziału zbioru testowego i treningowego jak umieszczony poniżej. W dalszej części należy przeprowadzić analizę działania drzewa dla różnych wartości parametrów. Proszę korzystać z przygotowanego szkieletu programu, oczywiście można go modyfikować według potrzeb. Wszelkie elementy szkieletu zostaną wyjaśnione na zajęciach.

* Implementacja funkcji entropii - **0.5 pkt**
* Implementacja funkcji entropii zbioru - **0.5 pkt**
* Implementacja funkcji information gain - **0.5 pkt**
* Zbudowanie poprawnie działającego drzewa klasyfikacyjnego i przetestowanie go na wspomnianym wcześniej zbiorze testowym. Jeśli w liściu występuje kilka różnych klas, decyzją jest klasa większościowa. Policzenie accuracy i wypisanie parami klasy rzeczywistej i predykcji. - **4 pkt**
* Przeprowadzenie eksperymentów dla różnych głębokości drzew i podziałów zbioru treningowego i testowego (zmiana wartości argumentu test_size oraz usunięcie random_state). W tym przypadku dla każdego eksperymentu należy wykonać kilka uruchomień programu i wypisać dla każdego uruchomienia accuracy. - **1.5 pkt**

In [39]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
from typing import Tuple

iris = load_iris()

x = iris.data
y = iris.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=123)

In [42]:
def entropy_func(class_count: int, num_samples: int) -> float:
    probability = class_count / num_samples
    entropy = - probability * np.log(probability)
    return entropy

def split(data: np.ndarray, classes: np.ndarray, split_feature: int, split_val: float) -> Tuple[np.ndarray, np.ndarray]:
    dataset = np.c_[data, classes]
    feature_column = dataset[:, split_feature].astype(float)
    mask = feature_column >= split_val
    
    child_a = dataset[mask]
    child_b = dataset[~mask]
    child_a = np.delete(child_a, split_feature, axis=1)
    child_b = np.delete(child_b, split_feature, axis=1)
    return child_a, child_b

class Group:
    def __init__(self, group_classes):
        self.group_classes = group_classes
        self.entropy = self.group_entropy()

    def __len__(self) -> int:
        return len(self.group_classes)

    def group_entropy(self) -> float:
        entropy = 0
        class_counts = Counter(self.group_classes)
        num_samples = len(self)
        for group_class_count in class_counts.values():
            entropy += entropy_func(group_class_count, num_samples)
        return entropy

class Node:
    def __init__(self, split_feature=None, split_val=None, depth=None, child_node_a=None, child_node_b=None, val=None):
        self.split_feature = split_feature
        self.split_val = split_val
        self.depth = depth
        self.child_node_a = child_node_a
        self.child_node_b = child_node_b
        self.val = val

    def predict(self, data) -> int:
        if self.val is not None:
            return self.val
        elif data[self.split_feature] > self.split_val:
            return self.child_node_a.predict(data)
        else:
            return self.child_node_b.predict(data)

class DecisionTreeClassifier(object):
    def __init__(self, max_depth):
        self.depth = 0
        self.max_depth = max_depth
        self.tree = None

    @staticmethod
    def get_split_entropy(group_a: Group, group_b: Group) -> float:
        split_entropy = 0
        parent_group_count = len(group_a) + len(group_b)
        child_groups = [group_a, group_b]
        for group in child_groups:
            split_entropy += (len(group) / parent_group_count) * group.group_entropy()
        return split_entropy

    def get_information_gain(self, parent_group: Group, child_group_a: Group, child_group_b: Group) -> float:
        information_gain = parent_group.group_entropy() - self.get_split_entropy(child_group_a, child_group_b)
        return information_gain

    def get_best_feature_split(self, feature_values: np.ndarray, classes: np.ndarray) -> float:
        parent = Group(classes)
        possible_thresholds = np.unique(feature_values)
        best_split_val = 0
        best_gain = 0
        
        #print("Possible", possible_thresholds)
        for threshold in possible_thresholds:
            child_a, child_b = split(feature_values, classes, 0, threshold)
            if child_a.shape[0] == 0 or child_b.shape[0] == 0:
                continue
            child_a = Group(child_a[:, -1])
            child_b = Group(child_b[:, -1])
            gain = self.get_information_gain(parent, child_a, child_b)
            
            if gain >= best_gain:
                best_gain = gain
                best_split_val = threshold
            #print(f"Gain: {gain}")
        return best_split_val
    
    def get_best_split(self, data: np.ndarray, classes: np.ndarray) -> Tuple[int, float, float]:
        best_argument = 0
        best_split = 0
        best_gain = 0
        for argument in range(data.shape[1]):
            #print(data[:, argument])
            split_val = self.get_best_feature_split(data[:, argument], classes)
            #split_val = np.random.choice(data[:, argument])
            child_a, child_b = split(data, classes, argument, split_val)
            child_a = Group(child_a[:, -1])
            child_b = Group(child_b[:, -1])
            gain = self.get_information_gain(Group(classes), child_a, child_b)

            if gain >= best_gain:
                best_gain = gain
                best_argument = argument
                best_split = split_val
                
        return best_argument, best_split, best_gain

    def build_tree(self, data: np.ndarray, classes: np.ndarray, depth=0) -> 'Node':
        if depth == self.max_depth or len(set(classes)) == 1:
            return Node(val=Counter(classes).most_common(1)[0][0])

        best_argument, best_split, best_gain = self.get_best_split(data, classes)
        
        child_a_data, child_b_data = split(data, classes, best_argument, best_split)
        child_a_classes = child_a_data[:, -1]
        child_b_classes = child_b_data[:, -1]
        
        child_a_node = self.build_tree(child_a_data[:, :-1], child_a_classes, depth + 1)
        child_b_node = self.build_tree(child_b_data[:, :-1], child_b_classes, depth + 1)

        return Node(split_feature=best_argument, split_val=best_split, depth=depth, child_node_a=child_a_node, child_node_b=child_b_node)

    def fit(self, data: np.ndarray, classes: np.ndarray) -> None:
        self.tree = self.build_tree(data, classes)

    def predict(self, data: np.ndarray) -> int:
        if self.tree is not None:
            return self.tree.predict(data)
    
    def evaluate(self, data: np.ndarray, classes: np.ndarray, verbose=0):
        if self.tree is not None:
            samples = len(classes)
            positively_predicted = 0
            for i, subdataset in enumerate(zip(data, classes), 1):
                sample, actual_class = subdataset
                predicted_class = self.tree.predict(sample)
                if predicted_class == actual_class:
                    positively_predicted += 1
                if verbose == 1:
                    print(f"Predicting {i:>2}/{samples}\tPrediction: {predicted_class}\tActual class: {actual_class}")
            accuracy = positively_predicted / samples
            print()
            print(f"Accuracy after predicting {samples} samples: {accuracy:.2f}%")
                


In [43]:
dc = DecisionTreeClassifier(3)
dc.fit(x_train, y_train)
dc.evaluate(x_test, y_test, verbose=1)
    

Predicting  1/15	Prediction: 1.0	Actual class: 1
Predicting  2/15	Prediction: 2.0	Actual class: 2
Predicting  3/15	Prediction: 2.0	Actual class: 2
Predicting  4/15	Prediction: 1.0	Actual class: 1
Predicting  5/15	Prediction: 0.0	Actual class: 0
Predicting  6/15	Prediction: 1.0	Actual class: 2
Predicting  7/15	Prediction: 1.0	Actual class: 1
Predicting  8/15	Prediction: 0.0	Actual class: 0
Predicting  9/15	Prediction: 0.0	Actual class: 0
Predicting 10/15	Prediction: 1.0	Actual class: 1
Predicting 11/15	Prediction: 2.0	Actual class: 2
Predicting 12/15	Prediction: 0.0	Actual class: 0
Predicting 13/15	Prediction: 1.0	Actual class: 1
Predicting 14/15	Prediction: 2.0	Actual class: 2
Predicting 15/15	Prediction: 2.0	Actual class: 2

Accuracy after predicting 15 samples: 0.93%
