<a href="https://colab.research.google.com/github/Renzou1/treinamento-h2ia/blob/main/08_arvore_de_decisao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Árvore de Decisão

In [1]:
from sklearn import datasets
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import math

In [2]:
def min_max_operation(value, min_val, max_val):
    return (value - min_val) / (max_val - min_val)

In [3]:
def min_max_normalization(array):
  rows, cols = array.shape

  for c in range(cols):
    min_val = np.min(array[:, c])
    max_val = np.max(array[:, c])

    for r in range(rows):
      array[r][c] = min_max_operation(array[r][c], min_val, max_val)

In [4]:
def get_entropy(proportion_list):
    non_zero_proportions = proportion_list[proportion_list > 0]
    return -np.sum(non_zero_proportions * np.log2(non_zero_proportions))

In [5]:
class Node:
  def __init__(self, attribute=None, comparison_value=None):
    self.attribute = attribute
    self.comparison_value = comparison_value
    self.true_child = None
    self.false_child = None
    self.leaf_node = False

  def add_true_child(self, true_child):
    self.true_child = true_child

  def add_false_child(self, false_child):
    self.false_child = false_child

  def turn_into_leaf(self, answer):
    self.answer = answer
    self.leaf_node = True

  def add_true_leaf(self, answer):
    self.true_child = Node()
    self.true_child.leaf_node = True
    self.true_child.answer = answer

  def add_false_leaf(self, answer):
    self.false_child = Node()
    self.false_child.leaf_node = True
    self.false_child.answer = answer

In [30]:
def decision_tree(dataset, target, father_node=None, condition=False, minimum_for_splitting=20):

  rows, cols = dataset.shape
  minimum = minimum_for_splitting

  if rows <= minimum or np.all(target == target[0]):
    count_array = np.bincount(target)
    answer = np.where(count_array == np.max(count_array))[0][0]
    if condition == True:
      father_node.add_true_leaf(answer)
    else:
      father_node.add_false_leaf(answer)
    return


  classes = [0,1,2]

  best_entropy = np.log2(len(classes)) #maximum entropy
  best_attribute = 0
  best_step = 0

  ## CALCULATING BEST ENTROPY
  for attribute in range(cols):
    min_val = np.min(dataset[:, attribute])
    max_val = np.max(dataset[:, attribute])

    for step in np.linspace(0, 1, 6):
      if step < min_val:
        continue
      if step > max_val:
        break

      condition_array = dataset[:, attribute] < step
      counter_list = np.zeros(len(classes))
      proportion_list = np.zeros(len(classes))

      for c in classes:
        counter_list[c] = len([idx for idx, x in enumerate(condition_array) if x == True and target[idx] == c])

      if counter_list.sum() < minimum or (rows - counter_list.sum()) < minimum:
        continue
      if counter_list.sum():
        for c in classes:
          proportion_list[c] = counter_list[c] / counter_list.sum()

        entropy = get_entropy(proportion_list)
        if entropy < best_entropy:
          best_entropy = entropy
          best_attribute = attribute
          best_step = step


  if best_entropy == np.log2(len(classes)): #worst possible entropy
    count_array = np.bincount(target)
    answer = np.where(count_array == np.max(count_array))[0][0]
    if condition == True:
      father_node.add_true_leaf(answer)
    else:
      father_node.add_false_leaf(answer)
    return
  ## SETTING NEW SUB DATASETS
  true_case = np.array([x for row, x in enumerate(dataset) if dataset[row][best_attribute] < best_step])
  indices = [row for row, x in enumerate(dataset) if dataset[row][best_attribute] < best_step]
  true_target = target[indices]

  false_case = np.array([x for row, x in enumerate(dataset) if dataset[row][best_attribute] >= best_step])
  indices = [row for row, x in enumerate(dataset) if dataset[row][best_attribute] >= best_step]
  false_target = target[indices]

  ## CREATING NEW NODE FOR FALSE AND TRUE CASE
  new_node = Node(best_attribute, best_step)
  if father_node != None:
    if condition == True:
      father_node.add_true_child(new_node)
    else:
      father_node.add_false_child(new_node)
  if dataset.size != true_case.size and dataset.size != false_case.size:
    decision_tree(true_case, true_target, father_node=new_node, condition=True)
    decision_tree(false_case, false_target, father_node=new_node, condition=False)


  return new_node

In [7]:
def print_tree(node, feature_names, condition=False, depth=0):
  for i in range(depth):
    print(" ", end="")
  if node == None:
    return
  if depth != 0:
    print(condition, ":", end="")
  if node.leaf_node == True:
    print("Class:", node.answer)
    return
  else:
    print(feature_names[node.attribute], "<", node.comparison_value)
  print_tree(node.true_child, feature_names, condition=True, depth=depth+1)
  print_tree(node.false_child, feature_names, condition=False, depth=depth+1)

In [8]:
def evaluate_wine(dataset, wine, node):
  if node == None:
    return
  if node.leaf_node == True:
    return node.answer

  if dataset[wine][node.attribute] < node.comparison_value:
    return evaluate_wine(dataset, wine, node.true_child)
  else:
    return evaluate_wine(dataset, wine, node.false_child)

In [9]:
def run_tree(dataset, target, tree):
  correct_guesses = 0
  incorrect_guesses = 0
  wines, attributes = dataset.shape
  for wine, x in enumerate(dataset):
    expected_class = evaluate_wine(dataset, wine, tree)
    if expected_class == target[wine]:
      correct_guesses += 1
    else:
      incorrect_guesses += 1

  print("Accuracy:", correct_guesses/wines)



In [29]:
def main():
  wine = datasets.load_wine()
  min_max_normalization(wine.data)
  wine_dataset = wine.data
  tree = decision_tree(wine_dataset, wine.target, minimum_for_splitting=15) #minimum defines tree shape, smaller == more overfit in theory
  print_tree(tree, wine.feature_names)
  run_tree(wine_dataset, wine.target, tree)

print("Obs: values are (min-max) normalized")
main()

Obs: values are (min-max) normalized
color_intensity < 0.2
 True :malic_acid < 0.2
  True :Class: 1
  False :Class: 1
 False :total_phenols < 0.2
  True :Class: 2
  False :flavanoids < 0.2
   True :Class: 2
   False :alcalinity_of_ash < 0.4
    True :malic_acid < 0.2
     True :Class: 0
     False :Class: 0
    False :Class: 0
Accuracy: 0.898876404494382
