<a href="https://colab.research.google.com/github/T0madon/IA/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math

class Node:
    """A single node of the decision tree.

    The same class plays three roles, told apart by which attributes are set:

    * decision node: ``value`` is a feature name and ``childs`` holds one
      branch node per observed value of that feature;
    * branch node: ``value`` is a feature value and ``next`` is the subtree
      reached when the feature takes that value;
    * leaf node: ``value`` is the predicted label; ``childs`` and ``next``
      stay ``None``.
    """

    def __init__(self):
        self.value = None   # feature name, feature value or label (see class docstring)
        self.next = None    # subtree hanging below a branch node
        self.childs = None  # branch nodes hanging below a decision node


class DecisionTreeClassifier:
    """Decision Tree Classifier using ID3 algorithm.

    Works on categorical data only: every feature value and every label must
    be hashable and comparable with ``==``.
    """

    def __init__(self, X, feature_names, labels):
        """Store the training data and compute the entropy of the full set.

        :param X: sequence of rows; ``X[i][j]`` is the value of feature ``j``
            for instance ``i``
        :param feature_names: sequence with the name of each feature column
        :param labels: sequence with the category of each instance
        """
        self.X = X  # features or predictors
        self.feature_names = feature_names  # name of the features
        self.labels = labels  # categories
        all_labels = list(labels)
        # unique categories, in order of first appearance so that results do
        # not depend on set/hash ordering
        self.labelCategories = list(dict.fromkeys(all_labels))
        # number of instances of each category (aligned with labelCategories)
        self.labelCategoriesCount = [all_labels.count(x) for x in self.labelCategories]
        self.node = None  # root of the tree, set by id3()
        # initial entropy of the whole data set
        self.entropy = self._get_entropy(list(range(len(all_labels))))

    def _get_entropy(self, x_ids):
        """Calculate the Shannon entropy (base 2) of a subset of instances.

        :param x_ids: list, instance ids that make up the subset
        :return: float, entropy of the label distribution; 0.0 for an empty
            or pure subset
        """
        labels = [self.labels[i] for i in x_ids]
        total = len(labels)
        entropy = 0.0
        for category in self.labelCategories:
            count = labels.count(category)
            if count:  # categories that do not occur contribute nothing
                p = count / total
                entropy -= p * math.log2(p)
        return entropy

    def _group_by_feature(self, x_ids, feature_id):
        """Map each observed value of a feature to the instance ids having it.

        Keys appear in order of first occurrence in ``x_ids``.
        """
        groups = {}
        for x_id in x_ids:
            groups.setdefault(self.X[x_id][feature_id], []).append(x_id)
        return groups

    def _get_split_entropy(self, x_ids, feature_id):
        """Calculate the weighted entropy after a split by the given feature.

        :param x_ids: list, instance ids to split
        :param feature_id: int, column index of the feature to split on
        :return: float, sum over feature values of
            (share of instances) * (entropy of that group)
        """
        total = len(x_ids)
        groups = self._group_by_feature(x_ids, feature_id)
        # fsum makes the result independent of the order of the groups
        return math.fsum(len(ids) / total * self._get_entropy(ids)
                         for ids in groups.values())

    def _get_feature_min_entropy(self, x_ids, feature_ids):
        """Find the feature that minimizes the entropy after split.

        Minimizing the split entropy is equivalent to maximizing the
        information gain. Ties go to the feature listed first.

        :param x_ids: list, instance ids to consider
        :param feature_ids: list, candidate feature ids (must not be empty)
        :return: tuple (feature name, feature id)
        """
        min_id = min(feature_ids,
                     key=lambda feature_id: self._get_split_entropy(x_ids, feature_id))
        return self.feature_names[min_id], min_id

    def id3(self):
        """Build the decision tree with ID3 and store its root in ``self.node``.

        :return: None
        """
        # assign a unique number to each instance
        x_ids = list(range(len(self.X)))
        # assign a unique number to each feature
        feature_ids = list(range(len(self.feature_names)))
        # always start from a fresh root so that a second call cannot keep
        # stale branches from a previous build
        self.node = self._id3_recv(x_ids, feature_ids, None)

    def _id3_recv(self, x_ids, feature_ids, node):
        """Recursive step of ID3.

        :param x_ids: list, ids of the samples that reach this node
        :param feature_ids: list, ids of the features still available on this
            path; it is never modified
        :param node: Node or None, node to fill in (a new one is created when
            None)
        :returns: Node, root of the subtree built for these samples
        """
        if node is None:
            node = Node()
        labels_in_features = [self.labels[x] for x in x_ids]
        distinct_labels = list(dict.fromkeys(labels_in_features))
        # pure node: every sample has the same class
        if len(distinct_labels) == 1:
            node.value = distinct_labels[0]
            return node
        # no feature left to split on: predict the most frequent class
        # (ties go to the class seen first)
        if not feature_ids:
            node.value = max(distinct_labels, key=labels_in_features.count)
            return node
        # choose the feature that maximizes the information gain
        best_feature_name, best_feature_id = self._get_feature_min_entropy(x_ids, feature_ids)
        node.value = best_feature_name
        node.childs = []
        # Build a NEW list for the subtrees instead of popping from
        # feature_ids: the list is shared with the caller and with sibling
        # branches, and mutating it would hide features from them.
        remaining_ids = [f for f in feature_ids if f != best_feature_id]
        # one branch per value actually observed, so no branch is ever empty
        for value, child_x_ids in self._group_by_feature(x_ids, best_feature_id).items():
            child = Node()
            child.value = value  # branch label: the feature value
            child.next = self._id3_recv(child_x_ids, remaining_ids, None)
            node.childs.append(child)
        return node

In [None]:
import pandas as pd

path = "/content/house-votes-84.data"
column_names = [
    'ClassName',
    'HandicappedInfants',
    'WaterProjectCostSharing',
    'AdoptionOfTheBudgetResolution',
    'PhysicianFeeFreeze',
    'ElSalvadorAid',
    'ReligiousGroupsInSchools',
    'AntiSatelliteTestBan',
    'AidToNicaraguanContracts',
    'MxMissile',
    'Immigration',
    'SynfuelsCorporationCutback',
    'EducationSpendig',
    'SuperfundRightToSue',
    'Crime',
    'DutyFreeExports',
    'ExportAdministrationActSouthAfrica',
]

# The file has no header row; name the columns and call the target "Decision".
df = pd.read_csv(path, header=None, names=column_names).rename(columns={"ClassName": "Decision"})

# Data set with missing answers ('?') imputed by the per-class mode.
base_moda_classe = df.copy()

# Walk over every column except the first one (the target column).
for col in df.columns[1:]:
    answered = df[col] != '?'
    for classe in df['Decision'].unique():
        # Mode of this column among the rows of the class, ignoring '?'.
        moda = df.loc[(df['Decision'] == classe) & answered, col].mode()
        if moda.empty:
            continue
        valor_moda = moda[0]
        # In the copy, fill the '?' cells of this class with that mode.
        to_fill = (base_moda_classe['Decision'] == classe) & (base_moda_classe[col] == '?')
        base_moda_classe.loc[to_fill, col] = valor_moda

base_moda_classe


Unnamed: 0,Decision,HandicappedInfants,WaterProjectCostSharing,AdoptionOfTheBudgetResolution,PhysicianFeeFreeze,ElSalvadorAid,ReligiousGroupsInSchools,AntiSatelliteTestBan,AidToNicaraguanContracts,MxMissile,Immigration,SynfuelsCorporationCutback,EducationSpendig,SuperfundRightToSue,Crime,DutyFreeExports,ExportAdministrationActSouthAfrica
0,republican,n,y,n,y,y,y,n,n,n,y,n,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
2,democrat,y,y,y,n,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,n,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,n,y,y,y,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,republican,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y
431,democrat,n,n,y,n,n,n,y,y,y,y,n,n,n,n,n,y
432,republican,n,y,n,y,y,y,n,n,n,n,y,y,y,y,n,y
433,republican,n,n,n,y,y,y,n,n,n,y,n,y,y,y,n,y


Instanciar e treinar o classificador

In [None]:
# Turn the imputed DataFrame into plain lists for the classifier.
feature_names = base_moda_classe.columns[1:].tolist()   # every column after the target
X = base_moda_classe.iloc[:, 1:].values.tolist()         # feature rows
y = base_moda_classe['Decision'].tolist()                # labels

# Instantiate the classifier and build the tree.
clf = DecisionTreeClassifier(X, feature_names, y)
clf.id3()

def print_tree(node, depth=0):
    """Print the subtree rooted at ``node`` as indented text.

    A node without branches is printed as ``→ <value>``. Any other node is
    printed as ``<value>?`` followed, for each branch, by ``== <branch value>``
    and the branch's subtree one level deeper.

    :param node: object exposing ``value`` and ``childs``; each item of
        ``childs`` exposes ``value`` and ``next``
    :param depth: int, nesting level; each level adds two spaces of indent
    """
    pad = "  " * depth
    branches = node.childs
    if branches:
        print(f"{pad}{node.value}?")
        for branch in branches:
            print(f"{pad}== {branch.value}")
            print_tree(branch.next, depth + 1)
        return
    # no branches (None or empty): treat the node as a leaf
    print(f"{pad}→ {node.value}")

# Print the trained tree, starting from the root stored on the classifier
print_tree(clf.node)

PhysicianFeeFreeze?
== y
  SynfuelsCorporationCutback?
  == y
    AdoptionOfTheBudgetResolution?
    == y
      AntiSatelliteTestBan?
      == y
        → republican
      == n
        → democrat
    == n
      ElSalvadorAid?
      == y
        Immigration?
        == y
          → republican
        == n
          SuperfundRightToSue?
          == y
            EducationSpendig?
            == y
              WaterProjectCostSharing?
              == y
                → republican
              == n
                HandicappedInfants?
                == y
                  → republican
                == n
                  ExportAdministrationActSouthAfrica?
                  == y
                    → republican
                  == n
                    → democrat
            == n
              ReligiousGroupsInSchools?
              == y
                AidToNicaraguanContracts?
                == n
                  MxMissile?
                  == n
                    Crime?
   