In [31]:
import pandas as pd
import numpy as np

In [32]:
# Load the golf dataset from a CSV file located next to the notebook.
data = pd.read_csv("./PlayGolf.csv")
# Display the loaded frame as the cell output.
data

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


<strong>Función total_entropy</strong> <br>
Calcula la entropía total del conjunto de datos.

![](entropy.png)

In [33]:
def total_entropy(data, label, class_list):
    """Compute the base-2 Shannon entropy of the ``label`` column of ``data``.

    Args:
        data: DataFrame holding the samples.
        label: Name of the column that contains the class labels.
        class_list: Iterable of class values to account for.

    Returns:
        The sum over ``class_list`` of ``-p * log2(p)``, where ``p`` is the
        fraction of rows whose label equals the class. Returns 0 when
        ``data`` has no rows.
    """
    total_row = data.shape[0]
    if total_row == 0:
        # Nothing to measure; also avoids dividing by zero below.
        return 0

    total_entr = 0
    for c in class_list:
        total_class_count = data[data[label] == c].shape[0]
        if total_class_count == 0:
            # A class that is absent contributes nothing. Skipping it avoids
            # 0 * log2(0), which evaluates to nan and would poison the sum.
            continue
        class_probability = total_class_count / total_row
        total_entr += -class_probability * np.log2(class_probability)

    return total_entr

In [34]:
def entropy(feature_value_data, label, class_list):
    """Base-2 Shannon entropy of the labels inside one subset of rows.

    Args:
        feature_value_data: DataFrame with the rows of the subset.
        label: Name of the column that contains the class labels.
        class_list: Iterable of class values to account for.

    Returns:
        The sum over ``class_list`` of ``-p * log2(p)``, where ``p`` is the
        fraction of subset rows belonging to the class. Classes with no rows
        in the subset contribute 0.
    """
    n_rows = feature_value_data.shape[0]

    def class_term(c):
        # Contribution of a single class to the entropy of the subset.
        matches = len(feature_value_data[feature_value_data[label] == c])
        if matches == 0:
            return 0
        p = matches / n_rows
        return -p * np.log2(p)

    return sum(class_term(c) for c in class_list)

In [35]:
def gain(feature_name, data, label, class_list):
    """Information gain obtained by splitting ``data`` on ``feature_name``.

    Args:
        feature_name: Column to split on.
        data: DataFrame holding the samples.
        label: Name of the column that contains the class labels.
        class_list: Iterable of class values to account for.

    Returns:
        ``total_entropy(data)`` minus the entropy of each feature-value
        subset weighted by the share of rows that fall into that subset.
    """
    distinct_values = data[feature_name].unique()
    n_rows = data.shape[0]

    weighted_entropy = 0.0
    for value in distinct_values:
        subset = data[data[feature_name] == value]
        subset_entropy = entropy(subset, label, class_list)
        weight = subset.shape[0] / n_rows
        weighted_entropy += weight * subset_entropy

    return total_entropy(data, label, class_list) - weighted_entropy

In [36]:
def make_tree(root, prev_feature, data, label, class_list):
    """Recursively grow an ID3 decision tree inside the nested dict ``root``.

    The feature with the highest information gain is chosen, one branch is
    created per value of that feature, and every branch whose rows all share
    one class becomes a leaf holding that class. Remaining branches are
    marked with "?" and expanded by a recursive call.

    Args:
        root: Dict that receives the subtree (mutated in place).
        prev_feature: Key in ``root`` (a feature value) under which the new
            subtree is attached, or None when building the top level.
        data: DataFrame with the rows still to be classified at this node.
        label: Name of the column that contains the class labels.
        class_list: Iterable of all class values.

    Returns:
        None. The result is written into ``root``.
    """
    if data.shape[0] == 0:
        return

    feature_list = data.columns.drop(label)
    max_info_gain = -1
    max_info_feature = None

    # Find the feature with the maximum information gain.
    for feature in feature_list:
        feature_info_gain = gain(feature, data, label, class_list)
        if max_info_gain < feature_info_gain:
            max_info_gain = feature_info_gain
            max_info_feature = feature

    if max_info_feature is None:
        # No feature is left to split on (or none produced a comparable
        # gain). Settle the pending branch with the most frequent class
        # instead of failing on data[None].
        if prev_feature is not None:
            majority = data[label].mode()
            if not majority.empty:
                root[prev_feature] = majority.iloc[0]
        return

    # Generate the subtree for the selected feature.
    feature_count_dict = data[max_info_feature].value_counts(sort=False)
    tree = {}

    # Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
    for feature, count in feature_count_dict.items():
        feature_data = data[data[max_info_feature] == feature]

        assigned_to_node = False
        for c in class_list:
            class_count = feature_data[feature_data[label] == c].shape[0]

            if class_count == count:
                # Pure branch: store the class as a leaf and discard its rows
                # so they are not considered again.
                tree[feature] = c
                data = data[data[max_info_feature] != feature]
                assigned_to_node = True
        if not assigned_to_node:
            # Impure branch: placeholder to be expanded below.
            tree[feature] = "?"

    if prev_feature is not None:
        root[prev_feature] = {max_info_feature: tree}
    else:
        root[max_info_feature] = tree
    next_root = tree

    for node, branch in list(next_root.items()):
        if branch == "?":
            feature_data = data[data[max_info_feature] == node]
            # The feature just used is constant within this branch. Dropping
            # it guarantees progress and prevents endless recursion when the
            # remaining features cannot separate the classes.
            feature_data = feature_data.drop(columns=max_info_feature)
            make_tree(next_root, node, feature_data, label, class_list)

In [37]:
def id3(data, label):
    """Build a decision tree for ``data`` using ``label`` as the target column.

    Args:
        data: DataFrame with the feature columns and the label column.
        label: Name of the column that contains the class labels.

    Returns:
        The dict populated by ``make_tree``.
    """
    decision_tree = dict()
    make_tree(decision_tree, None, data, label, data[label].unique())
    return decision_tree

In [38]:
# Build the decision tree with 'Play Golf' as the target column.
tree = id3(data, 'Play Golf')
# Show the resulting nested-dict tree as the cell output.
tree

{'Outlook': {'Rainy': {'Humidity': {'High': 'No', 'Normal': 'Yes'}},
  'Overcast': 'Yes',
  'Sunny': {'Windy': {False: 'Yes', True: 'No'}}}}