In [1]:
import numpy as np
import pandas as pd
from graphviz import Digraph


## -Loading datasets (df1 = Tenis Dataset , df2 = mushrooms dataset)

In [2]:
df = pd.read_csv('data.csv')
df2 = pd.read_csv('mushrooms.csv')

## - Tree Decision Class

In [81]:
class Tree:
    def __init__(self):
        self.sons = {}
        self.Attribute = None
    def add_son(self, son, att):
        self.sons[att] = son
    def set_att(self, att):
        self.Attribute = att
    def check(self, data):
        tmpAt  = self
        while (tmpAt.sons != {}):
            tmpAt = tmpAt.sons[data[tmpAt.Attribute]]
        return tmpAt.Attribute
    def genView(self, dot):
        label = self.Attribute
        dot.node(str(self), label)
        for key, son in self.sons.items():
            dot.edge(str(self), str(son), label=key)
            son.genView(dot)
   

## - Functions of decisions

### Entropy functions

In [110]:
def get_entropy(data,target):
    n = len(data)
    acc = 0
    for c  in data[target].value_counts():
        acc -= c/n * np.log2(c/n)
    return acc
def get_entropys(data, elt, target):
    ents = 0
    l = [x for _,x in data.groupby(elt)]
    n = 0
    for d in l:
        n += len(d)
        ents += len(d) * get_entropy(d, target)
    return ents/n
def get_std_gain(data, elt, target):
    return get_entropy(data, target) - get_entropys(data, elt, target)
def get_gain_ratio(data, elt, target):
    d = get_entropy(data, elt)
    return np.inf if (d == 0) else get_std_gain(data, elt, target) / d
    

### Gini index

In [111]:
def get_Gini(data,target):
    n = len(data)
    acc = 0
    for c  in data[target].value_counts():
        acc -= c/n 
    return 1 - acc
def get_Ginis(data, elt, target):
    ents = 0
    l = [x for _,x in data.groupby(elt)]
    n = 0
    for d in l:
        n += len(d)
        ents += len(d) * get_Gini(d, target)
    return ents/n
def get_std_gainG(data, elt, target):
    return get_Gini(data, target) - get_Ginis(data, elt, target)
def get_gain_ratioG(data, elt, target):
    d = get_Gini(data, elt)
    return np.inf if (d == 0) else get_std_gainG(data, elt, target) / d
    

## - ID3 Algorithm

In [106]:
def ID3(data, target, func_ent):
    root = Tree()
    if (len(set(data[target])) == 1 ):
        root.set_att(data[target].values[0])     
    else:
        lis_ent = []
        for elt in data:
            if(elt == target):
                lis_ent.append(-np.inf)
                continue
            ent = func_ent(data, elt, target)
            lis_ent.append(ent)
            indop = np.argmax(lis_ent)        
            optimal = data.columns[indop]
            root.set_att(optimal)
        for d in [(att, x) for att,x in data.groupby(optimal)]:
            root.add_son(ID3(d[1].loc[:, d[1].columns != optimal], target, func_ent), d[0])
    return root

 ## Tenis Dataset standard information Gain 

In [107]:
target = 'PlayTennis'
tree = ID3(df, target, get_std_gain)
dot = Digraph(comment="Decision tree")
tree.genView(dot)
dot.view(cleanup=True)

'Digraph.gv.pdf'

In [95]:
#check that all classifications are correct
np.all(np.array([tree.check(df.loc[i]) == df.loc[i][target] for i in range(len(df))]))

True

 ## Tenis Dataset  Gain ratio

In [86]:
tree = ID3(df, target, get_gain_ratio)
dot = Digraph(comment="Decision tree")
tree.genView(dot)
dot.view(cleanup=True)

'Digraph.gv.pdf'

In [87]:
#check that all classifications are correct
np.all(np.array([tree.check(df.loc[i]) == df.loc[i][target] for i in range(len(df))]))

True

## Mushrooms Dataset standard information Gain

In [88]:
target2 = 'type'
tree2 = ID3(df2, target2, get_std_gain)
dot = Digraph(comment="Decision tree")
tree2.genView(dot)
dot.view(cleanup=True)

'Digraph.gv.pdf'

In [89]:
#check that all classifications are correct
np.all(np.array([tree2.check(df2.loc[i]) == df2.loc[i][target2] for i in range(len(df2))]))

True

## Mushrooms Dataset Gain ratio

In [108]:
target2 = 'type'
tree2 = ID3(df2, target2, get_gain_ratio)
dot = Digraph(comment="Decision tree")
tree2.genView(dot)
dot.view(cleanup=True)

'Digraph.gv.pdf'

In [97]:
#check that all classifications are correct
np.all(np.array([tree2.check(df2.loc[i]) == df2.loc[i][target2] for i in range(len(df2))]))

True

## Use of Gini index instead of Entropy

In [112]:
target = 'PlayTennis'
tree = ID3(df, target, get_std_gainG)
dot = Digraph(comment="Decision tree")
tree.genView(dot)
dot.view(cleanup=True)

'Digraph.gv.pdf'

In [114]:
tree = ID3(df, target, get_gain_ratioG)
dot = Digraph(comment="Decision tree")
tree.genView(dot)
dot.view(cleanup=True)

'Digraph.gv.pdf'

In [115]:
target2 = 'type'
tree2 = ID3(df2, target2, get_std_gainG)
dot = Digraph(comment="Decision tree")
tree2.genView(dot)
dot.view(cleanup=True)

'Digraph.gv.pdf'

In [116]:
target2 = 'type'
tree2 = ID3(df2, target2, get_gain_ratioG)
dot = Digraph(comment="Decision tree")
tree2.genView(dot)
dot.view(cleanup=True)

'Digraph.gv.pdf'

### Remark
The use of Gini Index give much bigger Trees